In [None]:
import pyspark
from pyspark.sql import SparkSession


# Build (or reuse) a Spark session running locally with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [None]:
# Print the Spark version reported by the active session.
print(spark.version)

In [None]:
# Print the version of the imported pyspark package.
print(pyspark.__version__)

In [None]:
# Leave the session object as the cell's last expression so the notebook displays it.
spark

**Defining the schema and reading the data that was downloaded using the `*.sh` scripts**


In [None]:
# First pass: read everything matching data/raw/divvy/*, treating the first row
# as the header and letting Spark infer the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/raw/divvy/*')
)

In [None]:
# Display the first 2 rows of the DataFrame.
df.show(2)

In [None]:
# Print the column names and types of the DataFrame read with inferSchema=True.
df.printSchema()

In [None]:
from pyspark.sql.types import (
    DoubleType,
    FloatType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

In [None]:
# Explicit schema applied when reading the CSV files under data/raw/divvy/.
# Every field is declared nullable.
# The coordinate columns use DoubleType rather than FloatType: FloatType is a
# 32-bit float (about 7 significant digits), which rounds latitude/longitude
# values that carry more decimal places than that.
new_schema = StructType([
    StructField("ride_id", StringType(), True),
    StructField("rideable_type", StringType(), True),
    StructField("started_at", TimestampType(), True),
    StructField("ended_at", TimestampType(), True),
    StructField("start_station_name", StringType(), True),
    StructField("start_station_id", StringType(), True),
    StructField("end_station_name", StringType(), True),
    StructField("end_station_id", StringType(), True),
    StructField("start_lat", DoubleType(), True),
    StructField("start_lng", DoubleType(), True),
    StructField("end_lat", DoubleType(), True),
    StructField("end_lng", DoubleType(), True),
    StructField("member_casual", StringType(), True)
])

In [None]:
# Re-read the same CSV files, this time applying new_schema instead of
# inferring the column types; the first row is still treated as the header.
df = spark.read.csv(
    'data/raw/divvy/*',
    schema=new_schema,
    header=True,
)

In [None]:
# Print the schema to verify that the columns now carry the explicitly declared types.
df.printSchema()

In [None]:
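# NOTE(review): disabled write step. `df_result` is not defined in any visible
# cell of this notebook; presumably `df` was intended. Confirm before re-enabling.
#df_result.coalesce(1).write.parquet('data/taxi/', mode='overwrite')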

In [None]:
# Stop the current Spark session; a later cell builds a new one.
spark.stop()

**Dividing the Parquet file into a partitioned Parquet dataset**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month

# Create the session used for the partitioning step, requesting 4g of memory
# for both the driver and the executors.
# NOTE(review): an earlier cell already started and stopped a session in this
# kernel; confirm that the spark.driver.memory setting still takes effect here.
spark = (
    SparkSession.builder
    .appName("PartitionTaxiData")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [None]:
# Load the Parquet dataset by its directory instead of by a generated
# part-file name: the "part-<n>-<uuid>" file name changes every time the
# dataset is rewritten, which would break this cell on a fresh run.
# NOTE(review): assumes data/taxi/ contains only that one dataset; confirm.
df = (
    spark.read.parquet('data/taxi/')
    # Derive the partition columns from the 'started_at' timestamp column.
    .withColumn('year', year('started_at'))
    .withColumn('month', month('started_at'))
)



In [None]:
# Save the DataFrame as Parquet under data/taxi_partitioned, partitioned by
# the 'year' and 'month' columns; existing output at that path is overwritten.
(
    df.write
    .mode("overwrite")
    .partitionBy('year', 'month')
    .parquet('data/taxi_partitioned')
)

**Now reading from the partitioned Parquet dataset**

In [None]:
# Load the partitioned Parquet dataset back from data/taxi_partitioned.
data = spark.read.format('parquet').load('data/taxi_partitioned')

In [None]:
# Display the first 2 rows read back from the partitioned dataset.
data.show(2)


In [None]:
# Count the rows for each distinct 'member_casual' value and print the result.
(
    data
    .groupBy('member_casual')
    .count()
    .show()
)