In [None]:
# Echo the SparkSession object so the notebook displays it.
# NOTE(review): `spark` is only assigned in a later cell (SparkSession.builder),
# so this cell assumes the kernel pre-defines `spark` — TODO confirm; otherwise
# it raises NameError on a fresh kernel.
spark

In [None]:
import pandas as pd
from google.cloud import storage
from io import BytesIO
from datetime import datetime, date

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, udf, to_date, year, month, date_format, size, split, dayofweek
from pyspark.sql import functions as F

In [None]:
# Bucket name, its gs:// URI prefix, and the folder prefixes used under it.
bucket_name = 'my-bigdataproject-jg'
gs_path = f'gs://{bucket_name}/'
landing_folder, cleaned_folder, destination_folder = (
    'landing/',
    'cleaned/',
    'code_and_models/',
)

# google-cloud-storage client plus a handle to the bucket itself.
# get_bucket() performs a lookup, so this fails early if the bucket is unreachable.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

In [None]:
# Build the SparkSession (or reuse one that already exists) for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Taxi Demand Prediction")
    .getOrCreate()
)

In [None]:
# Glob matching every weather CSV in the landing folder.
weather_files = gs_path + landing_folder + 'weather_data_*.csv'

# First row is the header; column types are inferred from the data.
weather_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(weather_files)
)

In [None]:
# Print the column names and inferred types of the weather data.
weather_df.printSchema()


In [None]:
# Glob matching every 2023 yellow-taxi parquet file in the landing folder.
taxi_files = gs_path + landing_folder + 'yellow_tripdata_2023-*.parquet'
taxi_df = spark.read.format('parquet').load(taxi_files)

In [None]:
# Print the column names and types read from the taxi parquet files.
taxi_df.printSchema()

In [None]:
"""
Column name
Data type

VendorID
long

tpep_pickup_datetime
timestamp_ntz

tpep_dropoff_datetime
timestamp_ntz

passenger_count
double

trip_distance
double

RatecodeID
double

store_and_fwd_flag
string

PULocationID
long

DOLocationID
long

payment_type
long

fare_amount
double

extra
double

mta_tax
double

tip_amount
double

tolls_amount
double

improvement_surcharge
double

total_amount
double

congestion_surcharge
double

airport_fee
double
"""

# The two timestamp columns that are inspected and range-filtered in this cell.
datetime_cols = ('tpep_pickup_datetime', 'tpep_dropoff_datetime')

# Min/max of each timestamp column before filtering (one table per column).
for ts_col in datetime_cols:
    taxi_df.select(F.min(ts_col), F.max(ts_col)).show()

# Keep only rows whose pickup AND dropoff both fall in [2021-01-01, 2024-01-01).
# The filters are applied one column at a time, pickup first.
for ts_col in datetime_cols:
    taxi_df = taxi_df.filter((col(ts_col) >= '2021-01-01') & (col(ts_col) < '2024-01-01'))

# Min/max of each timestamp column after filtering, to confirm the range.
for ts_col in datetime_cols:
    taxi_df.select(F.min(ts_col), F.max(ts_col)).show()




In [None]:
# Add a trip_duration column (in minutes) derived from the pickup/dropoff timestamps.
# Both timestamps are first converted to Unix epoch seconds (kept as the helper
# columns pickup_unix / dropoff_unix), then their difference is divided by 60.
taxi_df = (
    taxi_df
    .withColumn('pickup_unix', F.unix_timestamp('tpep_pickup_datetime'))
    .withColumn('dropoff_unix', F.unix_timestamp('tpep_dropoff_datetime'))
    .withColumn('trip_duration', (col('dropoff_unix') - col('pickup_unix')) / 60)
)

# Summary statistics of the trip duration.
taxi_df.select('trip_duration').describe().show()

# Show the rows where the trip duration is negative (dropoff before pickup).
# The pickup column is 'tpep_pickup_datetime'; the previous 'tpickup_datetime'
# does not exist in the frame and made this select fail.
taxi_df.filter(col('trip_duration') < 0).select(
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_duration'
).show()

In [None]:
# Plot the number of trips per calendar week, based on tpep_pickup_datetime.

# Week-of-year number of each pickup; kept on taxi_df as a column.
taxi_df = taxi_df.withColumn('week_of_year', F.weekofyear('tpep_pickup_datetime'))

# week_of_year alone repeats every year, so grouping on it would merge the same
# week number from different years into a single point. Group on the start of
# the week instead (date_trunc 'week'), which is unique per calendar week.
# week_of_year is carried along as a second key so the column stays in the result.
week_start_col = F.date_trunc('week', col('tpep_pickup_datetime')).alias('week_start')
trips_per_week = (
    taxi_df
    .groupBy(week_start_col, 'week_of_year')
    .count()
    .orderBy('week_start')
)

# The aggregate has one row per week, so it is small enough to collect to pandas.
trips_per_week_pd = trips_per_week.toPandas()

# Line chart of weekly trip counts in chronological order.
trips_per_week_pd.plot(x='week_start', y='count', kind='line', figsize=(15, 6), title='Number of trips per week from 2021 to 2023')
