In [None]:
# Display the current SparkSession object.
# NOTE(review): this runs before the notebook builds its own session further down, so it
# presumably relies on the kernel pre-defining `spark`; on a kernel without that it raises NameError.
spark

# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from google.cloud import storage
from io import BytesIO
from datetime import datetime, date

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, BooleanType, IntegerType , DateType, FloatType, StructType, StructField
from pyspark.sql.functions import col, isnan, when, count, udf, to_date, year, month, date_format, size, split, dayofweek
from pyspark.sql import functions as F

# Taxi Data Frame

In [None]:
# Google Cloud Storage bucket that holds the project's data, and its gs:// URI prefix
bucket_name = 'my-bigdataproject-jg'
gs_path = 'gs://' + bucket_name + '/'

# Folder prefixes used inside the bucket
landing_folder = 'landing/'
cleaned_folder = 'cleaned/'
destination_folder = 'code_and_models/'

# Client and bucket handle for direct (non-Spark) access to the bucket
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

In [None]:
# Build a local SparkSession using all available cores, or reuse one that already exists
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Taxi Demand Prediction")
    .getOrCreate()
)

In [None]:
# Years and zero-padded month numbers ('01' .. '12') of the yellow-taxi files to load
data_years = [2023]
months = [f'{month_number:02d}' for month_number in range(1, 13)]

# One parquet file per (year, month) in the landing folder
taxi_file_path = gs_path + landing_folder
taxi_file_list = [
    f'{taxi_file_path}yellow_tripdata_{yr}-{mo}.parquet'
    for yr in data_years
    for mo in months
]

# Read each monthly file, cast VendorID and passenger_count to integers,
# and stack the monthly frames into a single DataFrame (positional union).
taxi_df = None
for file in taxi_file_list:
    df = (
        spark.read.parquet(file)
        .withColumn('VendorID', col('VendorID').cast(IntegerType()))
        .withColumn('passenger_count', col('passenger_count').cast(IntegerType()))
    )
    taxi_df = df if taxi_df is None else taxi_df.union(df)

taxi_df.show(5)

# Shape

In [None]:
# Report the row count (triggers a full Spark count) and the number of columns, then the schema
record_total = taxi_df.count()
column_total = len(taxi_df.columns)
print(f'Number of records in the dataframe: {record_total:,}')
print(f'Number of columns in the dataframe: {column_total}')

taxi_df.printSchema()



# Empty Records

In [None]:
# Count the null values in every column: `when` yields a non-null value only for
# null cells, and `count` tallies the non-null results per column.
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in taxi_df.columns
]
taxi_df.select(null_count_exprs).show()


# passenger_count

In [None]:
# Summary statistics for the passenger_count column
passenger_count_only = taxi_df.select('passenger_count')
passenger_count_only.summary().show()

In [None]:
# Number of trips for each distinct passenger_count value
trips_by_passenger_count = taxi_df.groupBy('passenger_count').count()
trips_by_passenger_count.show()

In [None]:
# Bar chart of the number of trips for each passenger_count value.
# The aggregation runs in Spark; only the small per-value result is brought into pandas.
passenger_count_df = taxi_df.groupBy('passenger_count').count().toPandas()
passenger_count_df = passenger_count_df.sort_values(by='passenger_count')

ax = passenger_count_df.plot(kind='bar', x='passenger_count', y='count', color='blue', figsize=(10, 6))
ax.set_title('Distribution of the number of passengers')
ax.set_xlabel('Number of passengers')
ax.set_ylabel('Count')
plt.xticks(rotation=0)

# Annotate each bar with its value. The loop variable must NOT be called `count`:
# in a notebook it would rebind the global `count` imported from pyspark.sql.functions,
# and any cell calling count(...) afterwards would fail with "'int' object is not callable".
for bar_index, trip_count in enumerate(passenger_count_df['count']):
    ax.text(bar_index, trip_count, f'{trip_count:,}', ha='center', va='bottom')

plt.show()


In [None]:
# Reference listing of column names, each followed by a data type, kept as a bare string
# literal; nothing is computed here.
# NOTE(review): VendorID is listed as long and passenger_count as double, whereas the loading
# cell casts both to IntegerType — presumably this describes the raw parquet schema; confirm.
"""
Column names
DataTypes

VendorID
long

tpep_pickup_datetime
timestamp_ntz

tpep_dropoff_datetime
timestamp_ntz

passenger_count
double

trip_distance
double

RatecodeID
double

store_and_fwd_flag
string

PULocationID
long

DOLocationID
long

payment_type
long

fare_amount
double

extra
double

mta_tax
double

tip_amount
double

tolls_amount
double

improvement_surcharge
double

total_amount
double

congestion_surcharge
double

airport_fee
double
"""



# Date Time

In [None]:
# Show the earliest and latest pickup and dropoff timestamps before filtering
for datetime_column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    taxi_df.select(F.min(datetime_column), F.max(datetime_column)).show()

# Keep only trips whose pickup AND dropoff both fall in [2021-01-01, 2024-01-01)
range_start = '2021-01-01'
range_end = '2024-01-01'
pickup_in_range = (col('tpep_pickup_datetime') >= range_start) & (col('tpep_pickup_datetime') < range_end)
dropoff_in_range = (col('tpep_dropoff_datetime') >= range_start) & (col('tpep_dropoff_datetime') < range_end)
taxi_df = taxi_df.filter(pickup_in_range).filter(dropoff_in_range)

# Show the same min/max again to confirm the filter took effect
for datetime_column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    taxi_df.select(F.min(datetime_column), F.max(datetime_column)).show()

In [None]:
# Add a trip_duration column (in minutes) derived from the pickup and dropoff timestamps.
# Both timestamps are first converted to Unix seconds; the helper columns pickup_unix and
# dropoff_unix are kept on the frame.
taxi_df = (
    taxi_df
    .withColumn('pickup_unix', F.unix_timestamp('tpep_pickup_datetime'))
    .withColumn('dropoff_unix', F.unix_timestamp('tpep_dropoff_datetime'))
    # seconds -> minutes
    .withColumn('trip_duration', (col('dropoff_unix') - col('pickup_unix')) / 60)
)

# Summary statistics of the trip duration
taxi_df.select('trip_duration').describe().show()

# Show trips with a negative duration (dropoff recorded before pickup).
# The pickup column is 'tpep_pickup_datetime'; the previous 'tpickup_datetime' does not
# exist in the frame and made this select raise an AnalysisException.
taxi_df.filter(col('trip_duration') < 0).select('tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_duration').show()

In [None]:
# Plot the number of trips per calendar week (based on tpep_pickup_datetime).

# ISO week number of the pickup; kept as a column on taxi_df.
taxi_df = taxi_df.withColumn('week_of_year', F.weekofyear('tpep_pickup_datetime'))

# Monday that starts the pickup's week. The week number alone is not a usable grouping key:
# it repeats every year, and days around New Year belong to the neighbouring year's ISO week
# (e.g. 2023-01-01 is ISO week 52), so grouping on it merges unrelated weeks into one bucket.
taxi_df = taxi_df.withColumn('week_start', F.trunc(F.to_date('tpep_pickup_datetime'), 'week'))

# One row per actual calendar week, in chronological order. week_of_year is constant within a
# Monday-based week, so including it in the key only carries the column through to the result.
trips_per_week = (
    taxi_df
    .groupBy('week_start', 'week_of_year')
    .count()
    .orderBy('week_start')
)

# Bring the small aggregated result into pandas and use a real datetime x-axis
trips_per_week_pd = trips_per_week.toPandas()
trips_per_week_pd['week_start'] = pd.to_datetime(trips_per_week_pd['week_start'])

# Plot the graph
fig, ax = plt.subplots(figsize=(15, 6))
trips_per_week_pd.plot(x='week_start', y='count', kind='line', ax=ax, title='Number of trips per week from 2021 to 2023')
ax.set_xlabel('Week starting (Monday)')
ax.set_ylabel('Number of trips')
plt.show()
