In [None]:
# Display the active SparkSession object as this cell's output.
# NOTE(review): this runs before the notebook builds its own session further
# down, so it presumably relies on the notebook environment pre-defining
# `spark` — confirm, otherwise this cell raises NameError on a fresh kernel.
spark

In [None]:
import pandas as pd
from google.cloud import storage
from io import BytesIO
from datetime import datetime, date


import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, udf, to_date, year, month, date_format, size, split, dayofweek
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml import Pipeline

In [None]:
# --- Google Cloud Storage configuration ---
# Bucket that holds the project data, and its gs:// URI prefix.
bucket_name = 'my-bigdataproject-jg'
gs_path = f'gs://{bucket_name}/'

# Folder prefixes inside the bucket (each ends with a slash so they can be
# concatenated directly onto gs_path).
cleaned_folder = 'cleaned/'
destination_folder = 'code_and_models/'

# Storage client and a handle on the project bucket, looked up by name.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

In [None]:
# Build the Spark session for this notebook, or reuse one that already exists
# (getOrCreate), using every local core as the master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Taxi Demand Prediction")
    .getOrCreate()
)

In [None]:
# Glob pattern matching every yellow-taxi parquet file in the cleaned folder.
taxi_files = f"{gs_path}{cleaned_folder}yellow_tripdata_*.parquet"

# Read all matching files into one Spark DataFrame and preview 15 rows.
taxi_df = spark.read.parquet(taxi_files)
taxi_df.show(n=15)

In [None]:
# Remove the columns this analysis does not use, then truncate the pickup
# timestamp to a calendar date (the column keeps its original name).
taxi_df = (
    taxi_df
    .drop('__index_level_0__', 'dropoff_datetime', 'DOLocationID')
    .withColumn('pickup_datetime', to_date(col('pickup_datetime')))
)

taxi_df.show(n=15)

In [None]:
# Load the taxi-zone lookup table from the cleaned folder.
taxi_zone_file = gs_path + cleaned_folder + 'taxi_zones_data.csv'

# The file has a .csv extension, so it is read with the CSV reader; the
# parquet reader cannot parse a CSV file.
# header=True keeps the column names that the rest of the notebook refers to
# (LocationID, zone, borough); inferSchema=True gives LocationID a numeric
# type instead of string.
# NOTE(review): assumes the file really is CSV with a header row — confirm
# against the job that wrote it.
taxi_zone_df = spark.read.csv(taxi_zone_file, header=True, inferSchema=True)

# Preview the table that was just loaded (not the trips DataFrame).
taxi_zone_df.show(15)

In [None]:
# Attach zone information to each trip by matching the pickup location id
# against the lookup table (default join type), then drop the zone name and
# both id columns now that the join is done.
pickup_matches_zone = taxi_df["PULocationID"] == taxi_zone_df["LocationID"]
taxi_df = (
    taxi_df
    .join(taxi_zone_df, pickup_matches_zone)
    .drop('zone', 'PULocationID', 'LocationID')
)
taxi_df.show(n=15)

In [None]:
# Expose the joined trips to Spark SQL under the view name 'taxi_df'.
taxi_df.createOrReplaceTempView('taxi_df')

# Count trips per (pickup date, borough), discard rows without a count, and
# exclude the 'EWR' borough.
taxi_df = (
    spark.sql('SELECT pickup_datetime, borough, COUNT(*) as total_trips FROM taxi_df GROUP BY pickup_datetime, borough')
    .dropna(subset=['total_trips'])
    .filter(col('borough') != 'EWR')
)
taxi_df.show(n=15)

In [None]:
# Analyze the data

# Convert the aggregated Spark DataFrame (one row per pickup date and borough)
# to a pandas DataFrame for plotting.
taxi_df_pd = taxi_df.toPandas()

# Histogram of the per-day trip counts, one panel per borough.
taxi_df_pd.hist(column='total_trips', by='borough', bins=30, figsize=(15, 10))

# Index the frame by pickup date and derive the calendar parts once.
taxi_df_pd['pickup_datetime'] = pd.to_datetime(taxi_df_pd['pickup_datetime'])
taxi_df_pd = taxi_df_pd.set_index('pickup_datetime')
taxi_df_pd['year'] = taxi_df_pd.index.year
taxi_df_pd['month'] = taxi_df_pd.index.month
taxi_df_pd['day'] = taxi_df_pd.index.day

# Why the plots below select [['total_trips']] (a one-column DataFrame) rather
# than the Series: pandas draws a Series plot onto the current axes when a
# figure is already open, which would stack every chart on top of the last
# histogram panel. DataFrame plots open their own figure.

# Line chart of total trips per day across all boroughs. Each date appears
# once per borough and the rows arrive from Spark in no guaranteed order, so
# sum per date first (groupby also sorts the dates).
daily_totals = taxi_df_pd.groupby(level='pickup_datetime')[['total_trips']].sum()
daily_totals.plot(figsize=(15, 10), legend=False, title='Total trips per day')

# Bar chart of total trips per (year, month).
monthly_totals = taxi_df_pd.groupby(['year', 'month'])[['total_trips']].sum()
monthly_totals.plot(kind='bar', figsize=(15, 10), legend=False, title='Total trips per month')

# Bar chart of total trips per day of month for September 2023.
is_sept_2023 = (taxi_df_pd['year'] == 2023) & (taxi_df_pd['month'] == 9)
sept_2023_daily = taxi_df_pd[is_sept_2023].groupby('day')[['total_trips']].sum()
sept_2023_daily.plot(kind='bar', figsize=(15, 10), legend=False, title='Total trips per day, September 2023')


In [None]:
# Summary statistics of total_trips for each borough, across all years.
trips_by_borough = taxi_df_pd.groupby('borough')['total_trips']
trips_by_borough.describe()

