In [None]:
# Display the Spark session object as the cell output.
# NOTE(review): this cell runs before the notebook builds its own session with
# SparkSession.builder, so it presumably relies on the kernel predefining
# `spark` (e.g. a managed Spark/Dataproc kernel) — TODO confirm.
spark

In [1]:
from datetime import datetime, date
from io import BytesIO

import holidays
import pandas as pd
from google.cloud import storage

import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_date, year, month, dayofweek, hour, when



In [None]:
# Google Cloud Storage layout used throughout the notebook.
bucket_name = 'my-bigdataproject-jg'
gs_path = 'gs://' + bucket_name + '/'
cleaned_folder = 'cleaned/'
destination_folder = 'code_and_models/'

# Storage client and bucket handle for direct (non-Spark) access to the bucket.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

In [None]:
# Build the Spark session for this notebook; getOrCreate() returns an already
# running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Taxi Demand Prediction")
    .getOrCreate()
)

In [None]:
# Read every cleaned weather parquet file matching the glob into one DataFrame.
weather_files = f'{gs_path}{cleaned_folder}weather_data_*.parquet'
weather_df = spark.read.parquet(weather_files)
weather_df.show(n=15)

In [None]:
# Read the cleaned 2023 yellow-taxi parquet files; `??` matches the two-character month.
taxi_files = f"{gs_path}{cleaned_folder}cleaned_yellow_tripdata_2023-??.parquet"
taxi_df = spark.read.parquet(taxi_files)
taxi_df.show(n=15)

In [None]:
# Drop the columns that are not needed for daily demand counts, then truncate
# the pickup timestamp to a calendar date so trips can be grouped per day.
taxi_df = (
    taxi_df
    .drop('__index_level_0__', 'dropoff_datetime', 'DOLocationID')
    .withColumn('pickup_datetime', to_date(col('pickup_datetime')))
)

taxi_df.show(n=15)

In [None]:
# Load the taxi-zone lookup table.
taxi_zone_file = gs_path + cleaned_folder + 'taxi_zones_data.csv'

# The lookup is a CSV file, so it has to go through the CSV reader; the parquet
# reader cannot parse it.
# NOTE(review): assumes the file has a header row providing the LocationID,
# zone and borough columns that later cells reference — confirm against the file.
taxi_zone_df = spark.read.csv(taxi_zone_file, header=True, inferSchema=True)

# Preview the frame that was just loaded (not the trip data).
taxi_zone_df.show(15)

In [None]:
# Attach zone information to each trip via its pickup location, then discard
# the zone name and both location-id columns.
pickup_zone_match = taxi_df['PULocationID'] == taxi_zone_df['LocationID']
taxi_df = (
    taxi_df
    .join(taxi_zone_df, pickup_zone_match)
    .drop('zone', 'PULocationID', 'LocationID')
)
taxi_df.show(n=15)

In [None]:
# Aggregate to one row per (pickup date, borough) holding the trip count.
taxi_df.createOrReplaceTempView('taxi_df')
daily_counts_query = 'SELECT pickup_datetime, borough, COUNT(*) as total_trips FROM taxi_df GROUP BY pickup_datetime, borough'
taxi_df = (
    spark.sql(daily_counts_query)
    .dropna(subset=['total_trips'])
    # Rows whose borough is 'EWR' are excluded.
    .filter(col('borough') != 'EWR')
)
taxi_df.show(n=15)

In [None]:
# Rename the weather borough column so it does not collide with the taxi
# borough column once the two frames are joined.
weather_df = weather_df.withColumnRenamed('borough', 'weather_borough')

# Pair each (day, borough) trip count with the weather row for the same day
# and borough.
# NOTE(review): pickup_datetime was truncated to a date in an earlier cell;
# this assumes weather_df.datetime holds comparable daily values — TODO confirm.
weather_match = [
    taxi_df.pickup_datetime == weather_df.datetime,
    taxi_df.borough == weather_df.weather_borough,
]
combined_df = taxi_df.join(weather_df, weather_match)

# The join keys are duplicated by `datetime` and `borough`, so drop the copies.
# (The original statement here ended in a stray '.', which is a SyntaxError.)
combined_df = combined_df.drop('pickup_datetime', 'weather_borough')

combined_df.show(15)

In [None]:
"""
# Feature Engineering

Planned features, grouped by original column.

datetime
 - Month of the year
 - Day of the week
 - Weekend flag
 - Holiday flag (TODO: not implemented in the code below yet)

borough
 - StringIndexer, then one-hot encoding

temp
 - Min-max scaling, using the lowest tempmin value and the highest tempmax value as the range

feelslike
 - Min-max scaling, using the lowest feelslikemin value and the highest feelslikemax value as the range

Min-max scaling over each column's own observed range:
 - humidity
 - precip
 - snow
 - snowdepth
 - windspeed
 - cloudcover
 - visibility

uvindex
 - Used directly as an integer

conditions
 - Possible values: Clear, Snow, Overcast, Rain, Partially cloudy
 - Some records combine several values, for example "Snow, Rain, Overcast" or "Rain, Overcast"
 - Split the values into separate columns and encode each as a binary flag
"""

def scale_to_range(column_name, lower, upper):
    """Return a Column that maps `column_name` linearly so lower -> 0 and upper -> 1.

    Args:
        column_name: name of the numeric column to rescale.
        lower: value that should map to 0.
        upper: value that should map to 1.

    Raises:
        ValueError: if either bound is missing (e.g. the frame is empty) or the
            range is not strictly positive, since the division would otherwise
            produce nulls that fail later in the VectorAssembler.
    """
    if lower is None or upper is None or upper <= lower:
        raise ValueError(
            f"Cannot scale '{column_name}': invalid range [{lower}, {upper}]"
        )
    return (col(column_name) - F.lit(lower)) / F.lit(upper - lower)


def minmax_stages(column_name):
    """Return the [VectorAssembler, MinMaxScaler] pair producing '<column_name>_scaled'.

    MinMaxScaler only accepts vector input, so the scalar column is first
    wrapped in a one-element vector. The scaler is left at its default output
    range and learns the column's observed min/max when the pipeline is fitted.
    """
    vector_col = f'{column_name}_vector'
    return [
        VectorAssembler(inputCols=[column_name], outputCol=vector_col),
        MinMaxScaler(inputCol=vector_col, outputCol=f'{column_name}_scaled'),
    ]


# datetime
combined_df = (
    combined_df
    .withColumn('month', month(col('datetime')))
    .withColumn('dayofweek', dayofweek(col('datetime')))
    # Spark's dayofweek() numbers Sunday as 1 and Saturday as 7.
    .withColumn('weekend', when(col('dayofweek').isin(1, 7), 1.0).otherwise(0.0))
)
# TODO: the planned holiday feature is not implemented yet.

# borough
indexer = StringIndexer(inputCol='borough', outputCol='borough_index')
encoder = OneHotEncoder(inputCol='borough_index', outputCol='borough_Vector', dropLast=False)

# temp / feelslike
# These are scaled against the widest observed daily range (lowest *min column,
# highest *max column). MinMaxScaler's `min`/`max` params set the OUTPUT range,
# not the input range, so passing the temperature bounds there would stretch
# the values back to temperature scale. The scaling is done with plain column
# arithmetic instead. All four bounds are fetched in a single aggregation job.
bounds = combined_df.agg(
    F.min('tempmin').alias('temp_min'),
    F.max('tempmax').alias('temp_max'),
    F.min('feelslikemin').alias('feelslike_min'),
    F.max('feelslikemax').alias('feelslike_max'),
).first()
temp_min, temp_max = bounds['temp_min'], bounds['temp_max']
feelslike_min, feelslike_max = bounds['feelslike_min'], bounds['feelslike_max']

combined_df = (
    combined_df
    .withColumn('temp_scaled', scale_to_range('temp', temp_min, temp_max))
    .withColumn('feelslike_scaled', scale_to_range('feelslike', feelslike_min, feelslike_max))
)

# humidity, precip, snow, snowdepth, windspeed, cloudcover, visibility
minmax_columns = ['humidity', 'precip', 'snow', 'snowdepth', 'windspeed', 'cloudcover', 'visibility']
scaling_stages = [stage for name in minmax_columns for stage in minmax_stages(name)]

# skip conditions for now
# The raw tempmax/tempmin/feelslikemax/feelslikemin/preciptype/conditions
# columns stay in the frame; they are simply not listed as assembler inputs.

assembler = VectorAssembler(
    inputCols=[
        'month', 'dayofweek', 'weekend', 'borough_Vector',
        'temp_scaled', 'feelslike_scaled',
        'humidity_scaled', 'precip_scaled', 'snow_scaled', 'snowdepth_scaled',
        'windspeed_scaled', 'cloudcover_scaled', 'visibility_scaled',
        'uvindex',
    ],
    outputCol='features',
)
pipeline = Pipeline(stages=[indexer, encoder, *scaling_stages, assembler])
transformed_df = pipeline.fit(combined_df).transform(combined_df)



In [None]:
# Preview the key original columns next to the assembled feature vector.
preview_columns = ['datetime', 'borough', 'total_trips', 'features']
transformed_df.select(*preview_columns).show(n=15)

In [None]:
# Split the data into training (70%) and testing (30%) sets; the fixed seed
# keeps the split reproducible across runs.
# (The original line had a doubled '.', which is a SyntaxError.)
train_df, test_df = transformed_df.randomSplit([0.7, 0.3], seed=42)

# NOTE(review): total_trips is a COUNT(*) per day and borough, so logistic
# regression treats every distinct count as its own class. A regressor (the
# notebook already imports RandomForestRegressor and RegressionEvaluator) looks
# like the intended model. Kept as-is because the evaluation cell depends on
# lr_predictions.
lr = LogisticRegression(featuresCol='features', labelCol='total_trips')
lr_model = lr.fit(train_df)
lr_predictions = lr_model.transform(test_df)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Evaluate the model.
# The earlier groupby/pivot/collect confusion matrix was removed: its result
# was overwritten below before ever being used, so it only cost a Spark job.
evaluator = MulticlassClassificationEvaluator(labelCol='total_trips', predictionCol='prediction')


def score(metric_name):
    """Evaluate lr_predictions with the shared evaluator for one metric name."""
    return evaluator.evaluate(lr_predictions, {evaluator.metricName: metric_name})


accuracy = score("accuracy")
precision = score("weightedPrecision")
recall = score("weightedRecall")
f1_score = score("f1")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

# Confusion matrix
# MulticlassMetrics expects an RDD of (prediction, label) pairs of doubles.
# total_trips comes from COUNT(*) and is an integer, so both fields are
# converted to float explicitly.
prediction_and_labels = (
    lr_predictions
    .select("prediction", "total_trips")
    .rdd
    .map(lambda row: (float(row[0]), float(row[1])))
)
metrics = MulticlassMetrics(prediction_and_labels)
confusion_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:")
print(confusion_matrix)
