In [None]:
# Display the `spark` session object. It is never created in this notebook, so it is
# presumably injected by the notebook kernel (e.g. a PySpark kernel) — TODO confirm.
spark

In [None]:
!pip install holidays

# Set Up

In [None]:
import pandas as pd
from google.cloud import storage
from io import BytesIO
from datetime import datetime, date
import holidays

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, udf, to_date, year, month, date_format, size, split, dayofweek
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml import Pipeline




In [None]:
# --- Storage configuration ---------------------------------------------------
# Bucket that the notebook reads its inputs from and writes its outputs to.
bucket_name = 'my-bigdataproject-jg'
gs_path = f'gs://{bucket_name}/'

# Folders relative to the bucket root: inputs are read from `cleaned_folder`,
# the feature table and the fitted model are written to `destination_folder`.
cleaned_folder = 'cleaned/'
destination_folder = 'code_and_models/'

# Google Cloud Storage client and a handle on the bucket.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# Weather DF

In [None]:
# Load every weather parquet file matching the glob into a single DataFrame
# and print its schema for reference.
weather_files = f'{gs_path}{cleaned_folder}weather_data_*.parquet'
weather_df = spark.read.parquet(weather_files)
weather_df.printSchema()

# Taxi Trips DF

In [None]:
# Load every taxi-trip parquet file under `taxi_data/` into a single DataFrame
# and print its schema for reference.
taxi_files = f"{gs_path}{cleaned_folder}taxi_data/*.parquet"
taxi_df = spark.read.parquet(taxi_files)
taxi_df.printSchema()

In [None]:
# Show the number of records per distinct value, first for RatecodeID and then
# for passenger_count. The filter applied in the next cell is based on these counts.
for column_name in ('RatecodeID', 'passenger_count'):
    taxi_df.groupBy(column_name).count().show()


In [None]:
# Column expressions shared by the derived features below.
pickup_ts = col('pickup_datetime')
pickup_hour = F.hour(pickup_ts)

# time_of_day buckets the pickup hour:
#   05:00-11:59 -> 'morning', 12:00-20:59 -> 'afternoon', everything else -> 'night'
is_morning = (pickup_hour >= 5) & (pickup_hour < 12)
is_afternoon = (pickup_hour >= 12) & (pickup_hour < 21)
time_of_day_expr = (
    when(is_morning, 'morning')
    .when(is_afternoon, 'afternoon')
    .otherwise('night')
)

taxi_df = (
    taxi_df
    # pickup_date: calendar date of the pickup timestamp
    .withColumn('pickup_date', to_date(pickup_ts))
    .withColumn('time_of_day', time_of_day_expr)
    # Remove the records with RatecodeID = 99 or passenger_count = 0.
    # Rows where either column is NULL are removed as well, because a comparison
    # with NULL does not evaluate to true.
    .filter((col('RatecodeID') != 99) & (col('passenger_count') != 0))
    # Columns that are not used by the rest of the notebook.
    .drop('dropoff_datetime', 'RatecodeID', 'payment_type', 'total_amount')
)


# Taxi Zone DF

In [None]:
# Load the taxi-zone lookup table and discard its `zone` column, then print the
# schema of what is left.
taxi_zone_file = f'{gs_path}{cleaned_folder}taxi_zones_data.parquet'
taxi_zone_df = spark.read.parquet(taxi_zone_file).drop('zone')
taxi_zone_df.printSchema()


In [None]:
# List the distinct borough values present in the zone lookup table.
borough_values = taxi_zone_df.select('borough').distinct()
borough_values.show()

## Taxi data frames combined

In [None]:

# Attach the pickup borough: left-join the zone lookup on the pickup location id,
# rename the joined borough column to PUBorough and drop the lookup key.
# NOTE(review): the lookup column is selected as 'borough' (lower case) in the cell above;
# renaming 'Borough' here presumably relies on case-insensitive column resolution — confirm.
taxi_df = taxi_df.join(taxi_zone_df, taxi_df.PULocationID == taxi_zone_df.LocationID, how='left')
taxi_df = taxi_df.withColumnRenamed('Borough', 'PUBorough')
taxi_df = taxi_df.drop('LocationID')
# Same steps for the drop-off location, producing DOBorough.
# NOTE(review): any other column still present in taxi_zone_df is joined in twice and
# would end up duplicated by name — verify against the printed schema.
taxi_df = taxi_df.join(taxi_zone_df, taxi_df.DOLocationID == taxi_zone_df.LocationID, how='left')
taxi_df = taxi_df.withColumnRenamed('Borough', 'DOBorough')
taxi_df = taxi_df.drop('LocationID')
# Preview the first 15 joined rows.
taxi_df.show(15)

In [None]:
# Keep only trips whose pickup borough and drop-off borough are both different
# from 'EWR'. Rows with a NULL borough (no match in the left joins above) fail
# the comparison and are removed too.
pickup_not_ewr = taxi_df['PUBorough'] != 'EWR'
dropoff_not_ewr = taxi_df['DOBorough'] != 'EWR'
taxi_df = taxi_df.filter(pickup_not_ewr & dropoff_not_ewr)

# Combined Data Frame

In [None]:
# Rename the weather columns so they are unambiguous next to the taxi columns.
weather_df = (
    weather_df
    .withColumnRenamed('borough', 'weather_borough')
    .withColumnRenamed('snow', 'snow_precip')
)

# Match each trip to the weather row of its pickup date and pickup borough.
# No `how` is given, so this is an inner join: trips without a weather row are dropped.
same_day = taxi_df.pickup_date == weather_df.datetime
same_borough = taxi_df.PUBorough == weather_df.weather_borough

combined_df = (
    taxi_df
    .join(weather_df, same_day & same_borough)
    # Join keys and raw location / timestamp columns are no longer needed.
    .drop('datetime', 'weather_borough', 'PULocationID', 'DOLocationID', 'pickup_datetime')
)

combined_df.printSchema()


In [None]:
# Preview the key trip columns of the combined frame (default `show()` row count).
preview_columns = ['pickup_date', 'time_of_day', 'PUBorough', 'DOBorough', 'tip_percentage', 'trip_distance']
combined_df.select(*preview_columns).show()

In [None]:
# Summary statistics (as produced by `describe()`) for the main numeric trip columns.
summary_columns = ['tip_percentage', 'trip_distance', 'fare_amount', 'passenger_count']
combined_df.select(*summary_columns).describe().show()

# Datetime features

In [None]:
# --- Calendar features derived from pickup_date --------------------------------
# month     : month number of the pickup date
# dayofweek : Spark's dayofweek numbering (1 = Sunday ... 7 = Saturday)
# weekend   : 1.0 when dayofweek is 1 or 7, otherwise 0.0
combined_df = (
    combined_df
    .withColumn('pickup_date', to_date(col('pickup_date')))
    .withColumn('month', month(col('pickup_date')))
    .withColumn('dayofweek', dayofweek(col('pickup_date')))
    .withColumn('weekend', when(col('dayofweek').isin(1, 7), 1.0).otherwise(0.0))
)

# --- Holiday flag ----------------------------------------------------------------
# First and last pickup date, fetched in a single aggregation pass.
date_bounds = combined_df.agg(
    F.min('pickup_date').alias('min_date'),
    F.max('pickup_date').alias('max_date'),
).first()
min_date, max_date = date_bounds['min_date'], date_bounds['max_date']
if min_date is None or max_date is None:
    raise ValueError('combined_df contains no pickup_date values; cannot build the holiday calendar')

# Cover EVERY year in the data range. Passing only [first_year, last_year] would
# silently skip the holidays of any year in between when the data spans 3+ years.
holiday_years = list(range(min_date.year, max_date.year + 1))

# Holiday calendar for New York, including observed dates.
ny_holiday_calendar = holidays.UnitedStates(years=holiday_years, observed=True, subdiv='NY')
print(ny_holiday_calendar)

# Keep only the dates of the holidays
us_holidays = sorted(ny_holiday_calendar.keys())

# holiday: 1 if the pickup date is one of the holiday dates, 0 otherwise
combined_df = combined_df.withColumn('holiday', when(col('pickup_date').isin(us_holidays), 1).otherwise(0))

# Min Max Scaling

## temp

In [None]:
# temp

# Observed temperature extremes (lowest tempmin, highest tempmax), fetched in one pass.
# They are kept for reference only and must NOT be passed to MinMaxScaler:
# its `min` / `max` parameters are the bounds of the OUTPUT range, so passing the
# observed temperatures would rescale `temp` back into roughly its raw units.
temp_bounds = combined_df.agg(
    F.min('tempmin').alias('lowest'),
    F.max('tempmax').alias('highest'),
).first()
temp_min, temp_max = temp_bounds['lowest'], temp_bounds['highest']

# Wrap `temp` in a vector (MinMaxScaler works on vector columns) and scale it to the
# default [0, 1] output range, consistent with the other weather scalers in this notebook.
# NOTE(review): if scaling against the daily extremes is what is wanted, compute
# (temp - temp_min) / (temp_max - temp_min) explicitly instead.
temp_assembler = VectorAssembler(inputCols=['temp'], outputCol='temp_vector')
temp_scaler = MinMaxScaler(inputCol='temp_vector', outputCol='temp_scaled')

## feels_like

In [None]:
# feels_like

# Observed feels-like extremes (lowest feelslikemin, highest feelslikemax), fetched in one pass.
# They are kept for reference only and must NOT be passed to MinMaxScaler:
# its `min` / `max` parameters are the bounds of the OUTPUT range, so passing the
# observed values would rescale `feelslike` back into roughly its raw units.
feelslike_bounds = combined_df.agg(
    F.min('feelslikemin').alias('lowest'),
    F.max('feelslikemax').alias('highest'),
).first()
feelslike_min, feelslike_max = feelslike_bounds['lowest'], feelslike_bounds['highest']

# Wrap `feelslike` in a vector (MinMaxScaler works on vector columns) and scale it to the
# default [0, 1] output range, consistent with the other weather scalers in this notebook.
# NOTE(review): if scaling against the daily extremes is what is wanted, compute
# (feelslike - feelslike_min) / (feelslike_max - feelslike_min) explicitly instead.
feelslike_assembler = VectorAssembler(inputCols=['feelslike'], outputCol='feelslike_vector')
feelslike_scaler = MinMaxScaler(inputCol='feelslike_vector', outputCol='feelslike_scaled')


## Additional Min Max Features

In [None]:
def _min_max_stages(column):
    """Build the two pipeline stages that min-max scale one numeric column.

    Args:
        column: name of the input column.

    Returns:
        A ``(VectorAssembler, MinMaxScaler)`` tuple. The assembler writes
        ``<column>_vector`` and the scaler (default settings) reads it and
        writes ``<column>_scaled``.
    """
    vector_col = f'{column}_vector'
    to_vector = VectorAssembler(inputCols=[column], outputCol=vector_col)
    to_scaled = MinMaxScaler(inputCol=vector_col, outputCol=f'{column}_scaled')
    return to_vector, to_scaled


# One assembler/scaler pair per weather measurement; the names are referenced
# individually when the pipelines are put together further down.
humidity_assembler, humidity_scaler = _min_max_stages('humidity')
precip_assembler, precip_scaler = _min_max_stages('precip')
snow_precip_assembler, snow_precip_scaler = _min_max_stages('snow_precip')
snowdepth_assembler, snowdepth_scaler = _min_max_stages('snowdepth')
windspeed_assembler, windspeed_scaler = _min_max_stages('windspeed')
cloudcover_assembler, cloudcover_scaler = _min_max_stages('cloudcover')
visibility_assembler, visibility_scaler = _min_max_stages('visibility')

# uvindex gets no scaler: it is passed to the final assembler as-is
# (see `encode_directly`).

# UDF Condition Features

In [None]:
# conditions

# Distinct raw values of the `conditions` column. Each value is treated as a
# ', '-separated list of labels.
raw_conditions = [
    row['conditions']
    for row in combined_df.select('conditions').distinct().collect()
]

# Unique individual labels.
#  * NULL values and empty labels are skipped (a NULL would crash `.split`).
#  * Sorted so the indicator columns, and therefore the feature vector layout,
#    come out in the same order on every run.
conditions = sorted({
    label
    for raw_value in raw_conditions
    if raw_value is not None
    for label in raw_value.split(', ')
    if label
})

print(conditions)

# One 0/1 indicator column per label. The label must match a whole list entry:
# a plain substring test would also flag rows whose entry merely contains the
# label as part of a longer label.
condition_labels = F.split(col('conditions'), ', ')
for condition in conditions:
    combined_df = combined_df.withColumn(
        condition,
        when(F.array_contains(condition_labels, condition), 1).otherwise(0),
    )


# Pipeline

## String Indexer

In [None]:
# Categorical columns to index; each one gets a matching `<name>_index` output column.
indexer_input = ['PUBorough', 'DOBorough', 'time_of_day']
indexer_output = [f'{name}_index' for name in indexer_input]
indexer = StringIndexer(
    inputCols=indexer_input,
    outputCols=indexer_output,
)


## One Hot Encoder

In [None]:
# One-hot encode every indexed column into a matching `<name>_encoded` column.
encoder_output = [f'{name}_encoded' for name in indexer_input]
encoder = OneHotEncoder(
    inputCols=indexer_output,
    outputCols=encoder_output,
)

## Vector Assembler

In [None]:
# Columns handed to the final assembler without any scaling or encoding stage.
encode_directly = ['uvindex', 'month', 'dayofweek', 'weekend', 'holiday', 'trip_distance', 'passenger_count', 'fare_amount']

# Outputs of the min-max scaler stages.
scaled_cols = [
    'temp_scaled', 'feelslike_scaled', 'humidity_scaled', 'precip_scaled', 'snow_precip_scaled',
    'snowdepth_scaled', 'windspeed_scaled', 'cloudcover_scaled', 'visibility_scaled',
]

# Final feature order: scaled weather, condition flags, one-hot columns, direct columns.
input_cols = [*scaled_cols, *conditions, *encoder_output, *encode_directly]
print(input_cols)

In [None]:
# Final stage of feature preparation: pack every input column into one `features` vector.
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol='features',
)

# Save the pipeline with features

In [None]:
print('Saving the transformed data...')

# Feature-only pipeline (no estimator): scale, index, encode, assemble.
feature_stages = [
    temp_assembler, temp_scaler,
    feelslike_assembler, feelslike_scaler,
    humidity_assembler, humidity_scaler,
    precip_assembler, precip_scaler,
    snow_precip_assembler, snow_precip_scaler,
    snowdepth_assembler, snowdepth_scaler,
    windspeed_assembler, windspeed_scaler,
    cloudcover_assembler, cloudcover_scaler,
    visibility_assembler, visibility_scaler,
    indexer, encoder, assembler,
]
pipeline = Pipeline(stages=feature_stages)

# Fit on, and transform, the full combined frame.
model = pipeline.fit(combined_df)
transformed_df = model.transform(combined_df)

# Save the transformed data. `overwrite` keeps the cell re-runnable: without an
# explicit mode the write fails as soon as the target path already exists.
# This matches the model save further down, which also overwrites.
features_path = gs_path + destination_folder + 'features'
transformed_df.write.mode('overwrite').parquet(features_path)
print('Transformed data saved!')

# Train / Test Regression Pipeline

In [None]:
# 70 / 30 train-test split of the untransformed combined frame; the seed is fixed
# so the split is repeatable. Feature stages run inside the regression pipeline.
train_df, test_df = combined_df.randomSplit(weights=[0.7, 0.3], seed=42)

# Linear regression on the assembled `features` vector, predicting tip_percentage,
# and an evaluator that scores predictions against the same label.
linear_reg = LinearRegression(
    featuresCol='features',
    labelCol='tip_percentage',
)
evaluator = RegressionEvaluator(labelCol='tip_percentage')

In [None]:
# Full training pipeline: the feature stages followed by the linear regression.
regression_stages = [
    indexer,
    temp_assembler, temp_scaler,
    feelslike_assembler, feelslike_scaler,
    humidity_assembler, humidity_scaler,
    precip_assembler, precip_scaler,
    snow_precip_assembler, snow_precip_scaler,
    snowdepth_assembler, snowdepth_scaler,
    windspeed_assembler, windspeed_scaler,
    cloudcover_assembler, cloudcover_scaler,
    visibility_assembler, visibility_scaler,
    encoder,
    assembler,
    linear_reg,
]
regresion_pipe = Pipeline(stages=regression_stages)

# Validate Regression Model

In [None]:
# No hyper-parameters are added to the grid, so cross-validation only scores the
# pipeline's default configuration across the folds.
grid = ParamGridBuilder().build()

cv = CrossValidator(
    estimator=regresion_pipe,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
)

# Fit on the training split and report the cross-validated metric(s).
all_models = cv.fit(train_df)
print(f"Average metrics: {all_models.avgMetrics}")

# Score the best fitted pipeline on the held-out test split.
best_model = all_models.bestModel
test_results = best_model.transform(test_df)

# Same evaluator each time; only the metric name is overridden per call.
rmse, mse, mae, r2 = (
    evaluator.evaluate(test_results, {evaluator.metricName: metric_name})
    for metric_name in ("rmse", "mse", "mae", "r2")
)

print(f"RMSE: {rmse}")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")

In [None]:
# The last stage of the best pipeline is the fitted linear regression; show its coefficients.
fitted_regression = best_model.stages[-1]
coefficients = fitted_regression.coefficients
print("bestModel coefficients", coefficients)

# Save the Model

In [None]:
# Persist the best fitted pipeline under the destination folder, replacing any
# model saved there by an earlier run.
print('Saving the model')
model_path = f'{gs_path}{destination_folder}model'
model_writer = best_model.write().overwrite()
model_writer.save(model_path)
print('Model saved')

# Extra

In [None]:
# NOTE(review): this cell looks like a leftover from a different experiment
# (predicting `total_trips`) and will probably fail if run after the cells above
# — TODO confirm, then fix or remove:
#   * 'borough'     : no column of that name appears to exist in combined_df; the borough
#                     columns seen above were renamed to PUBorough / DOBorough, and
#                     weather_borough was dropped.
#   * 'snow_scaled' : snow_precip_scaler above writes 'snow_precip_scaled'.
#   * 'total_trips' : not created anywhere in the visible notebook.
# It also rebinds indexer, encoder, assembler, train_df, test_df, linear_reg, evaluator,
# regresion_pipe, grid, cv, all_models and best_model, all of which are defined by earlier cells.

# Index and one-hot encode a single 'borough' column, then assemble calendar and
# scaled weather columns into 'features'.
indexer = StringIndexer(inputCol='borough', outputCol='borough_index')
encoder = OneHotEncoder(inputCol='borough_index', outputCol='borough_encoded')
assembler = VectorAssembler(inputCols=['month', 'dayofweek', 'weekend', 'borough_encoded', 'temp_scaled', 'feelslike_scaled', 'humidity_scaled', 'precip_scaled', 'snow_scaled', 'snowdepth_scaled', 'windspeed_scaled', 'cloudcover_scaled', 'visibility_scaled', 'uvindex'], outputCol='features')

# 70 / 30 split with a fixed seed.
train_df, test_df = combined_df.randomSplit([0.7, 0.3], seed=42)

# Linear regression and evaluator, both using 'total_trips' as the label.
linear_reg = LinearRegression(labelCol='total_trips')

evaluator = RegressionEvaluator(labelCol='total_trips')

# Pipeline: index/encode, scaler stages, assembler, regression.
regresion_pipe = Pipeline(stages=[indexer, encoder, temp_assembler, temp_scaler, feelslike_assembler, feelslike_scaler, humidity_assembler, humidity_scaler, precip_assembler, precip_scaler, snow_precip_assembler, snow_precip_scaler, snowdepth_assembler, snowdepth_scaler, windspeed_assembler, windspeed_scaler, cloudcover_assembler, cloudcover_scaler, visibility_assembler, visibility_scaler, assembler, linear_reg])

# 3-fold cross-validation over a grid to which no parameters were added.
grid = ParamGridBuilder()
grid = grid.build()
cv = CrossValidator(estimator=regresion_pipe, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)
all_models = cv.fit(train_df)

print(f"Average metrics: {all_models.avgMetrics}")

best_model = all_models.bestModel

# Score the best pipeline on the test split with four regression metrics.
test_results = best_model.transform(test_df)

rmse = evaluator.evaluate(test_results, {evaluator.metricName: "rmse"})
mse = evaluator.evaluate(test_results, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(test_results, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(test_results, {evaluator.metricName: "r2"})

print(f"RMSE: {rmse}")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")

In [None]:
# NOTE(review): like the previous "Extra" cell, this one references 'borough',
# 'snow_scaled' and 'total_trips', none of which the visible notebook produces
# (the snow scaler above writes 'snow_precip_scaled') — TODO confirm, then fix or remove.
# It also rebinds indexer, encoder, assembler, pipeline, train_df and test_df.

# Index and one-hot encode a single 'borough' column, then assemble calendar and
# scaled weather columns into 'features'.
indexer = StringIndexer(inputCol='borough', outputCol='borough_index')
encoder = OneHotEncoder(inputCol='borough_index', outputCol='borough_encoded')
assembler = VectorAssembler(inputCols=['month', 'dayofweek', 'weekend', 'borough_encoded', 'temp_scaled', 'feelslike_scaled', 'humidity_scaled', 'precip_scaled', 'snow_scaled', 'snowdepth_scaled', 'windspeed_scaled', 'cloudcover_scaled', 'visibility_scaled', 'uvindex'], outputCol='features')

# Feature-only pipeline (no estimator stage).
pipeline = Pipeline(stages=[temp_assembler, temp_scaler, feelslike_assembler, feelslike_scaler, humidity_assembler, humidity_scaler, precip_assembler, precip_scaler, snow_precip_assembler, snow_precip_scaler, snowdepth_assembler, snowdepth_scaler, windspeed_assembler, windspeed_scaler, cloudcover_assembler, cloudcover_scaler, visibility_assembler, visibility_scaler, indexer, encoder, assembler])

# Despite its name, `pipeline_model` holds the transformed DataFrame, not a fitted model.
# The pipeline is fitted on the full frame BEFORE the train/test split below.
pipeline_model = pipeline.fit(combined_df).transform(combined_df)

train_df, test_df = pipeline_model.randomSplit([0.7, 0.3], seed=42)

# Plain linear regression on the pre-computed features (no cross-validation).
lr = LinearRegression(featuresCol='features', labelCol='total_trips')
lr_model = lr.fit(train_df)

# Score the test split with four regression metrics.
lr_predictions = lr_model.transform(test_df)
lr_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='total_trips')

rmse = lr_evaluator.evaluate(lr_predictions, {lr_evaluator.metricName: 'rmse'})
mae = lr_evaluator.evaluate(lr_predictions, {lr_evaluator.metricName: 'mae'})
mse = lr_evaluator.evaluate(lr_predictions, {lr_evaluator.metricName: 'mse'})
r2 = lr_evaluator.evaluate(lr_predictions, {lr_evaluator.metricName: 'r2'})

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')

