In [1]:
# Adjust Python path within the notebook
import sys
project_root = '/Users/thangnguyen/Documents/GitHub/project-1-individual-knam2609'
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import scripts

In [2]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

In [3]:
# Create SparkSession
spark = scripts.clean_base.create_spark_session()

24/08/27 20:58:36 WARN Utils: Your hostname, THANGs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 100.86.89.69 instead (on interface en0)
24/08/27 20:58:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/27 20:58:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/27 20:58:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
uber_df = spark.read.parquet("../data/curated/uber/uber_weather.parquet")

In [5]:
uber_df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- trip_time: double (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- waiting_time: double (nullable = true)
 |-- fare_per_miles: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- humidity: double (nullable = true)


24/08/27 20:58:50 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Choose columns that are not directly related to the fare
columns = ["dispatching_base_num", "request_datetime", "on_scene_datetime", "pickup_datetime", "dropoff_datetime", "trip_distance", "PULocationID", "DOLocationID", "trip_time", "waiting_time", "temp", "humidity", "precip", "total_amount"]
uber_df = uber_df.select(columns)
uber_df.printSchema()

In [13]:
# Split data, train on June-October, test on November
train_df = uber_df.filter(col("pickup_datetime") < "2023-11-01 00:00:00")
test_df = uber_df.filter(~(col("pickup_datetime") < "2023-11-01 00:00:00"))

In [14]:
# Encoding categorical data
for column in ["dispatching_base_num", "PULocationID", "DOLocationID"]:
    train_df, model = scripts.feature_engineer.encoder(train_df, column)
    test_df = model.transform(test_df).drop(column)

False
True
True


Deal with timestamp data by splitting it into year, month, day, hour and minute

In [15]:
# Train data 
train_df = scripts.feature_engineer.prepare_timestamp_features(train_df, "request_datetime")
train_df = scripts.feature_engineer.prepare_timestamp_features(train_df, "on_scene_datetime")
train_df = scripts.feature_engineer.prepare_timestamp_features(train_df, "pickup_datetime")
train_df = scripts.feature_engineer.prepare_timestamp_features(train_df, "dropoff_datetime")

In [16]:
# Test data
test_df = scripts.feature_engineer.prepare_timestamp_features(test_df, "request_datetime")
test_df = scripts.feature_engineer.prepare_timestamp_features(test_df, "on_scene_datetime")
test_df = scripts.feature_engineer.prepare_timestamp_features(test_df, "pickup_datetime")
test_df = scripts.feature_engineer.prepare_timestamp_features(test_df, "dropoff_datetime")

In [18]:
# Drop redundant columns from indexing
train_df = train_df.drop("dispatching_base_num_indexed")
test_df = test_df.drop("dispatching_base_num_indexed")

In [19]:
# Create predictors and response
response = "total_amount"
predictors = [i for i in train_df.columns if i != response]

In [20]:
predictors

['trip_distance',
 'trip_time',
 'waiting_time',
 'temp',
 'humidity',
 'precip',
 'dispatching_base_num_encoded',
 'PULocationID_encoded',
 'DOLocationID_encoded',
 'request_datetime_year',
 'request_datetime_month',
 'request_datetime_day',
 'request_datetime_hour',
 'request_datetime_minute',
 'on_scene_datetime_year',
 'on_scene_datetime_month',
 'on_scene_datetime_day',
 'on_scene_datetime_hour',
 'on_scene_datetime_minute',
 'pickup_datetime_year',
 'pickup_datetime_month',
 'pickup_datetime_day',
 'pickup_datetime_hour',
 'pickup_datetime_minute',
 'dropoff_datetime_year',
 'dropoff_datetime_month',
 'dropoff_datetime_day',
 'dropoff_datetime_hour',
 'dropoff_datetime_minute']

In [21]:
# Assemble the predictors into a single vector column
assembler = VectorAssembler(inputCols=predictors, outputCol="predictors")
output = assembler.transform(train_df)

# The final DataFrame should have two columns: predictors and response
train_data = output.select("predictors", response)
test_data = assembler.transform(test_df).select("predictors", response)

In [None]:
# Correlation matrix of predictors and response
scripts.plot_data.plot_correlation_heatmap(train_df, 
list(set(train_df.columns) - set(['dispatching_base_num_encoded', 'PULocationID_encoded', 'DOLocationID_encoded'])), 
"uber", "../plots/uber/correlation/", "predictors")

In [22]:
# Initialize the model
lr = LinearRegression(featuresCol="predictors", labelCol=response)

# Fit the model on the training data
lr_model = lr.fit(train_data)

24/08/27 21:06:19 WARN Instrumentation: [e547b7e6] regParam is zero, which might cause numerical instability and overfitting.
24/08/27 21:06:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/27 21:06:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/08/27 21:06:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/08/27 21:06:24 WARN Instrumentation: [e547b7e6] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


In [23]:
# Print the coefficients and intercept for linear regression
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

# Make predictions on the test data
predictions = lr_model.transform(test_data)
predictions.select("prediction", "total_amount", "predictors").show()

# Evaluate the model on test data
test_results = lr_model.evaluate(test_data)

# Print the R^2 and RMSE
print(f"R^2: {test_results.r2}")
print(f"Root Mean Squared Error (RMSE): {test_results.rootMeanSquaredError}")

Coefficients: [2.53827021758345,0.590811212860773,0.281631283415102,-0.1159528334572165,-0.008120147203847577,0.20701390544309936,-0.06073527067944083,0.747098093003648,0.5617011928053198,0.4512315383424473,0.3809067717627234,-0.12304051309213442,-1.3569671275717266,-6.9532086864805445,-0.7192770227036087,-3.106204956715981,-1.34161439205203,-4.001376781361955,8.009570377885506,-4.729636234974317,-7.948091163953914,-4.433134413922769,-7.285224430636965,-6.56964580326856,-0.13481196862206302,-10.590764195411046,6.427875330576509,3.915991214651274,-8.94133615932603,-2.1205671559312322,-0.2830552723124882,-1.6434461335742878,4.633066256290501,0.0,0.0,-5.721203105043373,-4.64330444705351,0.40059248333560826,-5.753101973355311,-3.3829542392905334,-2.634644687974419,-2.8688632125208717,-4.495023497325867,-5.627305953817422,-3.2491335538983166,6.198105680900649,7.121277665841321,-2.091294265166873,-4.803269228673642,-4.71858265270569,-1.7457958835496448,-4.312310967467162,-8.58774636415531,-3



R^2: 0.8455738827791401
Root Mean Squared Error (RMSE): 11.388198773554578


