In [1]:
# Adjust Python path within the notebook so the project-local `scripts`
# package can be imported.
import os
import sys
from pathlib import Path

# Resolve the repository root without hard-coding a machine-specific path.
# PROJECT_ROOT (when set) takes precedence; otherwise the root is taken to be
# the parent of the current working directory.
# NOTE(review): this assumes the notebook runs from a folder directly below
# the repository root (consistent with the "../data/..." path used when
# loading the dataset) -- confirm, or export PROJECT_ROOT.
project_root = os.environ.get("PROJECT_ROOT") or str(Path.cwd().resolve().parent)
if project_root not in sys.path:
    # Prepend so the project's own modules win over same-named installed ones.
    sys.path.insert(0, project_root)

import scripts

In [2]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

In [3]:
# Create the SparkSession used to load the dataset below. Session settings
# are decided inside the project helper scripts.clean_base.create_spark_session,
# which is not visible from this notebook.
spark = scripts.clean_base.create_spark_session()

24/08/27 20:41:09 WARN Utils: Your hostname, THANGs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 100.86.89.69 instead (on interface en0)
24/08/27 20:41:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/27 20:41:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the curated parquet file (trip fields plus temp/humidity/precip, per the
# schema printed in the next cell). The path is relative to the notebook's
# working directory.
yellow_weather_path = "../data/curated/yellow/yellow_weather.parquet"
yellow_df = spark.read.parquet(yellow_weather_path)

In [5]:
# Show column names, types and nullability of the loaded frame before selecting features.
yellow_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- trip_time: double (nullable = true)
 |-- fare_per_miles: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precip: double (nu

In [6]:
# Keep only columns that are not directly related to the fare, plus
# total_amount, which is used as the response further down.
columns = [
    "VendorID",
    "pickup_datetime",
    "dropoff_datetime",
    "trip_distance",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "trip_time",
    "temp",
    "humidity",
    "precip",
    "total_amount",
]
yellow_df = yellow_df.select(*columns)
yellow_df.printSchema()

In [9]:
# Split data, train on June-October, test on November.
# The split is chronological on pickup time: rows strictly before the cutoff
# go to training, rows at or after it go to testing. Rows with a null pickup
# time satisfy neither predicate and are left out of both sets.
split_cutoff = "2023-11-01 00:00:00"
pickup_time = col("pickup_datetime")

train_df = yellow_df.filter(pickup_time < split_cutoff)
test_df = yellow_df.filter(pickup_time >= split_cutoff)

In [10]:
# Encoding categorical data
categorical_columns = (
    "VendorID",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
)

for column in categorical_columns:
    # The helper returns the transformed training frame together with a model
    # (presumably fitted on train_df -- verify in scripts.feature_engineer).
    encoded = scripts.feature_engineer.encoder(train_df, column)
    train_df, model = encoded
    # Apply that same model to the test frame and drop the raw column there.
    test_df = model.transform(test_df).drop(column)

True




True
True
True
True


Handle the timestamp columns by splitting each one into year, month, day, hour, and minute features.

In [11]:
# Train data: expand both trip timestamps through the project helper,
# pickup first and then dropoff.
for timestamp_column in ("pickup_datetime", "dropoff_datetime"):
    train_df = scripts.feature_engineer.prepare_timestamp_features(train_df, timestamp_column)

In [12]:
# Test data: apply the same timestamp expansion, in the same order, so the
# test frame ends up with the same feature columns as the training frame.
for timestamp_column in ("pickup_datetime", "dropoff_datetime"):
    test_df = scripts.feature_engineer.prepare_timestamp_features(test_df, timestamp_column)

In [14]:
# Create predictors and response: the response is the total charged amount,
# and every other column of the training frame is used as a predictor.
response = "total_amount"
predictors = [
    column_name
    for column_name in train_df.columns
    if column_name != response
]

In [15]:
# Display the final predictor list as a sanity check before assembling the feature vector.
predictors

['trip_distance',
 'trip_time',
 'temp',
 'humidity',
 'precip',
 'VendorID_encoded',
 'RatecodeID_encoded',
 'PULocationID_encoded',
 'DOLocationID_encoded',
 'payment_type_encoded',
 'pickup_datetime_year',
 'pickup_datetime_month',
 'pickup_datetime_day',
 'pickup_datetime_hour',
 'pickup_datetime_minute',
 'dropoff_datetime_year',
 'dropoff_datetime_month',
 'dropoff_datetime_day',
 'dropoff_datetime_hour',
 'dropoff_datetime_minute']

In [17]:
# Assemble the predictors into a single vector column named "predictors"
assembler = VectorAssembler(inputCols=predictors, outputCol="predictors")

# The modelling frames keep exactly two columns: the feature vector and the response
model_columns = ["predictors", response]

output = assembler.transform(train_df)
train_data = output.select(*model_columns)

# The same assembler is reused for the test frame so both sets share one vector layout
test_data = assembler.transform(test_df).select(*model_columns)

In [20]:
# Initialize the model: features come from the assembled "predictors" vector,
# the label is the response column.
lr_params = {"featuresCol": "predictors", "labelCol": response}
lr = LinearRegression(**lr_params)

# Fit the model on the training data.
# NOTE: the recorded run of this cell logged "regParam is zero" and a fallback
# from the Cholesky solver to Quasi-Newton because of a singular covariance matrix.
lr_model = lr.fit(train_data)

24/08/27 20:47:02 WARN Instrumentation: [72b9f7e8] regParam is zero, which might cause numerical instability and overfitting.
24/08/27 20:47:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/27 20:47:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/08/27 20:47:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/08/27 20:47:07 WARN Instrumentation: [72b9f7e8] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


In [22]:
# Print the coefficients and intercept for linear regression
coefficients = lr_model.coefficients
intercept = lr_model.intercept
print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept}")

# Make predictions on the test data and preview them next to the true totals
predictions = lr_model.transform(test_data)
preview_columns = ["prediction", "total_amount", "predictors"]
predictions.select(*preview_columns).show()

# Evaluate the model on test data
test_results = lr_model.evaluate(test_data)
r2 = test_results.r2
rmse = test_results.rootMeanSquaredError

# Print the R^2 and RMSE
print(f"R^2: {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Coefficients: [3.797261236849205,0.312981831694941,-0.021813764132746483,-0.0012749474363547024,0.014675977890516516,0.0,-0.5185359653811479,0.0,-1.2689067041010278,-0.6828313657524686,16.231182170403383,24.300061312354853,23.48653181166026,0.0,-22.172582806238097,-32.490165018002585,-12.683230987676373,-10.932947607404104,0.0,0.0,-12.012286255542381,-20.059156885323542,0.0,-4.2973219951305275,0.6343405198281631,-11.299547216332492,-11.225132447003643,4.698264905639258,-17.558463655717162,18.10734465970109,-11.902837761722964,-12.303263363268591,-11.695208028648759,-12.34139222178686,-13.920318924737005,-10.905875863128893,0.0,-11.914850483302196,-12.36279403273303,-15.147135113410897,0.0,1.8614253785339514,-12.032577366926185,16.413341252262605,-1.6681573237042773,-11.890683084459262,-12.167966006151834,-10.96832532407468,-8.64122023948652,-12.54025045695101,-12.037635486789975,7.0300501264807265,-16.587401372231724,-10.414240184528126,-12.550313397437252,-12.394723371866743,-10.95754



R^2: 0.9439308913290396
Root Mean Squared Error (RMSE): 5.362644700586453


