In [1]:
import csv
import math
import numpy as np
from datetime import datetime
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import Vectors

In [2]:
# Spark context
# getOrCreate() reuses the SparkContext that is already active in this kernel
# (for example when the cell is re-run) and only creates a new one otherwise.
sc = SparkContext.getOrCreate()
# Restrict Spark's own logging to errors so later cell output stays readable.
sc.setLogLevel("ERROR")

25/04/12 12:24:20 WARN Utils: Your hostname, nam-Nitro-AN515-45 resolves to a loopback address: 127.0.1.1; using 192.168.1.18 instead (on interface wlp5s0)
25/04/12 12:24:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/12 12:24:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the dataset
# sc.textFile() is lazy: it only records the path and does not read anything,
# so by itself it cannot fail on a missing or unreadable file. take(1) runs a
# small action on each RDD so that such problems are raised (and reported)
# inside this try block instead of at the first action of a later cell.
try:
    train_data = sc.textFile("train.csv")
    test_data = sc.textFile("test.csv")
    train_data.take(1)
    test_data.take(1)
except Exception as e:
    print(f"Error reading CSV file: {e}")
    sc.stop()
    exit(1)

In [4]:
# Parse the dataset
def parse_datetime(datetime_str):
    """Extract calendar features from a timestamp string.

    Args:
        datetime_str: Timestamp formatted as "%Y-%m-%d %H:%M:%S".

    Returns:
        A list [hour, weekday, month, day_of_year], where weekday is 0 for
        Monday and day_of_year starts at 1.

    Raises:
        ValueError: If datetime_str does not match the expected format.
    """
    fields = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S").timetuple()
    return [fields.tm_hour, fields.tm_wday, fields.tm_mon, fields.tm_yday]


def compute_distance(long_1, lat_1, long_2, lat_2):
    """Great-circle (haversine) distance between two points, in kilometres.

    Args:
        long_1: Longitude of the first point, in degrees.
        lat_1: Latitude of the first point, in degrees.
        long_2: Longitude of the second point, in degrees.
        lat_2: Latitude of the second point, in degrees.

    Returns:
        The distance as a float, using a spherical Earth of radius 6371 km.
    """
    earth_radius_km = 6371
    half_dlong = math.radians(long_1 - long_2) / 2
    half_dlat = math.radians(lat_1 - lat_2) / 2
    a = math.sin(half_dlat) ** 2 + \
        math.cos(math.radians(lat_1)) * \
        math.cos(math.radians(lat_2)) * \
        math.sin(half_dlong) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make math.asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return earth_radius_km * 2 * math.asin(math.sqrt(a))


def parse_train(row):
    """Turn one comma-separated training line into a numeric row.

    Reads the pickup timestamp from column 2, the passenger count from
    column 4, the four pickup/dropoff coordinates from columns 5-8 and the
    trip duration from column 10.

    Returns:
        A list of 11 values:
        [hour, weekday, month, day_of_year, passenger_count,
         pickup_long, pickup_lat, dropoff_long, dropoff_lat,
         distance, trip_duration]
        where trip_duration (last) is the regression target.
    """
    fields = row.split(",")
    passengers = float(fields[4])
    coords = list(map(float, fields[5:9]))
    duration = float(fields[10])
    time_features = parse_datetime(fields[2])
    distance = compute_distance(*coords)
    return time_features + [passengers] + coords + [distance, duration]

def parse_test(row):
    """Turn one comma-separated test line into a numeric row plus its id.

    Reads the trip id from column 0, the pickup timestamp from column 2, the
    passenger count from column 3 and the four pickup/dropoff coordinates
    from columns 4-7.

    Returns:
        A list of 11 values:
        [hour, weekday, month, day_of_year, passenger_count,
         pickup_long, pickup_lat, dropoff_long, dropoff_lat,
         distance, trip_id]
        The last element is the trip id (a string), not a target value.
    """
    fields = row.split(",")
    passengers = float(fields[3])
    coords = list(map(float, fields[4:8]))
    time_features = parse_datetime(fields[2])
    distance = compute_distance(*coords)
    identifier = fields[0]
    return time_features + [passengers] + coords + [distance, identifier]

In [5]:
# Drop the header line of each file, then parse every remaining line.
train_header = train_data.first()
train_data = (
    train_data
    .filter(lambda line: line != train_header)
    .map(parse_train)
)

test_header = test_data.first()
test_data = (
    test_data
    .filter(lambda line: line != test_header)
    .map(parse_test)
)

# Initial filtering: keep only training rows whose distance (index 9),
# duration (index 10) and passenger_count (index 4) are all positive.
train_data = train_data.filter(
    lambda row: row[9] > 0 and row[10] > 0 and row[4] > 0
)

                                                                                

In [6]:
# Preprocess the data
def remove_outliers(data, column, lower_quantile=0.25, upper_quantile=0.75, k=1.5):
    """Drop rows whose value in `column` lies outside the IQR fences.

    The fences are [Q_low - k * IQR, Q_high + k * IQR], where Q_low and
    Q_high are the requested quantiles of the column and IQR = Q_high - Q_low.

    Args:
        data: RDD of indexable rows.
        column: Index of the numeric field to test.
        lower_quantile: Lower quantile as a fraction in [0, 1].
        upper_quantile: Upper quantile as a fraction in [0, 1].
        k: Multiplier applied to the inter-quantile range.

    Returns:
        A filtered RDD; `data` itself when it contains no rows.

    Note:
        The whole column is collected to the driver to compute the quantiles.
    """
    values = data.map(lambda row: row[column]).collect()
    if not values:
        # Quantiles of an empty sample are undefined (np.percentile raises or
        # yields NaN depending on the NumPy version); there is nothing to trim.
        return data

    q_low, q_high = np.percentile(
        values, [lower_quantile * 100, upper_quantile * 100]
    )
    spread = q_high - q_low
    # Plain Python floats keep NumPy scalars out of the closure shipped to workers.
    lower_bound = float(q_low - k * spread)
    upper_bound = float(q_high + k * spread)
    return data.filter(lambda row: lower_bound <= row[column] <= upper_bound)


# Trim outliers one column at a time: distance (index 9) first, then duration
# (index 10). The second pass computes its quantiles on the rows that survived
# the first.
for column_index in (9, 10):
    train_data = remove_outliers(train_data, column_index)

                                                                                

In [7]:
# Convert to LabeledPoint(label, features) and split train/val (80% / 20%).
# The label is row[-1] (trip duration). row[:-2] keeps the first nine columns,
# so the computed distance at index 9 is left out of the features together
# with the label.
# NOTE(review): the prediction cell slices row[0:9], so train and test agree,
# but the distance feature is never used by the model - confirm this is intended.
labeled_points = train_data.map(lambda row: LabeledPoint(row[-1], row[:-2]))
train_data, val_data = labeled_points.randomSplit([0.8, 0.2], seed=42)

# Train the model
# An empty categoricalFeaturesInfo means every feature is treated as continuous.
tree_params = dict(
    categoricalFeaturesInfo={},
    maxDepth=30,
    minInstancesPerNode=30,
    maxBins=128,
)
model = DecisionTree.trainRegressor(train_data, **tree_params)

                                                                                

In [8]:
# Evaluate the model
def evaluate_model(model, data, dataset_name):
    """Print RMSE, MSE, MAE and R2 of `model` on a labelled dataset.

    Args:
        model: Trained model whose predict() accepts an RDD of feature vectors.
        data: RDD of records exposing `.features` and `.label`.
        dataset_name: Label used in the printed heading.

    Returns:
        None; the metrics are written to standard output.
    """
    predicted = model.predict(data.map(lambda point: point.features))
    observed = data.map(lambda point: point.label)

    # RegressionMetrics consumes (prediction, observation) pairs of floats.
    pairs = predicted.zip(observed).map(
        lambda pair: (float(pair[0]), float(pair[1]))
    )
    metrics = RegressionMetrics(pairs)

    # Print metrics
    print(f"\n=== {dataset_name} Metrics ===")
    report = (
        ("RMSE", "rootMeanSquaredError"),
        ("MSE", "meanSquaredError"),
        ("MAE", "meanAbsoluteError"),
        ("R2", "r2"),
    )
    for tag, attribute in report:
        print(f"{tag}: {getattr(metrics, attribute)}")

In [9]:
# Report the same metrics on the training split and on the held-out validation
# split so the two can be compared directly.
for split_rdd, split_name in (
    (train_data, "Training Data"),
    (val_data, "Validation Data"),
):
    evaluate_model(model, split_rdd, split_name)

                                                                                


=== Training Data Metrics ===




RMSE: 197.93184847154677
MSE: 39177.016639363355
MAE: 146.39200471256427
R2: 0.7217969717263647


                                                                                


=== Validation Data Metrics ===




RMSE: 217.46851064291195
MSE: 47292.55312124631
MAE: 160.9846768328867
R2: 0.664624255151187


                                                                                

In [10]:
# Score the test set and write one (id, predicted duration) line per trip.
# Only the first nine columns are used as features, mirroring the row[:-2]
# slice used to build the training LabeledPoints.
# NOTE(review): this leaves out the computed distance at index 9 - confirm
# that is intended.
test_features = test_data.map(lambda row: Vectors.dense(row[:9]))
test_ids = test_data.map(lambda row: row[-1])
predictions = model.predict(test_features)
result = test_ids.zip(predictions)
# All predictions are brought back to the driver before being written locally.
result_list = result.collect()

with open("predictions.csv", "w", newline="") as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(["id", "trip_duration"])
    for id_and_prediction in result_list:
        csv_out.writerow(id_and_prediction)
sc.stop()

                                                                                