In [1316]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Number of CSV rows read per chunk; lower it if the machine runs out of memory.
chunk_size = 50000

# Initialize models
model = {
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

# Working names applied to the raw CSV columns, in file order.
# Defined once because the list does not change between chunks.
column_names = ["ID", "vendorid", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
                "ratecodeid", "store_and_fwd_flag", "pulocationid", "dolocationid", "payment_type", "fare_amount",
                "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount",
                "congestion_surcharge", "airport_fee", "duration"]

# Columns used as model inputs. The order is significant: frames passed to the
# fitted scaler/model at prediction time must present the same columns in the
# same order.
feature_columns = ["trip_distance", "pulocationid", "dolocationid", "fare_amount",
                   "mta_tax", "total_amount", "tpep_pickup_hour"]

# Per-chunk pieces are collected in lists and concatenated once after the loop.
# Concatenating inside the loop would re-copy everything accumulated so far on
# every iteration.
all_X_train = []
all_y_train = []
all_X_test = []
all_y_test = []

for counter, df in enumerate(pd.read_csv("training_dataset.csv", chunksize=chunk_size), start=1):
    print(f"Processing chunk {counter}...")

    # Handle missing values: a row is discarded if ANY raw column is missing,
    # including columns that are not used as features.
    df = df.dropna()

    # Rename columns
    df.columns = column_names

    # Derive the pickup hour; `assign` returns a new frame, so nothing is
    # written into a view of the chunk.
    df = df.assign(tpep_pickup_hour=pd.to_datetime(df["tpep_pickup_datetime"]).dt.hour)

    # Outlier filtering (all bounds are exclusive)
    keep = (
        (df["duration"] > 30) & (df["duration"] < 2880)
        & (df["trip_distance"] > 0.25) & (df["trip_distance"] < 300)
    )
    df = df.loc[keep]

    # train_test_split cannot produce a non-empty train and test part from
    # fewer than two rows, so such a chunk is skipped instead of crashing.
    if len(df) < 2:
        print(f"Skipping chunk {counter}: fewer than 2 rows left after filtering")
        continue

    # Split features and target
    X_data = df[feature_columns]
    y_data = df["duration"]

    # Train-test split (same seed for every chunk)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

    # Accumulate training and test data
    all_X_train.append(X_train)
    all_y_train.append(y_train)
    all_X_test.append(X_test)
    all_y_test.append(y_test)


# Combine all chunks in a single pass
final_X_train = pd.concat(all_X_train)
final_y_train = pd.concat(all_y_train)
total_X_test = pd.concat(all_X_test)
total_y_test = pd.concat(all_y_test)

# Standardize numerical features (statistics are fitted on the training part only)
scaler = StandardScaler()
final_X_train = pd.DataFrame(scaler.fit_transform(final_X_train), columns=final_X_train.columns)
total_X_test = pd.DataFrame(scaler.transform(total_X_test), columns=total_X_test.columns)

# Standardize the target variable.
# NOTE: a model fitted on these arrays expects inputs transformed with `scaler`
# and returns predictions in standardized units; use
# `scaler_y.inverse_transform` to get back to the original duration units.
scaler_y = StandardScaler()
final_y_train = scaler_y.fit_transform(final_y_train.values.reshape(-1, 1)).flatten()
total_y_test = scaler_y.transform(total_y_test.values.reshape(-1, 1)).flatten()

# Fit each configured estimator a single time on the combined training set
reg = {}
for model_name, estimator in model.items():
    reg[model_name] = estimator.fit(final_X_train, final_y_train)

# Score every fitted estimator on the accumulated hold-out set.
# The target was standardized before fitting, so the MSE printed here is
# expressed in standardized units rather than raw durations.
for model_name, fitted in reg.items():
    y_pred = fitted.predict(total_X_test)

    mse = mean_squared_error(y_true=total_y_test, y_pred=y_pred)
    score = r2_score(y_true=total_y_test, y_pred=y_pred)

    print(f"MSE {model_name}: {mse}")
    print(f"Score {model_name}: {score}")


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Processing chunk 42...
Processing chunk 43...
Processing chunk 44.

MemoryError: could not allocate 2147483648 bytes

In [1315]:
# Build the submission file: prepare eval.csv the same way as the training
# features, predict, and write one duration per ID.
# Depends on `reg`, `scaler` and `scaler_y` produced by the training cell.
df_eval = pd.read_csv("eval.csv")

# Rows with missing values are not dropped here, so every ID in eval.csv
# receives a prediction.

# Rename columns (compared with the training list, this one has no
# tpep_dropoff_datetime and no duration entry)
column_names = ["ID", "vendorid", "tpep_pickup_datetime", "passenger_count", "trip_distance",
                "ratecodeid", "store_and_fwd_flag", "pulocationid", "dolocationid", "payment_type", "fare_amount",
                "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount",
                "congestion_surcharge", "airport_fee"]

df_eval.columns = column_names

# Keep the IDs aside; they become the index of the submission file
eval_ids = df_eval["ID"]

# Derive the pickup hour from the pickup timestamp
df_eval['tpep_pickup_hour'] = pd.to_datetime(df_eval['tpep_pickup_datetime']).dt.hour

# Drop the ID, the raw timestamp and every column not used as a feature,
# leaving the same columns (in the same order) the model was trained on
df_eval = df_eval.drop(columns=[
    'ID', 'tpep_pickup_datetime', 'store_and_fwd_flag', 'ratecodeid',
    'airport_fee', 'payment_type', 'congestion_surcharge',
    'passenger_count', 'vendorid', 'improvement_surcharge', 'tolls_amount',
    'extra', 'tip_amount'
])

print(df_eval.describe())

# The model was fitted on StandardScaler-transformed features, so the eval
# features must go through the same fitted scaler before predicting.
X_eval = pd.DataFrame(scaler.transform(df_eval), columns=df_eval.columns, index=df_eval.index)

# The model was fitted on a standardized target, so its raw output is in
# standardized units; map it back to the original duration scale.
y_pred_scaled = reg["RandomForestRegressor"].predict(X_eval)
y_pred_eval = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

df_out = pd.DataFrame({"ID": eval_ids, "duration": y_pred_eval}).set_index("ID")
df_out.to_csv("submission.csv")

       trip_distance   pulocationid   dolocationid    fare_amount  \
count  500000.000000  500000.000000  500000.000000  500000.000000   
mean        5.765083     164.262598     163.553426      19.275166   
std       630.518207      64.314410      69.545243      19.410251   
min         0.000000       1.000000       1.000000   -1174.100000   
25%         1.010000     132.000000     113.000000       9.300000   
50%         1.760000     161.000000     162.000000      13.500000   
75%         3.370000     233.000000     234.000000      22.600000   
max    319111.840000     265.000000     265.000000    1349.800000   

             mta_tax   total_amount  tpep_pickup_hour  
count  500000.000000  500000.000000     500000.000000  
mean        0.480351      27.853946         14.304504  
std         0.127669      24.121979          5.847429  
min        -0.500000    -963.880000          0.000000  
25%         0.500000      15.750000         11.000000  
50%         0.500000      21.000000       