In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Number of CSV rows loaded per chunk
chunk_size = 50_000

# Estimators to fit, keyed by the label used when their metrics are printed
model = {
    "RandomForestRegressor": RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    ),
}

# Per-chunk training pieces; concatenated once all chunks have been read
all_X_train, all_y_train = [], []

# Holders for the accumulated test split
total_X_test, total_y_test = pd.DataFrame(), pd.DataFrame()

# Number of chunks processed so far
counter = 0

# Names assigned positionally to the CSV columns; every chunk is assumed to
# carry exactly these 21 columns in this order.
column_names = [
    "ID", "vendorid", "tpep_pickup_datetime", "tpep_dropoff_datetime",
    "passenger_count", "trip_distance", "ratecodeid", "store_and_fwd_flag",
    "pulocationid", "dolocationid", "payment_type", "fare_amount",
    "extra", "mta_tax", "tip_amount", "tolls_amount",
    "improvement_surcharge", "total_amount", "congestion_surcharge",
    "airport_fee", "duration",
]

# Columns that are not used as model features.
unused_columns = [
    "ID", "tpep_pickup_datetime", "tpep_dropoff_datetime",
    "airport_fee", "payment_type", "congestion_surcharge",
    "passenger_count", "vendorid", "improvement_surcharge", "tolls_amount",
    "extra", "tip_amount",
]

# Upper bound on the number of chunks read from the CSV file.
max_chunks = 20


def _prepare_chunk(chunk):
    """Clean one raw CSV chunk and return the model-ready frame.

    Rows with any missing value are dropped, the columns are renamed
    positionally to ``column_names``, the pickup hour is derived, the
    store-and-forward flag is encoded as 0/1, outlier rows are removed and
    the unused columns are dropped.

    Args:
        chunk: DataFrame holding one chunk of the CSV file.

    Returns:
        A new DataFrame with the feature columns plus the ``duration``
        target column. The input frame is not modified.
    """
    # Handle missing values (returns a new frame, so the rename below does
    # not touch the caller's object)
    df = chunk.dropna()
    df.columns = column_names

    # assign() works on a fresh copy, which avoids chained-assignment
    # warnings; the new column is appended last and the flag column is
    # replaced in place, so the column order is stable.
    df = df.assign(
        tpep_pickup_hour=pd.to_datetime(df['tpep_pickup_datetime']).dt.hour,
        store_and_fwd_flag=df['store_and_fwd_flag'].map({'N': 0, 'Y': 1}),
    )

    # Outlier filtering (strict bounds). Flags other than 'N'/'Y' become NaN
    # in the mapping above, i.e. after the dropna, so they are excluded here.
    keep = (
        (df['duration'] > 30) & (df['duration'] < 2880)
        & (df['trip_distance'] > 0.25) & (df['trip_distance'] < 300)
        & df['store_and_fwd_flag'].notna()
    )
    return df.loc[keep].drop(columns=unused_columns)


# Per-chunk test pieces; concatenated once after the loop instead of
# re-copying the accumulated test set on every iteration.
test_X_parts = []
test_y_parts = []

# The reader is used as a context manager (pandas >= 1.2) so the file handle
# is closed even though reading stops before the end of the file.
with pd.read_csv("training_dataset.csv", chunksize=chunk_size) as reader:
    # zip() exhausts the range first, so no chunk beyond the limit is read.
    for counter, chunk in zip(range(1, max_chunks + 1), reader):
        print(f"Processing chunk {counter}...")

        df = _prepare_chunk(chunk)

        # train_test_split needs at least one row on each side of the split.
        if len(df) < 2:
            continue

        # Split features and target
        X_data = df.drop(columns=['duration'])
        y_data = df['duration']

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, y_data, test_size=0.2, random_state=42
        )

        # Accumulate training data
        all_X_train.append(X_train)
        all_y_train.append(y_train)

        # Accumulate test data
        test_X_parts.append(X_test)
        test_y_parts.append(y_test)

# Build the test set in a single concat. When nothing was collected, the
# previously initialised empty placeholders are left untouched.
if test_X_parts:
    total_X_test = pd.concat(test_X_parts)
    total_y_test = pd.concat(test_y_parts)

# Stitch the per-chunk training pieces into single objects
final_X_train = pd.concat(all_X_train)
final_y_train = pd.concat(all_y_train)

# Feature scaling: the statistics are learned from the training rows only and
# then applied unchanged to the test rows. Wrapping the arrays in DataFrames
# restores the column labels (with a fresh default index).
scaler = StandardScaler().fit(final_X_train)
final_X_train = pd.DataFrame(
    scaler.transform(final_X_train),
    columns=final_X_train.columns,
)
total_X_test = pd.DataFrame(
    scaler.transform(total_X_test),
    columns=total_X_test.columns,
)

# Target scaling: StandardScaler expects 2-D input, so the target is reshaped
# to a single column and flattened back to 1-D afterwards.
scaler_y = StandardScaler()
train_target_column = final_y_train.to_numpy().reshape(-1, 1)
final_y_train = scaler_y.fit_transform(train_target_column).ravel()
test_target_column = total_y_test.to_numpy().reshape(-1, 1)
total_y_test = scaler_y.transform(test_target_column).ravel()

# Fit every configured estimator exactly once on the full training set.
# fit() returns the estimator itself, so `reg` maps each label to its
# fitted model.
reg = {
    label: estimator.fit(final_X_train, final_y_train)
    for label, estimator in model.items()
}

# Score each fitted model on the held-out rows. The target was standardised
# earlier in the script, so the MSE is expressed in standardised units.
for model_name, fitted in reg.items():
    y_pred = fitted.predict(total_X_test)

    mse = mean_squared_error(total_y_test, y_pred)
    score = r2_score(total_y_test, y_pred)

    print(f"MSE {model_name}: {mse}")
    print(f"Score {model_name}: {score}")
    




Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
MSE RandomForestRegressor: 0.03793298025989009
Score RandomForestRegressor: 0.9619387740622833
MSE DecisionTreeRegressor: 0.07358058433253363
Score DecisionTreeRegressor: 0.9261706508235772
