In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

In [2]:
def read_data(file_path):
    """Load a trip parquet file and derive the trip duration.

    Two columns are added to the loaded frame:

    * ``trip_duration``     - ``tpep_dropoff_datetime`` minus
      ``tpep_pickup_datetime``.
    * ``trip_duration_min`` - that duration in minutes, rounded to 2 decimals.

    Rows are kept only when ``trip_duration_min`` is strictly positive and not
    larger than its 99.9th percentile within the file.

    Parameters
    ----------
    file_path
        Location handed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows (original index preserved),
        so callers may add columns to it freely.
    """
    df = pd.read_parquet(file_path)

    df['trip_duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # Vectorized timedelta -> minutes; avoids one Python-level call per row.
    df['trip_duration_min'] = (df['trip_duration'].dt.total_seconds() / 60).round(2)

    max_cut_off = df['trip_duration_min'].quantile(.999)
    keep = (df['trip_duration_min'] > 0) & (df['trip_duration_min'] <= max_cut_off)
    # .copy() detaches the result from the unfiltered frame; without it, later
    # column assignments on the returned frame trigger pandas'
    # SettingWithCopyWarning.
    return df[keep].copy()

In [3]:
# Train on the January 2024 yellow-taxi file; the February 2024 file is the validation split.
df_train = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet')
df_val = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet')

In [4]:
def create_features(df):
    """Return a copy of ``df`` with a combined pickup/drop-off column.

    The added ``PU_DO`` column is the string
    ``"<PULocationID>_<DOLocationID>"`` for each row.

    The input frame is not modified, so the function is safe to re-run and
    safe to call on a filtered slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with every original column plus ``PU_DO``.
    """
    pu_do = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(PU_DO=pu_do)

In [5]:
# Run the same feature step on both splits so their columns stay aligned.
df_train = create_features(df_train)
df_val = create_features(df_val)

In [6]:
def vectorize_data(df_train, df_val, cat_cols, num_cols):
    """Turn the selected columns of both splits into feature matrices.

    Each row is converted to a ``{column: value}`` dict and passed through a
    ``DictVectorizer`` that is fitted on the training rows only and then
    reused, unchanged, for the validation rows.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation frames; both must contain every column named
        in ``cat_cols`` and ``num_cols``.
    cat_cols, num_cols : sequence of str
        Column names to encode. They are concatenated in that order.

    Returns
    -------
    tuple
        ``(train_matrix, val_matrix)`` as returned by
        ``DictVectorizer.fit_transform`` / ``DictVectorizer.transform``.

    Notes
    -----
    NOTE(review): DictVectorizer is documented to one-hot encode string
    values only; if columns listed in ``cat_cols`` hold numeric IDs they would
    pass through as plain numbers. Confirm that is intended.
    """
    # Use the arguments, not notebook-level globals, so the function works
    # regardless of which cells have been executed.
    feature_cols = list(cat_cols) + list(num_cols)

    train_dicts = df_train[feature_cols].to_dict(orient='records')
    val_dicts = df_val[feature_cols].to_dict(orient='records')

    dv = DictVectorizer()
    df_train_vec = dv.fit_transform(train_dicts)
    # transform() only: the validation split must not influence the vocabulary.
    df_val_vec = dv.transform(val_dicts)
    return df_train_vec, df_val_vec

In [7]:
# Column groups handed to vectorize_data() and used to select the raw feature frames.
# 'PU_DO' is the combined pickup/drop-off key added by create_features().
cat_features = ['PULocationID', 'DOLocationID', 'PU_DO']
num_features = ['trip_distance']

In [8]:
# Feature matrices produced by vectorize_data() for both splits.
X_train_vec, X_val_vec = vectorize_data(df_train, df_val, cat_features, num_features)

# Plain DataFrame selections of the same columns, without vectorization.
X_train = df_train[cat_features + num_features]
X_val = df_val[cat_features + num_features]

In [9]:
# Regression target column, pulled out of each split as a NumPy array.
target = 'trip_duration_min'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [10]:
# Print the shape of every feature set and target vector, one per line.
print(
    *(part.shape for part in (X_train, X_val, X_train_vec, X_val_vec, y_train, y_val)),
    sep='\n',
)

(2960789, 4)
(3003715, 4)
(2960789, 26116)
(3003715, 26116)
(2960789,)
(3003715,)


## Model training

In [11]:
def train_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` and report its RMSE on the training and validation data.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place.
    X_train, y_train
        Features and target used for fitting.
    X_val, y_val
        Features and target used only for evaluation.

    Returns
    -------
    tuple
        ``(y_pred_train, y_pred_val, rmse_train, rmse_val)``.
    """
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    # Take the square root explicitly: the ``squared=False`` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6. For a 1-D target the
    # value is the same as the old ``squared=False`` result.
    rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
    rmse_val = mean_squared_error(y_val, y_pred_val) ** 0.5
    return y_pred_train, y_pred_val, rmse_train, rmse_val

### Linear Regression

In [12]:
# Baseline: plain linear regression fitted on the vectorized features.
model = LinearRegression()

y_pred_train, y_pred_val, rmse_train, rmse_val = train_model(model, X_train_vec, y_train, X_val_vec, y_val)

In [13]:
# Show both RMSE values from the most recent train_model() call.
print(f'rmse_train: {rmse_train}', f'rmse_val: {rmse_val}', sep='\n')

rmse_train: 6.130541048697959
rmse_val: 6.488167113904765


### Random Forest

In [14]:
# Shallow random forest trained on the raw (non-vectorized) feature frames.
# random_state pins the forest's random sampling so the printed RMSE values
# are reproducible after Restart & Run All.
# NOTE(review): X_train / X_val still carry the string column 'PU_DO' —
# confirm the estimator handles it as intended, or train on
# X_train_vec / X_val_vec instead.
model = RandomForestRegressor(max_depth=3, random_state=42)

y_pred_train, y_pred_val, rmse_train, rmse_val = train_model(model, X_train, y_train, X_val, y_val)

print(f'rmse_train: {rmse_train}')
print(f'rmse_val: {rmse_val}')

rmse_train: 6.669905698914243
rmse_val: 6.894515524009098


### Lasso

In [None]:
# Lasso with its default settings, fitted on the vectorized features.
model = Lasso()

y_pred_train, y_pred_val, rmse_train, rmse_val = train_model(
    model, X_train_vec, y_train, X_val_vec, y_val
)

print(f'rmse_train: {rmse_train}', f'rmse_val: {rmse_val}', sep='\n')

### Ridge

In [None]:
# Ridge with its default settings, fitted on the vectorized features.
model = Ridge()

y_pred_train, y_pred_val, rmse_train, rmse_val = train_model(
    model, X_train_vec, y_train, X_val_vec, y_val
)

print(f'rmse_train: {rmse_train}', f'rmse_val: {rmse_val}', sep='\n')