In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [2]:
df_train = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [3]:
num_columns = df_train.shape[1]
print(f"number of columns:{num_columns}")

number of columns:19


In [4]:
df_train['duration'] =( df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime).dt.total_seconds()/60
df_val['duration'] =( df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime).dt.total_seconds()/60
std_dev_minutes_train = df_train['duration'].std()
print(f"Standard deviation of trip duration train: {std_dev_minutes_train:.2f} minutes")

Standard deviation of trip duration train: 42.59 minutes


In [5]:
original_count = len(df_train)
filtered_df = df_train[(df_train['duration'] >= 1) & (df_train['duration'] <= 60)]
filtered_count = len(filtered_df)
fraction_remaining = filtered_count / original_count

print(f"Fraction of records remaining train data: {fraction_remaining:.4f}")

Fraction of records remaining train data: 0.9812


In [6]:
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)]
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

print(f"Shape of feature matrix train: {X_train.shape}")
print(f"Number of columns train: {X_train.shape[1]}")

print(f"Shape of feature matrix val: {X_val.shape}")
print(f"Number of columns val: {X_val.shape[1]}")

Shape of feature matrix train: (3009173, 516)
Number of columns train: 516
Shape of feature matrix val: (2858634, 516)
Number of columns val: 516


In [7]:
df_train = df_train[(df_train['duration'] >= 1) & (df_train['duration'] <= 60)]
df_val = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)]
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [8]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

root_mean_squared_error(y_train, y_pred)

7.658405791284101

In [9]:
y_pred = lr.predict(X_val)
root_mean_squared_error(y_val, y_pred)

11.863146558759315

In [11]:
lr = Lasso(0.01)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

root_mean_squared_error(y_val, y_pred)

11.40188381665915