In [1]:
# Import libraries
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

In [19]:
# Extract the data into Train and Test Dataframes
# January 2023 trips are read first (train), then February 2023 trips (test);
# both files are fetched over HTTPS on every run of this cell.
df_train, df_test = (
    pd.read_parquet(parquet_url)
    for parquet_url in (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    )
)

In [20]:
# Question 1. Downloading the data
# The number of columns is the length of the frame's column index.
print("Number of columns of df_train: ", len(df_train.columns))

Number of columns of df_train:  19


In [21]:
# Question 2. Computing duration
# Trip duration in minutes. The timedelta Series is converted with the
# vectorized .dt accessor rather than a per-row Python lambda, which avoids
# calling back into Python once per trip on a frame this large.
df_train['duration'] = (
    df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime
).dt.total_seconds() / 60
print("Standard deviation of the trips duration in January 2023: ", df_train.duration.std())

Standard deviation of the trips duration in January 2023:  42.594351241920904


In [22]:
# Question 3. Dropping outliers
# Keep trips whose duration lies in [1, 60] minutes (both bounds inclusive),
# then report the share of rows that survive, as a percentage.
df_train_ = df_train[df_train.duration.between(1, 60)]
fraction = (len(df_train_) / len(df_train)) * 100
print(f'Fraction of the records left after dropping the outliers: {"%.0f" % round(fraction, 0)}%')

Fraction of the records left after dropping the outliers: 98%


In [23]:
# Question 4. One-hot encoding
# Keep only trips lasting between 1 and 60 minutes (inclusive). The explicit
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)].copy()

# Location IDs are cast to strings so DictVectorizer one-hot encodes them
# instead of treating them as numeric values.
categorical = ['PULocationID', 'DOLocationID']
df_train[categorical] = df_train[categorical].astype(str)

# One dict per trip, holding only the categorical features.
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the vectorizer on the training data; `dv` is reused later to encode the
# validation data.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print("Number of columns of X_train:", X_train.shape[1])

Number of columns of X_train: 515


In [24]:
# Question 5. Training a model. RMSE on train
target = 'duration'
y_train = df_train[target].values

# Plain linear regression on the one-hot encoded pickup/dropoff locations.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE is computed as the square root of the MSE. The `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working on every version.
rmse_train = mean_squared_error(y_train, y_pred) ** 0.5

print("RMSE on the training data: ", rmse_train)

RMSE on the training data:  7.649261927665777


In [25]:
# Question 6. Evaluating the model. RMSE on validation
# Trip duration in minutes, computed with the vectorized .dt accessor instead
# of a per-row Python lambda.
df_test['duration'] = (
    df_test.tpep_dropoff_datetime - df_test.tpep_pickup_datetime
).dt.total_seconds() / 60

# Keep only trips lasting between 1 and 60 minutes (inclusive). The explicit
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
df_test = df_test[(df_test.duration >= 1) & (df_test.duration <= 60)].copy()
df_test[categorical] = df_test[categorical].astype(str)

test_dicts = df_test[categorical].to_dict(orient='records')

# transform (not fit_transform): the validation data is encoded with the
# already-fitted vectorizer `dv`.
X_test = dv.transform(test_dicts)
y_test = df_test[target].values
y_pred = lr.predict(X_test)

# RMSE is computed as the square root of the MSE. The `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working on every version.
rmse_val = mean_squared_error(y_test, y_pred) ** 0.5

print("RMSE on the validation data: ", rmse_val)

RMSE on the validation data:  7.811817548344513
