In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_squared_error

## Q1. Downloading the data

Read the data for January. How many columns are there?

In [2]:
# Read the 2023-01 trip-data parquet file into a DataFrame.
jan_df = pd.read_parquet(path="yellow_tripdata_2023-01.parquet")

# Q1 answer: number of columns, shown as the cell's output.
len(jan_df.columns)

19

## Q2. Computing duration
What's the standard deviation of the trip durations in January?

In [3]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp,
# converted from a timedelta to seconds and then divided by 60.
jan_df["duration"] = (
    jan_df["tpep_dropoff_datetime"]
    .sub(jan_df["tpep_pickup_datetime"])
    .dt.total_seconds()
    .div(60)
)

# Q2 answer: standard deviation of the duration column.
jan_df["duration"].std()

42.59435124195458

## Q3. Dropping outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records is left after you drop the outliers?

In [4]:
# Row count before filtering; this is the denominator of the fraction below.
# NOTE: this cell overwrites jan_df, so re-running it without first re-running
# the cells above measures the already-filtered frame and reports 1.0.
nr_before = len(jan_df)

# Keep only trips lasting between 1 and 60 minutes; `between` is inclusive
# on both ends by default.
jan_df = jan_df[jan_df["duration"].between(1, 60)]

# Q3 answer: share of the records that survive the filter.
len(jan_df) / nr_before

0.9812202822125979

## Q4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries (remember to re-cast the IDs to strings - otherwise it will label-encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [9]:
# Only the pickup and dropoff location IDs are used as features. They are
# cast to strings so the vectorizer one-hot encodes them instead of treating
# them as numeric values.
sub_df = jan_df.loc[:, ["PULocationID", "DOLocationID"]].astype(str)

# One dict per trip, keyed by column name.
train_dicts = sub_df.to_dict(orient="records")

# Fit the vectorizer on the January records and build the training matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Q4 answer: one matrix column per feature the vectorizer learned.
len(dv.feature_names_)

515

## Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters, where duration is the response variable
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [10]:
# Response variable: the duration column of the filtered January frame.
target = "duration"
y_train = jan_df[target].to_numpy()

# Plain linear regression with default parameters; `fit` returns the
# estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Predict on the same data the model was trained on.
y_pred = lr.predict(X_train)

# Q5 answer: RMSE on the training set.
root_mean_squared_error(y_true=y_train, y_pred=y_pred)

7.649261931416412

## Q6. Evaluating the model
Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [12]:
# Read the 2023-02 file and apply the same preparation as for January:
# duration in minutes, then keep trips of 1 to 60 minutes (inclusive).
feb_df = pd.read_parquet(path="yellow_tripdata_2023-02.parquet")
feb_df["duration"] = (
    feb_df["tpep_dropoff_datetime"]
    .sub(feb_df["tpep_pickup_datetime"])
    .dt.total_seconds()
    .div(60)
)
feb_df = feb_df[feb_df["duration"].between(1, 60)]

# Location IDs as strings, one dict per trip.
val_dicts = (
    feb_df[["PULocationID", "DOLocationID"]]
    .astype(str)
    .to_dict(orient="records")
)

# `transform` (not `fit_transform`): reuse the vectorizer already fitted on
# the January records so the validation matrix has the same columns.
X_val = dv.transform(val_dicts)

y_actual = feb_df[target].to_numpy()
y_pred = lr.predict(X_val)

# Q6 answer: RMSE on the February validation set.
root_mean_squared_error(y_true=y_actual, y_pred=y_pred)

7.8118162035401735