In [1]:
import pandas as pd
import pickle


import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    """Load a taxi trip file and add a ``duration`` column expressed in minutes.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` trip file. The frame must contain a
        pickup/dropoff timestamp pair named either ``tpep_*_datetime`` or
        ``lpep_*_datetime``, plus ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with a float ``duration`` column (dropoff minus
        pickup, in minutes) and the two location-ID columns cast to ``str``.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``, or if no
        supported pickup/dropoff timestamp pair is present.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed later with an unbound `df`.
        raise ValueError(
            f"Unsupported file type: {filename!r} (expected .csv or .parquet)"
        )

    # The original CSV branch parsed `lpep_*` columns while the duration was
    # always computed from `tpep_*` columns. Accept whichever pair the file
    # actually has; `tpep` is tried first to keep the old parquet behaviour.
    for prefix in ('tpep', 'lpep'):
        pickup_col = f'{prefix}_pickup_datetime'
        dropoff_col = f'{prefix}_dropoff_datetime'
        if pickup_col in df.columns and dropoff_col in df.columns:
            break
    else:
        raise ValueError(
            "Expected tpep_* or lpep_* pickup/dropoff datetime columns in "
            f"{filename!r}"
        )

    # CSV files deliver timestamps as text; columns that are already datetime
    # typed are left untouched.
    for col in (pickup_col, dropoff_col):
        if not pd.api.types.is_datetime64_any_dtype(df[col]):
            df[col] = pd.to_datetime(df[col])

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    # Location IDs are cast to strings so that they are handled as categories
    # rather than numbers downstream.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Shell-style listing of the local data directory, to see which trip files are available.
ls data/

green_tripdata_2022-01.parquet  yellow_tripdata_2022-01.parquet
green_tripdata_2022-02.parquet  yellow_tripdata_2022-02.parquet


Read the data for January. How many columns are there?

In [4]:
# Read the raw January file directly (without the derived `duration` column);
# `.shape` is (rows, columns).
pd.read_parquet("data/yellow_tripdata_2022-01.parquet").shape

(2463931, 19)

In [5]:
# January is used for training and February for validation further down.
# `read_dataframe` adds the `duration` column (minutes) to each frame.
january_df = read_dataframe("data/yellow_tripdata_2022-01.parquet")
february_df = read_dataframe("data/yellow_tripdata_2022-02.parquet")

Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the standard deviation of the trip durations in January?

In [6]:
# Standard deviation of the January trip duration (in minutes), rounded to two decimals.
round(january_df.duration.std(), 2)

46.45

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records is left after you drop the outliers?

In [7]:
# Row/column count before the outlier filter is applied.
january_df.shape

(2463931, 20)

In [8]:
# Keep only trips lasting from 1 to 60 minutes; `between` includes both endpoints by default.
january_df = january_df[january_df.duration.between(1, 60)]
january_df.shape

(2421440, 20)

In [9]:
# Percentage of January trips kept by the duration filter:
# rows after filtering / rows before filtering (counts copied from the `.shape` outputs above).
2421440 / 2463931 * 100

98.27547930522405

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

    Turn the dataframe into a list of dictionaries
    Fit a dictionary vectorizer
    Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [10]:
# List the available columns before selecting the model features.
january_df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [25]:
# Training features: one dict per trip holding the pickup and dropoff location IDs.
X_train = january_df[["PULocationID", "DOLocationID"]].to_dict(orient='records')
# Training target: the trip duration in minutes, copied so it is independent of the frame.
y_train = january_df["duration"].copy()

In [26]:
# Vectorizer that turns the list of feature dicts into a feature matrix.
dv = DictVectorizer()

In [27]:
# Fit the vectorizer on the training records and encode them in one step.
# Note: `X_train` is rebound here from a list of dicts to the encoded feature matrix.
X_train = dv.fit_transform(X_train)

In [28]:
# Peek at the first ten feature names the vectorizer learned.
dv.feature_names_[:10]

['DOLocationID=1',
 'DOLocationID=10',
 'DOLocationID=100',
 'DOLocationID=101',
 'DOLocationID=102',
 'DOLocationID=105',
 'DOLocationID=106',
 'DOLocationID=107',
 'DOLocationID=108',
 'DOLocationID=109']

In [29]:
# Number of learned features, i.e. the number of columns in the feature matrix.
len(dv.feature_names_)

515

Now let's use the feature matrix from the previous step to train a model.

    Train a plain linear regression model with default parameters
    Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [30]:
# Plain linear regression with default parameters.
lr = LinearRegression()

In [31]:
# Train on the encoded January features against the trip duration.
lr.fit(X_train, y_train)

In [32]:
# Predict on the same data the model was trained on (used for the training RMSE).
y_pred = lr.predict(X_train)

In [33]:
# RMSE on the training data. The square root of the MSE is taken explicitly because
# the `squared=False` argument is deprecated and later removed in newer scikit-learn releases.
round(mean_squared_error(y_train, y_pred) ** 0.5, 2)

6.99

Now let's apply this model to the validation dataset (February 2022).

What's the RMSE on validation?

In [45]:
# Apply the same 1 to 60 minute duration filter to February (`between` includes both endpoints).
february_df = february_df[february_df.duration.between(1, 60)]
# Validation features: one dict per trip holding the pickup and dropoff location IDs.
X_val = february_df[["PULocationID", "DOLocationID"]].to_dict(orient='records')
# Validation target: the trip duration in minutes.
y_val = february_df["duration"].copy()

In [46]:
# Encode February with the vectorizer already fitted on January (`transform`, not
# `fit_transform`), so the validation matrix has the same columns the model was trained on.
X_val = dv.transform(X_val)

In [47]:
# Predict on the validation data; this overwrites the training predictions held in `y_pred`.
y_pred = lr.predict(X_val)

In [48]:
# RMSE on the validation data. The square root of the MSE is taken explicitly because
# the `squared=False` argument is deprecated and later removed in newer scikit-learn releases.
round(mean_squared_error(y_val, y_pred) ** 0.5, 2)

7.79