In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    """Load a trip-records file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path to the input file. The extension selects the reader:
        ``.csv`` uses ``pd.read_csv`` and ``.parquet`` uses ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The frame as read from disk. For CSV input the
        ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime`` columns are
        converted with ``pd.to_datetime``. No duration column, outlier
        filtering, or categorical casting is applied here.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # The CSV branch converts these two columns explicitly after reading.
        # NOTE(review): these are the only column names handled here; a CSV
        # with differently named datetime columns will fail on this lookup.
        for column in ('lpep_dropoff_datetime', 'lpep_pickup_datetime'):
            df[column] = pd.to_datetime(df[column])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed on `return df` with an
        # UnboundLocalError; fail early with an actionable message instead.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected a '.csv' or '.parquet' file"
        )

    return df

# Q1. Downloading the data

We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "For-Hire Vehicle Trip Records".

Download the data for January and February 2021.

Note that you need "For-Hire Vehicle Trip Records", not "High Volume For-Hire Vehicle Trip Records".

Read the data for January. How many records are there?

In [3]:
# Load the January 2021 trip file as the training frame.
df_train = read_dataframe(filename='./data/fhv_tripdata_2021-01.parquet')

In [4]:
# Number of records (rows) in the training frame.
df_train.shape[0]

1154112

# Q2. Computing duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the average trip duration in January?

In [5]:
# Trip duration in minutes. Dividing the timedelta column by a one-minute
# Timedelta is a single vectorized operation, replacing the per-row Python
# lambda that called total_seconds() on every record.
df_train['duration'] = (
    (df_train.dropOff_datetime - df_train.pickup_datetime) / pd.Timedelta(minutes=1)
)

In [6]:
# Average trip duration (minutes) over all loaded records.
df_train['duration'].mean()

19.167224093791006

# Data preparation

Check the distribution of the duration variable. There are some outliers.

Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

How many records did you drop?

In [7]:
# Remember the row count before outlier removal so the drop can be measured.
original_len = df_train.shape[0]

In [8]:
# Keep only trips lasting between 1 and 60 minutes; both bounds are inclusive
# (Series.between is inclusive on both sides by default).
df_train = df_train[df_train['duration'].between(1, 60)]

In [9]:
# Number of records removed by the duration filter.
original_len - df_train.shape[0]

44286

# Q3. Missing values

The features we'll use for our model are the pickup and dropoff location IDs.

But these columns have a lot of missing values. Let's replace them with "-1".

What's the fraction of missing values for the pickup location ID? I.e., the fraction of "-1"s after you filled the NAs.

In [10]:
# Sentinel used for missing pickup / dropoff location IDs.
values = {
    'PUlocationID': -1,
    'DOlocationID': -1,
}
# Reassign instead of using inplace=True: df_train comes from a boolean
# filter, and in-place mutation of such a frame can emit
# SettingWithCopyWarning (on pandas versions without copy-on-write).
df_train = df_train.fillna(value=values)

In [11]:
# Share of rows whose pickup location ID equals the -1 sentinel.
df_train['PUlocationID'].eq(-1).mean()

0.8352732770722617

# Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it

What's the dimensionality of this matrix? (The number of columns).

In [12]:
# The two location-ID columns are the only model features.
categorical = ['PUlocationID', 'DOlocationID']
# Cast to str so DictVectorizer one-hot encodes the IDs, then emit one dict per row.
train_dicts = (
    df_train.loc[:, categorical]
    .astype(str)
    .to_dict(orient='records')
)

In [13]:
# Fit the vectorizer on the training dicts and build the training feature matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(X=train_dicts)

In [14]:
# Number of columns in the feature matrix (last axis of the 2-D shape).
X_train.shape[-1]

525

# Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [15]:
# Regression target: the trip duration column, as a NumPy array.
target = 'duration'
y_train = df_train[target].to_numpy()

In [16]:
# Plain linear regression with default parameters.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data. The `squared=False` argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# directly; this works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

10.528519107212658

# Q6. Evaluating the model

Now let's apply this model to the validation dataset (Feb 2021).

What's the RMSE on validation?

In [17]:
# Load the February 2021 trip file and apply the same preparation steps as
# the training frame.
df_val = read_dataframe('./data/fhv_tripdata_2021-02.parquet')
# Duration in minutes via vectorized timedelta division (no per-row lambda).
df_val['duration'] = (df_val.dropOff_datetime - df_val.pickup_datetime) / pd.Timedelta(minutes=1)
# Keep trips between 1 and 60 minutes, inclusive.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]
# Reassign rather than fill in place: the filtered frame can raise
# SettingWithCopyWarning when mutated with inplace=True.
df_val = df_val.fillna(value=values)

In [18]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no refit).
val_dicts = (
    df_val.loc[:, categorical]
    .astype(str)
    .to_dict(orient='records')
)
X_val = dv.transform(X=val_dicts)

In [19]:
# Validation target values as a NumPy array.
y_val = df_val[target].to_numpy()

In [20]:
y_pred = lr.predict(X_val)
# RMSE on the validation data. `squared=False` is deprecated in scikit-learn
# 1.4 and removed in 1.6, so compute the root of the MSE explicitly.
mean_squared_error(y_val, y_pred) ** 0.5

11.014283205413125