In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

#### FUNCTION TO READ AND PREPROCESS DATA

In [2]:
# Location-ID columns that are turned into string categories for encoding.
categorical = ['PUlocationID', 'DOlocationID']

def read_dataframe(filename):
    '''Load a trip file and build the duration-filtered modelling frame.

    Parameters
    ----------
    filename : str or path-like
        Path ending in ``.csv`` or ``.parquet``.

    Returns
    -------
    df : pandas.DataFrame
        Every row that was read, with a ``duration`` column (minutes) added.
    df_new : pandas.DataFrame
        Independent copy of the rows whose duration lies between 1 and 60
        minutes inclusive. Missing ``PUlocationID`` / ``DOlocationID`` values
        are replaced by -1 and both columns are stored as strings.

    Raises
    ------
    ValueError
        If ``filename`` ends in neither ``.csv`` nor ``.parquet``.
    KeyError
        If an expected timestamp or location column is absent.
    '''
    path = str(filename)

    if path.endswith('.csv'):
        df = pd.read_csv(path)
        timestamps_are_text = True
    elif path.endswith('.parquet'):
        df = pd.read_parquet(path)
        timestamps_are_text = False
    else:
        # Fail with a clear message instead of hitting an unbound `df` below.
        raise ValueError(
            f'Unsupported file type for {path!r}: expected .csv or .parquet'
        )

    # The drop-off column may be spelled 'dropOff_datetime' or
    # 'dropoff_datetime'; use whichever the file actually provides so both
    # input formats go through the same code path.
    if 'dropOff_datetime' in df.columns:
        dropoff_col = 'dropOff_datetime'
    else:
        dropoff_col = 'dropoff_datetime'

    if timestamps_are_text:
        # read_csv leaves timestamps as text; parse them before subtracting.
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
        df[dropoff_col] = pd.to_datetime(df[dropoff_col])

    # Trip length as a float number of minutes.
    df['duration'] = df[dropoff_col] - df['pickup_datetime']
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

    df_new = df.query('duration >= 1 & duration <= 60').copy()

    # Assign the result back to the column: `Series.fillna(inplace=True)` on
    # an attribute lookup is a chained mutation that is not guaranteed to
    # reach `df_new`.
    for column in categorical:
        df_new[column] = df_new[column].fillna(-1).astype('int').astype('str')

    return df, df_new

#### READ DATASET

In [3]:
# January 2021 file, used below as the training set.
# `df` is every row read; `df_new` is the 1-60 minute subset with string location IDs.
df, df_new = read_dataframe('data/fhv_tripdata_2021-01.parquet')

#### NUMBER OF RECORDS DROPPED

In [4]:
# Rows excluded by the 1-60 minute duration filter applied in read_dataframe.
len(df) - len(df_new)

44286

#### PERCENTAGE OF MISSING VALUES IN PUlocationID

In [5]:
# Percentage share of each pickup ID among the filtered trips.
# read_dataframe writes -1 where the ID was missing, so the '-1' row is the
# missing-value percentage (measured after the duration filter).
df_new['PUlocationID'].value_counts(normalize=True) * 100

-1     83.527328
221     0.750568
206     0.612438
129     0.484671
115     0.367805
         ...    
111     0.000451
27      0.000360
34      0.000270
2       0.000180
110     0.000090
Name: PUlocationID, Length: 262, dtype: float64

#### ENCODING CATEGORICAL VARIABLES

In [6]:
# read_dataframe already stores these columns as strings; the cast is repeated
# here so this cell does not depend on that detail.
df_new[categorical] = df_new[categorical].astype('str')

# One {column: value} dict per trip, the input format DictVectorizer takes.
train_dicts = df_new[categorical].to_dict(orient='records')

dv = DictVectorizer()

# Fit on the training trips only; `dv` is reused unchanged for validation.
X_train = dv.fit_transform(train_dicts)
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

#### MODELING

In [7]:
# Regress trip duration (minutes) on the encoded pickup/drop-off features.
target = 'duration'
y_train = df_new[target].values

# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

#### MODEL EVALUATION

In [8]:
# In-sample predictions for the training trips.
y_train_pred = lr.predict(X_train)

# Take the square root of the MSE explicitly: the `squared=False` shortcut is
# deprecated in recent scikit-learn releases and then removed, whereas this
# form runs on old and new versions alike and yields the same RMSE.
rmse = float(np.sqrt(mean_squared_error(y_train, y_train_pred)))
rmse

10.528519107204893

#### READ VALIDATION DATASET

In [9]:
# February 2021 file, used below as the validation set.
# Note: this rebinds `df`, so the January raw frame is no longer reachable
# after this cell runs.
df, df_val = read_dataframe('data/fhv_tripdata_2021-02.parquet')

#### NUMBER OF RECORDS DROPPED

In [10]:
# Validation rows excluded by the 1-60 minute duration filter.
len(df) - len(df_val)

47579

#### ENCODING CATEGORICAL VARIABLES

In [11]:
# Same per-trip dict representation as for the training data.
val_dicts = df_val[categorical].to_dict(orient='records')

# transform (not fit_transform): reuse the vectorizer fitted on the training
# data so the validation matrix has the same columns as X_train.
X_val = dv.transform(val_dicts)
X_val

<990113x525 sparse matrix of type '<class 'numpy.float64'>'
	with 1980223 stored elements in Compressed Sparse Row format>

In [12]:
# Same target column as for training.
target = 'duration'

# Observed validation durations, in minutes.
y_val = df_val[target].values

In [13]:
# Predictions for the validation trips from the model fitted on the training month.
y_val_pred = lr.predict(X_val)

# Take the square root of the MSE explicitly: the `squared=False` shortcut is
# deprecated in recent scikit-learn releases and then removed, whereas this
# form runs on old and new versions alike and yields the same RMSE.
rmse = float(np.sqrt(mean_squared_error(y_val, y_val_pred)))
rmse

11.014283140085958