In [143]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [144]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

How many records are there for January?

In [145]:
# Columns treated as categorical features: pickup and drop-off location IDs.
categorical = ['PUlocationID', 'DOlocationID']
# Name of the target column (trip duration, computed in `load`).
# NOTE(review): not referenced by the visible cells below, which read
# `.duration` directly — confirm whether it is still needed.
target = ['duration']

In [146]:
def load(
    file: str,
    categorical: list[str],
    min_duration: float = 1,
    max_duration: float = 60,
) -> pd.DataFrame:
    """Read a trip-data parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pickup, in minutes), keeps only
    trips whose duration lies in ``[min_duration, max_duration]`` (both ends
    inclusive), and converts the categorical columns to strings.

    Parameters
    ----------
    file:
        Path to a parquet file containing ``pickup_datetime`` and
        ``dropOff_datetime`` columns plus every column named in
        ``categorical``.
    categorical:
        Columns to encode as strings. Missing values become ``'-1'``.
    min_duration, max_duration:
        Inclusive duration bounds in minutes. The defaults (1 and 60)
        reproduce the original hard-coded filter.

    Returns
    -------
    pd.DataFrame
        A filtered copy of the data with the extra ``duration`` column.
    """
    df = pd.read_parquet(file)
    df['duration'] = (
        df['dropOff_datetime'] - df['pickup_datetime']
    ).dt.total_seconds() / 60

    # `between` is inclusive on both ends, matching `>= min` & `<= max`.
    in_range = df['duration'].between(min_duration, max_duration)
    df = df[in_range].copy()

    # Cast through int first so IDs read as floats (e.g. 12.0) become '12',
    # not '12.0'; missing IDs are mapped to the sentinel -1 beforehand.
    df[categorical] = df[categorical].fillna(-1).astype(int).astype(str)

    return df


In [147]:
# Training set comes from the 2021-01 file, validation set from the 2021-02 file.
# Paths are relative to the notebook's working directory.
df_train = load('data/fhv_tripdata_2021-01.parquet', categorical)
df_val = load('data/fhv_tripdata_2021-02.parquet', categorical)

In [154]:
# Sanity check: after `load`, the categorical ID columns should be `object`
# (strings) and `duration` should be float64.
df_val.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                      object
DOlocationID                      object
SR_Flag                           object
Affiliated_base_number            object
duration                         float64
dtype: object

In [149]:
# Turn each row's categorical columns into a {column: value} dict, the input
# format expected by DictVectorizer. Same conversion for both splits.
train_dicts, val_dicts = (
    frame[categorical].to_dict(orient='records')
    for frame in (df_train, df_val)
)

In [150]:
# Targets: trip duration in minutes for each split.
y_train = df_train['duration'].values
y_val = df_val['duration'].values

# Encode the string-valued location IDs as indicator features. The vectorizer
# is fitted on the training dicts only and then reused for validation, so both
# matrices share the same feature columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [151]:
# Fit an ordinary least-squares model on the vectorized training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [156]:
# RMSE on the training set (same units as the target: minutes).
# The square root is taken explicitly instead of passing `squared=False`,
# which is deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred = lr.predict(X_train)
mean_squared_error(y_train, y_pred) ** 0.5

10.528519426833792

In [157]:
# RMSE on the validation set (same units as the target: minutes).
# The square root is taken explicitly instead of passing `squared=False`,
# which is deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred) ** 0.5

11.014285904950842