In [67]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### Reading data

In [70]:
# Load the January and February 2021 trip records from local parquet files.
jan, feb = (
    pd.read_parquet(path)
    for path in ('fhv_tripdata_2021-01.parquet', 'fhv_tripdata_2021-02.parquet')
)

In [62]:
def prepare_duration(df):
    """Add a ``duration`` column holding each trip's length in minutes.

    The value is ``dropOff_datetime - pickup_datetime`` converted to minutes.
    The column is written onto ``df`` itself (in-place mutation) and nothing
    is returned, so callers invoke this for its side effect only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_datetime`` and ``dropOff_datetime`` columns
        whose difference is a timedelta Series.
    """
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    # Vectorised equivalent of mapping ``total_seconds() / 60`` over each row.
    df['duration'] = elapsed.dt.total_seconds() / 60

In [84]:
def prepare_features(df, min_duration=1, max_duration=60):
    """Filter trips by duration and turn the location IDs into string categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``duration``, ``PUlocationID`` and ``DOlocationID``.
    min_duration, max_duration : numeric, optional
        Inclusive bounds on ``duration``; rows outside the range are dropped.
        Defaults (1 and 60) reproduce the original hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is left untouched. Missing location IDs
        are filled with -1 before the string cast, so on float ID columns the
        placeholder is rendered as ``'-1.0'``.
    """
    in_range = df['duration'].between(min_duration, max_duration)
    # Explicit copy: the column assignments below must not target a view of the
    # caller's frame (that is what triggers SettingWithCopyWarning).
    df = df.loc[in_range].copy()

    for col in ('PUlocationID', 'DOlocationID'):
        df[col] = df[col].fillna(-1).astype(str)

    return df

### Preparing train data

In [72]:
# Adds the 'duration' column (in minutes) to `jan`; the frame is modified in
# place and the call returns nothing.
prepare_duration(jan)

In [73]:
# Average trip length in minutes, computed before the duration filter is applied.
jan['duration'].mean()

19.167224093791006

In [85]:
# Keep only trips lasting between 1 and 60 minutes (inclusive) and convert the
# pickup/drop-off location IDs to strings, filling missing IDs with -1 first.
# NOTE: rebinding `jan` discards the unfiltered frame.
jan = prepare_features(jan)

In [90]:
# Share of the filtered January trips whose pickup location ID was missing
# (the -1 placeholder appears as the string '-1.0' after the cast).
missing_pickup = jan['PUlocationID'] == '-1.0'
len(jan[missing_pickup]) / len(jan)

0.8352732770722617

### Preparing model data

In [92]:
# One-hot encode the pickup/drop-off location IDs for the training month.
# The record dicts get their own name and are what the vectoriser is fitted
# on; the previous code passed an undefined `features` variable, which raises
# NameError on a fresh kernel.
train_dicts = jan[['PUlocationID', 'DOlocationID']].to_dict(orient='records')
vec = DictVectorizer()
X = vec.fit_transform(train_dicts)
y = jan.duration.values
X.shape

(1109826, 525)

In [93]:
# Fit a linear regression on the January matrix and report the RMSE on that
# same training data.
lr = LinearRegression().fit(X, y)
preds = lr.predict(X)
train_mse = mean_squared_error(y, preds)
np.sqrt(train_mse)

10.528519425310185

In [94]:
# Adds the 'duration' column (in minutes) to `feb`; the frame is modified in
# place and the call returns nothing.
prepare_duration(feb)

In [95]:
# Apply the same duration filter and location-ID string conversion to the
# February frame that was applied to January.
feb = prepare_features(feb)

In [98]:
# Encode February with the vectoriser already fitted on January: transform
# only, never re-fit, so the columns line up with the training matrix.
X_val = vec.transform(
    feb[['PUlocationID', 'DOlocationID']].to_dict(orient='records')
)
y_val = feb['duration'].values

In [99]:
# Score the January-trained model on the February trips (validation RMSE).
preds = lr.predict(X_val)
val_mse = mean_squared_error(y_val, preds)
np.sqrt(val_mse)

11.014285828610237