In [73]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pickle

In [74]:

def read_from_file(filename):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    trips whose duration is between 1 and 60 minutes inclusive, and casts the
    pickup / drop-off location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Passed straight to ``pd.read_parquet``. The file must provide the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        An independent frame (not a view of the raw data) with the filtered
        rows, the extra ``duration`` column and string-typed location IDs.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes; avoids one Python-level call per row.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the raw frame so the column
    # assignment below (and any column the caller adds later) is a plain
    # assignment instead of a write to a slice.
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df[in_range].copy()

    catagorical = ['PULocationID', 'DOLocationID']
    df[catagorical] = df[catagorical].astype(str)
    return df

In [75]:
# Two separate files are read: the first becomes the training frame, the
# second the validation frame.
df_train, df_val = map(
    read_from_file,
    (
        './data/green_tripdata_2021-01.parquet',
        './data/green_tripdata_2021-02.parquet',
    ),
)

In [76]:
# Row counts of the training and validation frames after filtering.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [77]:
# Combine pickup and drop-off IDs into one "<pickup>_<dropoff>" feature so the
# model sees each route as a single category.
for _frame in (df_train, df_val):
    _frame['PU_DO'] = _frame['PULocationID'] + '_' + _frame['DOLocationID']

In [78]:
# Feature columns. Alternative categorical set (separate columns instead of
# the combined pair): ['PULocationID', 'DOLocationID']
catagorical = ['PU_DO']
numerical = ['trip_distance']
feature_cols = catagorical + numerical

# Fit the vectorizer on the training records only ...
dv = DictVectorizer()
train_dict = df_train[feature_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# ... and reuse the fitted vectorizer, without refitting, for validation.
val_dict = df_val[feature_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [79]:
# sns.distplot(df_train['duration'])
# plt.show()

In [80]:
# Regression target: the `duration` column of each frame as a NumPy array.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [65]:
# Baseline model: plain linear regression fitted on the training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
# NOTE: despite its name, `r2` holds the RMSE (same units as `duration`),
# not the R² score.
r2 = mean_squared_error(y_val, y_pred) ** 0.5
r2

7.479562160810692

In [83]:
# Store the fitted vectorizer and the model together as one (dv, model) tuple.
# NOTE(review): `lr` is whichever estimator is bound to that name when this
# cell runs; make sure it is the fitted LinearRegression before saving.
model_bundle = (dv, lr)
with open('linear_regression.bin', 'wb') as f_out:
    pickle.dump(model_bundle, f_out)

In [66]:
# sns.distplot(y_val, label='Real')
# sns.distplot(y_pred, label='Pred')
# plt.legend()
# plt.show()

In [70]:
# L1-regularised variant. It gets its own name so that `lr` keeps pointing at
# the LinearRegression model; rebinding `lr` here would silently change what
# the pickle cell writes to 'linear_regression.bin' when cells are run out of
# order.
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)

y_pred = lasso.predict(X_val)

# Validation RMSE, with the root taken explicitly: the `squared=False` flag
# was removed from mean_squared_error in scikit-learn 1.6.
# NOTE: despite its name, `r2` holds the RMSE, not the R² score.
r2 = mean_squared_error(y_val, y_pred) ** 0.5
r2

9.233436225720547

In [69]:
# L2-regularised variant. It gets its own name so that `lr` keeps pointing at
# the LinearRegression model; rebinding `lr` here would silently change what
# the pickle cell writes to 'linear_regression.bin' when cells are run out of
# order.
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_val)

# Validation RMSE, with the root taken explicitly: the `squared=False` flag
# was removed from mean_squared_error in scikit-learn 1.6.
# NOTE: despite its name, `r2` holds the RMSE, not the R² score.
r2 = mean_squared_error(y_val, y_pred) ** 0.5
r2

11.342569094090537