In [None]:
#import libraries

import os
import random
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Read the January 2021 FHV trip records (stored as Parquet) into a DataFrame.
df = pd.read_parquet(path="data/fhv_tripdata_2021-01.parquet")

In [None]:
# Report how many trips the January 2021 file contains.
record_count = df.shape[0]
print(f"Number of records in Jan 2021 FHV data {record_count}")

In [None]:
# Compute the duration of each trip in minutes.
# The vectorised `.dt` accessor replaces the per-row Python lambda, which is slow
# on a full month of trips; bracket assignment is used so the column is created
# and filled in a single, unambiguous step.
df["duration"] = (df["dropOff_datetime"] - df["pickup_datetime"]).dt.total_seconds() / 60

In [None]:
# Mean trip length for January, in minutes, rounded to two decimals.
mean_minutes = np.mean(df["duration"])
trip_duration = mean_minutes.round(2)
print(f"Average trip duration in January is {trip_duration} mins")

In [None]:
def read_dataframe(filename):
    """Load one month of FHV trip records and prepare it for modelling.

    Parameters
    ----------
    filename : str or path-like
        Path to the trip file. Names ending in ".csv" are read with
        ``pd.read_csv`` and their pickup/drop-off columns are parsed as
        datetimes; anything else is read with ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The trips lasting between 1 and 60 minutes (inclusive), with an added
        ``duration`` column in minutes and with ``PUlocationID`` /
        ``DOlocationID`` converted to strings, missing IDs being filled with
        -1 before the conversion.
    """
    categorical = ["PUlocationID", "DOlocationID"]

    if str(filename).endswith(".csv"):
        df = pd.read_csv(filename)
        # CSV carries no dtypes, so the timestamps have to be parsed explicitly.
        df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])
        df["dropOff_datetime"] = pd.to_datetime(df["dropOff_datetime"])
    else:
        df = pd.read_parquet(filename)

    # Trip duration in minutes, computed column-wise instead of row by row.
    elapsed = df["dropOff_datetime"] - df["pickup_datetime"]
    df["duration"] = elapsed.dt.total_seconds() / 60

    # Keep trips of 1-60 minutes. The explicit copy makes the result an
    # independent frame, so the column assignment below is unambiguous.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    # Fill missing location IDs with -1 and treat the IDs as categorical
    # (string) values. Assigning the result back replaces the former
    # `df[col].fillna(..., inplace=True)`, a chained in-place call that does
    # not update the frame when pandas Copy-on-Write is enabled.
    df[categorical] = df[categorical].fillna(-1).astype(str)

    return df

In [None]:
# Load the training and validation sets: January is used for training,
# February for validation.
train, val = map(
    read_dataframe,
    ("data/fhv_tripdata_2021-01.parquet", "data/fhv_tripdata_2021-02.parquet"),
)

In [None]:
# Preview the first five rows of the prepared training data.
train.head(n=5)

In [None]:
# What percentage of pickup location IDs was missing?
# Missing IDs are filled with -1 before the string conversion in
# read_dataframe, so they appear as "-1.0" (float column) or "-1" (integer column).
missing_labels = ["-1", "-1.0"]

# Share of each pickup location label, rounded to three decimals.
missing_values = np.around(train["PUlocationID"].value_counts(normalize=True), 3)

# Select the sentinel by label. The previous `missing_values[0]` relied on
# positional fallback for a string-labelled Series and on the sentinel being
# the most frequent value.
missing_fraction = missing_values[missing_values.index.isin(missing_labels)].sum()
print(f"Fraction of missing values for pickup location ID: {missing_fraction * 100:.1f} %")

In [None]:
# One-hot encode the categorical location IDs with a DictVectorizer.
features = ["PUlocationID", "DOlocationID"]

# DictVectorizer consumes one {column: value} dict per row.
train_dicts = train[features].to_dict(orient="records")
val_dicts = val[features].to_dict(orient="records")

# Fit the encoder on the training rows only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
# Shapes of the encoded training and validation matrices.
train_rows, train_cols = X_train.shape
val_rows, val_cols = X_val.shape
print(f"The train dataset has {train_rows} rows and {train_cols} columns")
print(f"The validation dataset has {val_rows} rows and {val_cols} columns")

In [None]:
# Target vectors: the trip duration of each split, as NumPy arrays.
target = 'duration'
y_train, y_val = (split[target].values for split in (train, val))

In [None]:
# Fit a linear regression on the one-hot encoded training data and report its
# training error.
lr = LinearRegression()
lr.fit(X_train, y_train)

train_pred = lr.predict(X_train)

# RMSE is taken as the square root of the MSE: the `squared=False` shortcut is
# deprecated in recent scikit-learn releases, and the blanket warnings filter
# at the top of the notebook hides that warning. Arguments follow the
# documented (y_true, y_pred) order.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print(f"Train rmse: {train_rmse}")

In [None]:
# Compare the distributions of predicted and actual training durations.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale is its documented replacement. The bin count is fixed so the
# automatic bin-width rule cannot explode on a large, heavily repeated sample.
fig, ax = plt.subplots()
sns.histplot(train_pred, bins=50, kde=True, stat="density", label="prediction", ax=ax)
sns.histplot(y_train, bins=50, kde=True, stat="density", label="actual", ax=ax)

ax.set(xlabel="Trip duration (minutes)", ylabel="Density")
ax.legend();

In [None]:
# How does the model perform on the validation (February) data?
y_pred = lr.predict(X_val)

# Square root of the MSE instead of the deprecated `squared=False` argument,
# so the cell runs on both older and current scikit-learn releases.
val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Val rmse: {val_rmse}")

In [None]:
# Persist the fitted vectorizer and model together in a single pickle file.
# open() does not create missing directories, so make sure the target exists.
os.makedirs("models", exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f:
    pickle.dump((dv, lr), f)