In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

In [2]:
def get_data(path):
    """Load a parquet file and return it without the ``SR_Flag`` column.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The file's contents minus ``SR_Flag``.

    Raises
    ------
    KeyError
        If the file has no ``SR_Flag`` column.
    """
    trips = pd.read_parquet(path)
    return trips.drop(columns='SR_Flag')

In [3]:
def data_preparation(df):
    """Return a copy of ``df`` with a ``duration`` column in minutes.

    ``duration`` is ``dropOff_datetime - pickup_datetime`` converted to
    minutes (total seconds divided by 60).

    The input frame is left untouched: the result is a new DataFrame, so
    re-running a cell that calls this function never changes a frame that
    another cell still references.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``pickup_datetime`` and
        ``dropOff_datetime``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the ``duration`` column (replaced if already present).
    """
    trip_length = df['dropOff_datetime'] - df['pickup_datetime']
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(duration=trip_length.dt.total_seconds() / 60)

In [4]:
def get_entries(df):
    """Return the number of rows in ``df``."""
    return len(df)

In [5]:
def get_average_duration(df):
    """Return the mean of the ``duration`` column of ``df``."""
    durations = df['duration']
    return durations.mean()

In [6]:
def get_null_perc(df):
    """Summarise missing pickup IDs among trips lasting 1 to 60 minutes.

    Rows whose ``duration`` lies outside the inclusive range [1, 60] are
    discarded first; the input frame itself is not modified.

    Returns
    -------
    tuple
        ``(fraction, dropped)`` where ``fraction`` is the share (0-1, not a
        percentage) of remaining rows with a null ``PUlocationID`` and
        ``dropped`` is how many rows the duration filter removed.
    """
    total_rows = len(df)
    kept = df[df['duration'].between(1, 60)]

    missing_share = kept['PUlocationID'].isna().to_numpy().sum() / len(kept)
    return missing_share, total_rows - len(kept)

In [7]:
def feature_engineering(df):
    """Filter trips by duration and turn the location IDs into strings.

    Steps:
      1. keep rows with ``duration`` in the inclusive range [1, 60];
      2. replace missing ``PUlocationID`` / ``DOlocationID`` with ``-1``;
      3. cast both ID columns to ``str`` so they can be treated as
         categorical features.

    The caller's frame is not modified; a new DataFrame is returned.

    Note: when an ID column is float (the usual dtype for a numeric column
    that contains NaN), the resulting strings look like ``'12.0'`` and the
    placeholder becomes ``'-1.0'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``duration``, ``PUlocationID`` and ``DOlocationID``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with string-typed, NaN-free location IDs.
    """
    location_cols = ['PUlocationID', 'DOlocationID']

    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    # Explicit copy: the filtered frame is independent of the input, so the
    # column assignments below are unambiguous (no SettingWithCopyWarning).
    filtered = df.loc[in_range].copy()

    for col in location_cols:
        # Reassign the column instead of `filtered[col].fillna(..., inplace=True)`:
        # in-place fillna on a selected column is chained assignment and does
        # not update the frame under pandas Copy-on-Write.
        filtered[col] = filtered[col].fillna(-1).astype(str)

    return filtered
    

In [8]:
def get_metrics(X_train, X_test, y_train, y_test):
    """Fit a linear regression and print two RMSE figures.

    The first printed value, labelled "Training RMSE", is the mean RMSE of a
    2-fold cross-validation on the training data (not the error of the final
    fit on its own training set). The second is the RMSE of the model fitted
    on all of ``X_train`` and evaluated on ``X_test``.

    Nothing is returned; the results are only printed.
    """
    model = LinearRegression()

    cv_scores = cross_val_score(
        model, X_train, y_train, cv=2, scoring='neg_root_mean_squared_error'
    )
    # The scorer reports negated RMSE, so flip the sign of the fold average.
    cv_rmse = -np.mean(cv_scores)

    model.fit(X_train, y_train)
    print("Linear Regression Training RMSE: {0}".format(cv_rmse))

    test_mse = mean_squared_error(y_test, model.predict(X_test))
    print("Linear Regression Testing RMSE: {0}".format(test_mse**0.5))

In [9]:
# January 2021 is used for training, February 2021 for testing; each file is
# loaded and immediately given its `duration` column.
df_train = data_preparation(get_data('data/fhv_tripdata_2021-01.parquet'))
df_test = data_preparation(get_data('data/fhv_tripdata_2021-02.parquet'))

### Q1: Number of records in the January data

In [10]:
# Row count of the unfiltered January frame. Run this before the
# feature_engineering cell, which rebinds df_train to the filtered frame.
get_entries(df_train)

1154112

### Q2: Average trip duration in January (minutes)

In [11]:
# Mean duration in minutes over all January rows, i.e. before the
# 1-60 minute filter is applied to df_train.
get_average_duration(df_train)

19.167224093791006

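### Q3: Fraction of missing pickup location IDs (after the 1–60 minute duration filter) and number of rows dropped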

In [12]:
# Returns (fraction of null PUlocationID among 1-60 minute trips, rows dropped
# by that filter). Must run before the feature_engineering cell: afterwards
# df_train is already filtered and its IDs are NaN-free strings.
get_null_perc(df_train)

(0.8352732770722617, 44286)

### Q4: One-hot encoding of the location IDs and dimensionality of the feature matrix

In [13]:
# Apply the same duration filter and location-ID encoding to both splits.
df_train, df_test = map(feature_engineering, (df_train, df_test))

In [14]:
location_features = ['PUlocationID', 'DOlocationID']


def _as_records(frame):
    """Return the location-ID columns of ``frame`` as a list of row dicts."""
    return frame[location_features].to_dict(orient='records')


# The vectorizer is fitted on the training split only and then reused for the
# test split.
dv = DictVectorizer()
X_train = dv.fit_transform(_as_records(df_train))
X_test = dv.transform(_as_records(df_test))

y_train, y_test = df_train['duration'], df_test['duration']

# Number of columns in the encoded training matrix.
print(X_train.shape[1])

525


### Q5 & Q6: Linear regression RMSE (cross-validated on January, tested on February)

In [15]:
# Prints the 2-fold cross-validated RMSE on the January features (labelled
# "Training") and the RMSE on the February features (labelled "Testing").
get_metrics(X_train, X_test, y_train, y_test)

Linear Regression Training RMSE: 10.542642284026236
Linear Regression Testing RMSE: 11.01428315568312
