In [1]:
import pandas as pd
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import time 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the January 2023 yellow-taxi trip records; this frame is the source of the
# training features/target built in the cells below.
df_jan23 = pd.read_parquet("yellow_tripdata_2023-01.parquet")
# Preview the first rows to sanity-check columns and dtypes.
df_jan23.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [3]:
# Number of columns in the raw January frame (length of its column index).
cols = len(df_jan23.columns)
print("Columns in Jan 23 Dataset: {}".format(cols))

Columns in Jan 23 Dataset: 19


In [4]:
# Trip length as a timedelta Series: drop-off timestamp minus pick-up timestamp.
duration = df_jan23['tpep_dropoff_datetime']-df_jan23['tpep_pickup_datetime']

In [5]:
# Convert the timedelta Series to minutes, rounded to 2 decimals.
# The vectorised `.dt` accessor replaces a per-row Python lambda, which is very slow
# on a frame with millions of rows; missing timedeltas (NaT) still become NaN.
df_jan23['duration'] = (duration.dt.total_seconds() / 60).round(2)

In [6]:
# Population standard deviation (ddof=0) of the trip duration, in minutes.
duration_std = df_jan23['duration'].std(ddof=0)
standard_dev = round(duration_std, 2)
print("Standard Deviation of trips duration in January 2023: {} min".format(standard_dev))

Standard Deviation of trips duration in January 2023: 42.59 min


In [7]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive);
# copy so later column assignments do not touch the raw frame.
within_bounds = df_jan23['duration'].between(1, 60)
cdf = df_jan23[within_bounds].copy()

In [8]:
# Row counts before and after the duration filter.
orig_rows, clean_rows = len(df_jan23), len(cdf)

In [9]:
# Share of rows that survived the outlier filter, expressed as a percentage.
records_left = (clean_rows/orig_rows) * 100
print(f"Fraction of records left after we dropped outliers {records_left:.2f}%")

Fraction of records left after we dropped outliers 98.12%


In [10]:
# Pick-up / drop-off zone IDs are cast to strings before being turned into
# feature dicts for the vectorizer below.
locationID_cols = ['PULocationID', 'DOLocationID']
cdf[locationID_cols] = cdf[locationID_cols].astype({col: str for col in locationID_cols})

In [11]:
# One dict per trip ({'PULocationID': ..., 'DOLocationID': ...}); this list is the
# input handed to the DictVectorizer in the next cell.
col_dict = cdf[locationID_cols].to_dict(orient='records')

In [12]:
# Fit the vectorizer on the January records and encode them in one pass.
# `dict_vec` is kept because the same fitted instance is reused later to
# transform the February data.
dict_vec = DictVectorizer()
X_train = dict_vec.fit_transform(col_dict)

In [13]:
# (rows, feature columns) of the encoded training matrix.
X_train.shape

(3009173, 515)

In [13]:
# Regression target: trip duration in minutes, as a NumPy array aligned with X_train rows.
y_train = cdf['duration'].to_numpy()
y_train.shape

(3009173,)

In [14]:
# Plain least-squares linear regression with default settings, fitted on the
# January feature matrix and durations.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

LinearRegression()

In [15]:
# In-sample predictions, used below to compute the training RMSE.
y_train_pred = lr_model.predict(X_train)

In [16]:
# Root-mean-squared error on the training data.
# np.sqrt(MSE) is used instead of `squared=False`, which scikit-learn deprecated
# in 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
# RMSE carries the target's unit (minutes); it is not a percentage.
print("RMSE on Training Data: {:.2f} min".format(rmse))

RMSE on Training Data: 7.65%


In [17]:
# The January frames are no longer needed once the model is fitted;
# drop the references so their memory can be reclaimed.
del df_jan23, cdf

In [24]:
# df_feb23 = pd.read_parquet("yellow_tripdata_2023-02.parquet")
# df_feb23.head()


def ops_on_df(dataset_path, dv):
    """Load a trip-data parquet file and build its feature matrix and target.

    Applies the same preparation as the training cells: compute the trip
    duration in minutes, keep trips lasting 1 to 60 minutes (inclusive), cast
    the pick-up/drop-off location IDs to strings and encode them with an
    already-fitted vectorizer.

    Parameters
    ----------
    dataset_path : path or str
        Location of the parquet file, passed straight to ``pd.read_parquet``.
    dv : object
        Fitted vectorizer; only its ``transform`` method is called (it is not
        re-fitted here).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``dv.transform`` applied to the per-trip
        location dicts and ``y`` is the array of durations in minutes.
    """
    trips = pd.read_parquet(dataset_path)

    # Vectorised minutes computation; avoids a slow per-row Python lambda.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = (elapsed.dt.total_seconds() / 60).round(2)

    # Same outlier rule as the training data: 1 <= duration <= 60 minutes.
    trips_clean = trips[trips['duration'].between(1, 60)].copy()

    location_cols = ['PULocationID', 'DOLocationID']
    trips_clean[location_cols] = trips_clean[location_cols].astype(str)
    records = trips_clean[location_cols].to_dict(orient='records')

    features = dv.transform(records)
    target = trips_clean['duration'].values

    # Report the file actually processed rather than a hard-coded month.
    print(f"Processing complete for {dataset_path}")
    return features, target
    
    
# Build the February evaluation set, reusing the vectorizer fitted on January
# (`dict_vec`) so the feature columns line up with what the model was trained on.
X_train_feb, y_train_feb = ops_on_df("yellow_tripdata_2023-02.parquet", dict_vec) 

Processing Complete for feb file


In [27]:
# Evaluate the January-trained model on the February data.
y_test_pred = lr_model.predict(X_train_feb)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
feb_rmse = round(np.sqrt(mean_squared_error(y_train_feb, y_test_pred)), 2)
# RMSE carries the target's unit (minutes); it is not a percentage.
print(f"RMSE on testing/evaluation Dataset: {feb_rmse} min")

RMSE on testing/evaluation Dataset: 7.81%


In [28]:
# Evaluation is finished; release the February feature matrix and target as well.
del X_train_feb, y_train_feb

------------------------------------------- HOMEWORK COMPLETED ----------------------------------------------------------