In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [12]:
# Load the January 2023 yellow-taxi trip records (the month the model is trained on).
df = pd.read_parquet('./yellow_tripdata_2023-01.parquet')

In [13]:
# Number of columns in the raw frame (second entry of the frame's shape).
print("Total Columns:", df.shape[1])

Total Columns: 19


In [14]:
# Print a summary of the frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [15]:
# Trip duration in minutes.
# The timedelta -> seconds conversion uses the vectorised `.dt` accessor rather
# than `.apply` with a Python lambda, which would make one Python call per row.
# The column is written with item assignment (`df['duration'] = ...`) so the
# cell always creates/overwrites a real column and can be re-run safely.
df['duration'] = (
    df.tpep_dropoff_datetime - df.tpep_pickup_datetime
).dt.total_seconds() / 60

In [16]:
# Spread of the trip durations (in minutes) over the unfiltered frame.
print("Standard Deviation", df['duration'].std())

Standard Deviation 42.59435124195458


In [17]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df_cleaned = df[df['duration'].between(1, 60)]

# Share of the original rows that survive the filter, as a percentage.
perc = df_cleaned.shape[0] / df.shape[0] * 100
print(f"Percentage of records left after outlier removal: {perc:.2f}%")

Percentage of records left after outlier removal: 98.12%


In [18]:
# DictVectorizer is already imported in the notebook's import cell, so it is
# not re-imported here.

# Cast the location IDs to strings so the vectorizer treats each ID as a
# category (one indicator column per distinct value) instead of a number.
data_dicts = df_cleaned[['PULocationID', 'DOLocationID']].astype(str).to_dict(orient='records')

# Fit the vectorizer on the training records; the fitted `dv` is reused later
# to encode other months with the same feature columns.
dv = DictVectorizer()
X_train = dv.fit_transform(data_dicts)

print("Feature matrix shape:", X_train.shape)

Feature matrix shape: (3009173, 515)


In [19]:
target = 'duration'
y_train = df_cleaned[target].values

# Fit a plain linear regression on the vectorized location features.
# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the same data it was fitted on (training error).
y_pred = lr.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 7.649261822035489


In [20]:
def transform_data(path, dv, target):
    """Load a parquet file of trip records and build model inputs from it.

    The frame read from ``path`` gets a ``duration`` column (minutes between
    pickup and drop-off), rows whose duration falls outside the inclusive
    range 1-60 minutes are dropped, and the pickup/drop-off location IDs of
    the remaining rows are encoded with ``dv``.

    Parameters
    ----------
    path : str
        Location of the parquet file to read.
    dv : object with a ``transform`` method
        Vectorizer used to encode the location-ID records. Only ``transform``
        is called here, never ``fit``.
    target : str
        Name of the column whose values are returned as ``y``
        (for example ``'duration'``).

    Returns
    -------
    X
        Whatever ``dv.transform`` returns for the list of
        ``{'PULocationID': str, 'DOLocationID': str}`` records.
    y : numpy.ndarray
        Values of the ``target`` column for the rows that were kept.
    """
    df = pd.read_parquet(path)

    # Vectorised timedelta -> minutes conversion; avoids calling a Python
    # lambda once per row, which is slow on multi-million-row files.
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Same outlier rule as the training data: keep trips of 1 to 60 minutes.
    df_cleaned = df[(df.duration >= 1) & (df.duration <= 60)]

    # IDs are cast to strings so the vectorizer treats them as categories.
    data_dicts = df_cleaned[['PULocationID', 'DOLocationID']].astype(str).to_dict(orient='records')
    X = dv.transform(data_dicts)
    y = df_cleaned[target].values
    return X, y

# Build the validation inputs from the February 2023 file, encoding it with the
# vectorizer `dv` that was fitted earlier in the notebook.
X_val, y_val = transform_data(
    path='./yellow_tripdata_2023-02.parquet',
    dv=dv,
    target='duration',
)


In [21]:
# Predict durations for the validation set and report its RMSE.
y_pred = lr.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 7.811821332387183
