In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
taxi_trip_jan = pd.read_parquet("./data/yellow_tripdata_2023-01.parquet")

In [3]:
original_datapoints = taxi_trip_jan.shape[0]

In [4]:
print("The number of columns :", taxi_trip_jan.shape[1])

The number of columns : 19


In [5]:
categorical = ['PULocationID', 'DOLocationID']

In [6]:
taxi_trip_jan = taxi_trip_jan.dropna(subset=categorical)


In [7]:
taxi_trip_jan.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [8]:
taxi_trip_jan.tpep_dropoff_datetime = pd.to_datetime(taxi_trip_jan.tpep_dropoff_datetime) 

In [9]:
taxi_trip_jan.tpep_pickup_datetime =  pd.to_datetime(taxi_trip_jan.tpep_pickup_datetime)

In [10]:
taxi_trip_jan['duration'] = (taxi_trip_jan.tpep_dropoff_datetime - taxi_trip_jan.tpep_pickup_datetime).apply(lambda td: td.total_seconds()/60)

In [11]:
taxi_trip_jan.duration.std()

42.594351241920904

In [12]:
original_datapoints = taxi_trip_jan.duration.shape[0]

In [13]:
taxi_trip_jan =  taxi_trip_jan[(taxi_trip_jan.duration <= 60) & (taxi_trip_jan.duration >=1)]

In [14]:
current_datapoints = taxi_trip_jan.duration.shape[0]

In [15]:
dropped_outliers =  (original_datapoints- current_datapoints)/original_datapoints

In [16]:
dropped_outliers*100

1.8779717787402104

In [17]:
dv = DictVectorizer()

In [18]:
taxi_trip_jan.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [19]:
def preprocessor(df):
    

    
    categorical = ['PULocationID', 'DOLocationID']

    df.dropna(subset=categorical, inplace= True)

    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    df.tpep_pickup_datetime =  pd.to_datetime(df.tpep_pickup_datetime)
    df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).apply(lambda td: td.total_seconds()/60)
    df =  df[(taxi_trip_jan.duration <= 60) & (df.duration >=1)]
    target = 'duration'
    
    y = df[target].values
    df = df[categorical].astype(str)

    return df, y
    

In [20]:
df_train = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')
df_val = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')

In [21]:
df_train, y_train = preprocessor(df_train)
df_val , y_val= preprocessor(df_val)


  df =  df[(taxi_trip_jan.duration <= 60) & (df.duration >=1)]


In [22]:
categorical = ['PULocationID', 'DOLocationID']

train_dict = df_train.to_dict(orient="records")
val_dict = df_val.to_dict(orient="records")
combined_data = pd.concat([df_train, df_val], ignore_index=True)

combined_dict = combined_data.to_dict(orient="records")

dv = DictVectorizer()
dv.fit(combined_dict)

X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

In [23]:

# dv = DictVectorizer()
# X_train = dv.fit_transform(train_dict)
# X_val = dv.fit_transform(val_dict)

In [24]:

lr = LinearRegression()

In [25]:
lr.fit(X_train,y_train)


In [26]:
df_train.columns

Index(['PULocationID', 'DOLocationID'], dtype='object')

In [27]:



y_pred = lr.predict(X_train)

In [28]:

mean_squared_error(y_train,y_pred, squared= False)

7.649262227847301

In [29]:


y_preds= lr.predict(X_val)

In [30]:
mean_squared_error(y_val,y_preds, squared= False)

42.37495930398577