In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    #categorical = ['PULocationID', 'DOLocationID']
    #df[categorical] = df[categorical].astype(str)
    
    return df

In [3]:
df_train = read_dataframe('../data/green_tripdata_2023-03.parquet')
df_val = read_dataframe('../data/green_tripdata_2023-04.parquet')

In [4]:
df_train.dtypes

VendorID                          int32
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
duration                        float64
dtype: object

In [5]:
df_train.isna().sum()

VendorID                     0
lpep_pickup_datetime         0
lpep_dropoff_datetime        0
store_and_fwd_flag        4435
RatecodeID                4435
PULocationID                 0
DOLocationID                 0
passenger_count           4435
trip_distance                0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
ehail_fee                69392
improvement_surcharge        0
total_amount                 0
payment_type              4435
trip_type                 4437
congestion_surcharge      4435
duration                     0
dtype: int64

In [6]:
from feature_engine.imputation import MeanMedianImputer

In [7]:
mmi = MeanMedianImputer(imputation_method="median")

In [8]:
mmi.fit(df_train)
df_train = mmi.transform(df_train)

In [9]:
df_train.isna().sum()

VendorID                     0
lpep_pickup_datetime         0
lpep_dropoff_datetime        0
store_and_fwd_flag        4435
RatecodeID                   0
PULocationID                 0
DOLocationID                 0
passenger_count              0
trip_distance                0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
ehail_fee                69392
improvement_surcharge        0
total_amount                 0
payment_type                 0
trip_type                    0
congestion_surcharge         0
duration                     0
dtype: int64

In [10]:
drop_c = ["store_and_fwd_flag", "ehail_fee"]
df_train = df_train.drop(drop_c, axis="columns")

In [11]:
categorical = [c for c in df_train.columns.values]
numerical = df_train["trip_distance"]

In [12]:
from feature_engine.selection import SelectByInformationValue

In [13]:
iv = SelectByInformationValue()
test = iv.fit(df_train[categorical], df_train[numerical])

KeyError: "None of [Index([ 2.36,  0.78,  0.78,  1.66,  3.14,  5.69,  2.92,  3.34,  1.75,  0.74,\n       ...\n        1.77,  2.72,  0.65,  1.29,  9.35,  2.93,  8.49, 12.27,  4.42,  2.96],\n      dtype='float64', length=69392)] are in the [columns]"