In [60]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

Data: NYC TLC trip record data — https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [6]:
# Load the January and February 2022 Yellow Taxi trip records from local parquet files.
# NOTE(review): df_feb is not referenced by any later cell shown here (February is
# re-read through read_df for validation) — confirm it is still needed.
df_jan = pd.read_parquet("./data/yellow_tripdata_2022-01.parquet")
df_feb = pd.read_parquet("./data/yellow_tripdata_2022-02.parquet")

In [10]:
# Preview three random rows; the fixed seed makes the preview reproducible on re-run.
df_jan.sample(3, random_state=42)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
672494,1,2022-01-10 16:27:17,2022-01-10 16:38:14,1.0,2.7,1.0,N,114,233,1,10.5,3.5,0.5,1.0,0.0,0.3,15.8,2.5,0.0
2070814,2,2022-01-27 15:05:24,2022-01-27 15:41:29,1.0,4.78,1.0,N,236,144,1,24.0,0.0,0.5,6.82,0.0,0.3,34.12,2.5,0.0
690767,1,2022-01-10 20:32:02,2022-01-10 20:39:39,1.0,1.5,1.0,N,90,48,1,7.5,3.0,0.5,1.0,0.0,0.3,12.3,2.5,0.0


Count columns

In [9]:
# Show the column labels of the January frame.
df_jan.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [8]:
# Number of columns = second entry of the frame's (rows, columns) shape.
df_jan.shape[1]

19

Standard deviation of trip duration (in minutes) in the January 2022 Yellow Taxi trip data

In [38]:
# Ensure both trip timestamps are pandas datetimes before doing arithmetic on them.
df_jan["tpep_pickup_datetime"] = pd.to_datetime(df_jan["tpep_pickup_datetime"])
df_jan["tpep_dropoff_datetime"] = pd.to_datetime(df_jan["tpep_dropoff_datetime"])

In [39]:
# Trip duration as a timedelta: drop-off time minus pick-up time.
df_jan["duration"] = df_jan["tpep_dropoff_datetime"].sub(df_jan["tpep_pickup_datetime"])

In [41]:
# Trip duration in minutes, computed straight from the two timestamp columns so
# that re-running this cell does not depend on the current dtype of "duration".
# Dividing a timedelta column by a one-minute Timedelta is vectorized, which
# avoids calling a Python lambda once per row.
df_jan["duration"] = (
    df_jan["tpep_dropoff_datetime"] - df_jan["tpep_pickup_datetime"]
) / pd.Timedelta(minutes=1)

In [43]:
# Spread of the trip durations (same unit as the "duration" column).
df_jan["duration"].std()

46.44530513776802

Percentage of records left after dropping outliers (trips shorter than 1 minute or longer than 60 minutes)

In [48]:
# Percentage of trips whose duration lies in [1, 60] (both bounds inclusive).
int(df_jan["duration"].between(1, 60).sum()) / len(df_jan) * 100

98.27547930522405

In [49]:
# Keep only trips lasting between 1 and 60 (both bounds inclusive).
df_jan = df_jan[df_jan["duration"].between(1, 60)]

Dimensionality of the feature matrix after one-hot encoding (OHE) the pickup and drop-off location IDs

In [51]:
# Location IDs are cast to strings before being turned into feature dicts.
categorical = ["PULocationID", "DOLocationID"]
# Rebind df_jan to the frame returned by astype() instead of assigning columns
# into it: df_jan is the result of a boolean filter, and writing columns into
# such a slice raises pandas' SettingWithCopyWarning.
df_jan = df_jan.astype(dict.fromkeys(categorical, str))

In [52]:
# One {column: value} dict per trip, restricted to the categorical columns.
train_dicts = df_jan.loc[:, categorical].to_dict(orient="records")

In [54]:
# Fit the vectorizer on the training dicts and build the training feature matrix.
# The fitted `dv` is reused later to encode the validation data.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [55]:
# Display the matrix summary (its shape gives the number of encoded features).
X_train

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

In [57]:
# Name of the target column and the training targets as a NumPy array.
target = "duration"
y_train = df_jan[target].to_numpy()

Train model

In [58]:
# Fit a plain linear regression on the encoded training features.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [61]:
# Predictions on the same rows the model was fitted on.
preds = lr.predict(X=X_train)

In [63]:
# RMSE on the training data. The square root is taken explicitly because the
# `squared=False` argument is deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both older and newer releases.
mean_squared_error(y_train, preds) ** 0.5

6.986190837370544

Validation on the February 2022 data

In [64]:
def read_df(
    path: str,
    feat_cols: list[str] = ["PULocationID", "DOLocationID"],
    min_duration: float = 1,
    max_duration: float = 60,
) -> pd.DataFrame:
    """Load a trip parquet file and prepare it for modelling.

    Steps: parse the pickup/drop-off timestamps, add a ``duration`` column
    (drop-off minus pickup, in minutes), keep only rows whose duration lies in
    ``[min_duration, max_duration]`` (both bounds inclusive), and cast the
    feature columns to strings.

    Args:
        path: Location of the parquet file, passed to ``pd.read_parquet``.
        feat_cols: Columns to cast to ``str``. The default list is only read,
            never mutated.
        min_duration: Smallest duration (minutes) to keep. Defaults to 1.
        max_duration: Largest duration (minutes) to keep. Defaults to 60.

    Returns:
        A new DataFrame with the original columns plus ``duration``, keeping the
        original index labels of the rows that passed the filter.
    """
    df = pd.read_parquet(path)
    pickup = pd.to_datetime(df["tpep_pickup_datetime"])
    dropoff = pd.to_datetime(df["tpep_dropoff_datetime"])
    df = df.assign(
        tpep_pickup_datetime=pickup,
        tpep_dropoff_datetime=dropoff,
        # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
        duration=(dropoff - pickup) / pd.Timedelta(minutes=1),
    )
    df = df[df["duration"].between(min_duration, max_duration)]
    # astype() returns a new frame, so no columns are written into the filtered
    # slice above (which would raise SettingWithCopyWarning).
    return df.astype(dict.fromkeys(feat_cols, str))


# Build the validation frame from the February file with the same preparation steps.
df_val = read_df(path="./data/yellow_tripdata_2022-02.parquet")

In [65]:
# Number of validation rows = first entry of the frame's (rows, columns) shape.
df_val.shape[0]

2918187

In [66]:
# Validation targets, plus validation features encoded with the vectorizer that
# was already fitted on the training data (transform only, no refit).
y_val = df_val[target].to_numpy()
val_dicts = df_val.loc[:, categorical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [69]:
val_preds = lr.predict(X_val)
# RMSE on the validation data. The square root is taken explicitly because the
# `squared=False` argument is deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both older and newer releases.
mean_squared_error(y_val, val_preds) ** 0.5

7.786409085078911