In [64]:
import polars as pl
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error

import numpy as np

In [3]:
# Read the 2023-01 yellow trip-data parquet file into a polars DataFrame.
df = pl.read_parquet(Path("./data/yellow_tripdata_2023-01.parquet"))

In [4]:
# Number of columns in the raw frame.
print(df.width)

19


In [15]:
# Trip duration = drop-off timestamp minus pick-up timestamp.
df = df.with_columns(
    (pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime")).alias("duration")
)

In [18]:
# Standard deviation of the trip duration over the unfiltered frame.
df.select(pl.std("duration"))

duration
duration[ns]
42m 35s 661074517ns


In [28]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
filtered_df = df.filter(
    pl.col("duration").is_between(pl.duration(minutes=1), pl.duration(minutes=60))
)

In [32]:
# Percentage of rows that survive the duration filter.
filtered_df.height / df.height * 100

98.1220282212598

In [35]:
# Location IDs are treated as categories: cast them to strings so the
# vectorizer one-hot encodes them instead of using them as numeric values.
categorical = ["PULocationID", "DOLocationID"]
filtered_df = filtered_df.cast({name: pl.String for name in categorical})
# One dict per row, holding only the categorical columns.
train_dicts = filtered_df.select(categorical).to_dicts()

In [38]:
# Fit the vectorizer on the training dicts and encode them in the same call.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [44]:
# Number of feature columns produced by the vectorizer.
X_train.shape[-1]

515

In [56]:
# Target: trip duration in minutes.
# `dt.total_nanoseconds()` is independent of the column's stored time unit,
# whereas casting the Duration to Int64 yields the raw count in whatever unit
# the column uses (ns/us/ms) and silently mis-scales the target when that unit
# is not nanoseconds.
# NOTE: `select(...).to_numpy()` returns a 2-D array of shape (n_rows, 1).
y_train = filtered_df.select(
    pl.col("duration").dt.total_nanoseconds() / 10**9 / 60
).to_numpy()

In [None]:
# Ordinary least squares on the encoded training matrix; `fit` returns the
# estimator itself, so construction and fitting can be chained.
lin_reg_model = LinearRegression().fit(X_train, y_train)

# In-sample predictions, used for the training error.
y_pred = lin_reg_model.predict(X_train)

In [59]:
# Training RMSE, in the same unit as the target (minutes).
root_mean_squared_error(y_true=y_train, y_pred=y_pred)

7.649261824389368

In [65]:
def read_dataframe(
    file_path: Path,
    categorical: tuple[str, ...] = ("PULocationID", "DOLocationID"),
    min_minutes: int = 1,
    max_minutes: int = 60,
) -> tuple[list[dict[str, str]], np.ndarray]:
    """Load a trip parquet file and return model-ready features and target.

    The trip duration is computed as ``tpep_dropoff_datetime`` minus
    ``tpep_pickup_datetime``; rows whose duration falls outside
    ``[min_minutes, max_minutes]`` (inclusive) are dropped.

    Args:
        file_path: Path of the parquet file to read.
        categorical: Columns to expose as features. They are cast to strings
            so that ``DictVectorizer`` one-hot encodes them.
        min_minutes: Shortest trip duration to keep, in minutes.
        max_minutes: Longest trip duration to keep, in minutes.

    Returns:
        A pair ``(X_dicts, y)``: ``X_dicts`` is a list with one dict per kept
        row, mapping each categorical column name to its string value; ``y``
        is a float array of shape ``(n_rows, 1)`` with the duration in minutes.
    """
    feature_cols = list(categorical)

    trips = pl.read_parquet(file_path).with_columns(
        duration=pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime")
    )
    trips = trips.filter(
        pl.col("duration") >= pl.duration(minutes=min_minutes),
        pl.col("duration") <= pl.duration(minutes=max_minutes),
    ).with_columns(pl.col(feature_cols).cast(pl.String))

    X_dicts = trips.select(pl.col(feature_cols)).to_dicts()
    # total_nanoseconds() does not depend on the column's stored time unit; a
    # raw Int64 cast would give micro-/milliseconds for files stored that way
    # and silently mis-scale the target.
    y = trips.select(
        pl.col("duration").dt.total_nanoseconds() / 10**9 / 60
    ).to_numpy()
    return X_dicts, y

In [66]:
# Training data comes from the 2023-01 file, validation data from 2023-02.
train_dicts, y_train = read_dataframe(
    file_path=Path("./data/yellow_tripdata_2023-01.parquet")
)
val_dicts, y_val = read_dataframe(
    file_path=Path("./data/yellow_tripdata_2023-02.parquet")
)

In [67]:
# Fit the vectorizer on the training dicts only; the validation dicts are then
# encoded with that same fitted mapping via `transform`.
# NOTE: `dv` and `X_train` are rebound here, so a model fitted before this
# cell was trained on the matrix of the previous vectorizer instance.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [68]:
# Re-fit on the current X_train / y_train so the model's coefficients are
# guaranteed to line up with X_val's columns, instead of relying on a model
# left in the kernel from a fit against an earlier vectorizer's matrix.
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train)

# Validation RMSE, in the same unit as the target (minutes).
y_pred = lin_reg_model.predict(X_val)
root_mean_squared_error(y_val, y_pred)

7.811822570922324