In [None]:
import os
import pandas
import numpy as np
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Directory that holds the parquet files. It is read from the DATA_DIR
# environment variable; if the variable is unset this line raises KeyError.
source = Path(os.environ["DATA_DIR"])
# Shared vectorizer used by preprocess(): it is (re)fitted when preprocess is
# called with fit=True and reused unchanged when called with fit=False.
vect = DictVectorizer(sparse=True)


def load_data(file):
    """Load one parquet file from the data directory.

    Parameters
    ----------
    file : str or path-like
        File name, joined onto the module-level ``source`` directory.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_parquet`` returns for that path.
    """
    parquet_path = source / file
    return pandas.read_parquet(parquet_path)


def duration_to_min(X):
    """Add a ``duration`` column holding each trip's length in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` converted
    from seconds to minutes.

    Note: the column is assigned onto ``X`` itself, so the caller's frame is
    modified in place; the same object is returned for convenience.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
        datetime columns.

    Returns
    -------
    pandas.DataFrame
        ``X`` (the same object) with the extra ``duration`` column.
    """
    elapsed = X["tpep_dropoff_datetime"] - X["tpep_pickup_datetime"]
    X["duration"] = elapsed.dt.total_seconds() / 60
    return X


def preprocess(X, fit=True):
    """Encode the pickup and drop-off location IDs with the shared vectorizer.

    The ``PULocationID`` and ``DOLocationID`` columns are cast to strings and
    turned into one dict per row, which is then passed to the module-level
    ``vect`` (a ``DictVectorizer``).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the ``PULocationID`` and ``DOLocationID`` columns.
    fit : bool, default True
        When truthy, ``vect`` is fitted on these rows before transforming
        (``fit_transform``). When falsy, the already-fitted ``vect`` is only
        applied (``transform``) — use this for held-out data.

    Returns
    -------
    The matrix produced by ``vect.fit_transform`` / ``vect.transform``.
    """
    location_ids = X[["PULocationID", "DOLocationID"]].astype(str)
    records = location_ids.to_dict("records")
    encode = vect.fit_transform if fit else vect.transform
    return encode(records)

Read in the January data and count the number of columns.

In [None]:
# Find the January 2022 file(s) in the data directory. The names are sorted
# because os.listdir returns entries in arbitrary order, which would otherwise
# make the choice non-deterministic when more than one file matches.
january_files = sorted(name for name in os.listdir(source) if "2022-01" in name)
if not january_files:
    # Fail loudly instead of leaving X undefined (or silently reusing a stale
    # X from an earlier run of the notebook).
    raise FileNotFoundError(f"No file containing '2022-01' found in {source}")
# If several files match, the last one in sorted order is used.
X = load_data(january_files[-1])
# Number of columns in the raw January data; displayed as the cell output.
len(X.columns)

Compute each ride's duration in minutes and the standard deviation of the durations.

In [None]:
# Add the "duration" column (trip length in minutes) to the loaded frame.
X = duration_to_min(X)
# Standard deviation of the ride durations; displayed as the cell output.
X["duration"].std()

Duration statistics: Outliers

In [None]:
# Summarise the duration column, reporting the 10th through 90th percentiles
# in steps of 10 alongside describe()'s usual statistics, so that extreme
# values at both ends of the distribution are easy to spot.
percentiles = np.arange(0.1, 1, 0.1)
result = X["duration"].describe(percentiles=percentiles)

print(result)

In [None]:
# Remember the row count before filtering so the retained share can be shown.
n_total = len(X)
# Keep only rides whose duration lies between 1 and 60 minutes
# (Series.between includes both bounds by default).
X = X.loc[X["duration"].between(1, 60)]
# Fraction of rides that survive the filter; displayed as the cell output.
len(X) / n_total

In [None]:
# Encode the pickup/drop-off location IDs, fitting the shared vectorizer on
# the training rows.
Xt = preprocess(X, fit=True)
# Select the target instead of popping it: pop() removes the column from X,
# so running this cell a second time would fail with KeyError. preprocess()
# only reads the two location-ID columns, so keeping "duration" in X does not
# leak the target into the features.
y = X["duration"]

Number of columns

In [None]:
# Display the encoded training matrix. Presumably a SciPy sparse matrix (vect
# was built with sparse=True), whose repr reports its dimensions — the column
# count is the number of encoded features.
Xt

Train a linear regression model and predict on the training data.

In [None]:
# Fit a linear regression on the encoded training features; fit() returns the
# estimator itself, so construction and fitting can be chained.
rgs = LinearRegression().fit(Xt, y)
# In-sample predictions, used to measure the training error.
ypred = rgs.predict(Xt)

In [None]:
# RMSE of the current predictions against y. The square root is taken
# explicitly instead of passing `squared=False`: that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old call raises TypeError on
# recent versions. For a single-output target the value is identical.
np.sqrt(mean_squared_error(y, ypred))

Load in test data from February.

In [None]:
# Find the February 2022 file(s) in the data directory. The names are sorted
# because os.listdir returns entries in arbitrary order, which would otherwise
# make the choice non-deterministic when more than one file matches.
february_files = sorted(name for name in os.listdir(source) if "2022-02" in name)
if not february_files:
    # Fail loudly instead of leaving Xtest undefined (or silently reusing a
    # stale Xtest from an earlier run of the notebook).
    raise FileNotFoundError(f"No file containing '2022-02' found in {source}")
# If several files match, the last one in sorted order is used.
Xtest = load_data(february_files[-1])

In [None]:
# Apply the same steps as for the training data: compute the duration in
# minutes and keep only rides lasting between 1 and 60 minutes.
Xtest = duration_to_min(Xtest).query("duration.between(1, 60)", engine="python")
# Split off the target column.
ytest = Xtest.pop("duration")
# fit=False: reuse the vectorizer as fitted earlier rather than refitting it on
# the test rows.
# NOTE: Xtest is rebound from a DataFrame to the encoded matrix here, so this
# cell cannot be re-run without first reloading the February data.
Xtest = preprocess(Xtest, fit=False)

In [None]:
# Predict durations for the encoded test rows. This rebinds `ypred`, replacing
# whatever predictions it held before.
ypred = rgs.predict(Xtest)

Compute the RMSE against the test set.

In [None]:
# RMSE of the predictions against ytest. The square root is taken explicitly
# instead of passing `squared=False`: that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old call raises TypeError on
# recent versions. For a single-output target the value is identical.
np.sqrt(mean_squared_error(ytest, ypred))