In [1]:
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import torch

In [2]:
# Load the Parquet file through pyarrow and convert the resulting table
# into a pandas DataFrame; the bare `df` at the end renders it in the notebook.
df = (
    pq.read_table('taxi_trips.parquet')
    .to_pandas()
)
df

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,min_of_day,day_of_week,day_of_year
187173,0.270744,1.425051,0.466831,0.776192,-0.007192,-0.942057,-1.047619,1.180855
179655,0.085681,0.306079,-0.226274,0.124111,0.370803,0.778703,-1.047619,-0.311290
120048,-0.555533,-0.934806,-0.022489,-0.548561,-0.434735,-1.183016,-1.047619,1.316504
62587,0.069718,0.035804,0.258900,0.317284,-0.811614,-0.908008,0.481657,0.018144
212063,-0.234482,0.074616,0.115836,0.258310,0.480548,1.066806,-0.537860,0.521985
...,...,...,...,...,...,...,...,...
519591,-0.853230,-0.623528,-1.257638,-0.858561,-0.496274,1.548723,0.991416,1.665317
342999,0.623131,0.373217,-0.558609,-0.119350,0.789244,-1.714696,0.481657,-1.745299
157477,-0.226796,-0.595202,-0.527212,-1.051594,0.081328,-1.976608,1.501174,0.056901
432901,-0.226796,0.056775,1.005321,-1.571717,1.210403,1.111331,-0.028101,0.812663


In [3]:
# Regression target: the `trip_duration` column, taken as a Series.
y = df.loc[:, 'trip_duration']
y

187173   -0.007192
179655    0.370803
120048   -0.434735
62587    -0.811614
212063    0.480548
            ...   
519591   -0.496274
342999    0.789244
157477    0.081328
432901    1.210403
190792    0.140342
Name: trip_duration, Length: 523554, dtype: float32

In [4]:
# Feature matrix: every column of `df` except the target column.
X = df.drop('trip_duration', axis='columns')
X

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,min_of_day,day_of_week,day_of_year
187173,0.270744,1.425051,0.466831,0.776192,-0.942057,-1.047619,1.180855
179655,0.085681,0.306079,-0.226274,0.124111,0.778703,-1.047619,-0.311290
120048,-0.555533,-0.934806,-0.022489,-0.548561,-1.183016,-1.047619,1.316504
62587,0.069718,0.035804,0.258900,0.317284,-0.908008,0.481657,0.018144
212063,-0.234482,0.074616,0.115836,0.258310,1.066806,-0.537860,0.521985
...,...,...,...,...,...,...,...
519591,-0.853230,-0.623528,-1.257638,-0.858561,1.548723,0.991416,1.665317
342999,0.623131,0.373217,-0.558609,-0.119350,-1.714696,0.481657,-1.745299
157477,-0.226796,-0.595202,-0.527212,-1.051594,-1.976608,1.501174,0.056901
432901,-0.226796,0.056775,1.005321,-1.571717,1.111331,-0.028101,0.812663


In [5]:
# Partition the data into training, validation and test subsets.
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
# Stage 2: carve the validation set out of the remaining 80%.
# 10/80 of the remainder corresponds to 10% of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=10.0/80.0,
    random_state=1,
)

# Report each subset's shape and its share of the full data set.
print('shapes:\n- - - -')
print(
    *(
        f'x_{part}: {feats.shape}\t y_{part}: {target.shape}\t frac: {target.shape[0]/y.shape[0]*100:.2f}%'
        for part, feats, target in (
            ('train', X_train, y_train),
            ('test', X_test, y_test),
            ('val', X_val, y_val),
        )
    ),
    sep='\n',
)

shapes:
- - - -
x_train: (366487, 7)	 y_train: (366487,)	 frac: 70.00%
x_test: (104711, 7)	 y_test: (104711,)	 frac: 20.00%
x_val: (52356, 7)	 y_val: (52356,)	 frac: 10.00%


In [6]:
# Baseline: a plain linear regression predicting y from X.
from sklearn.linear_model import LinearRegression

# Build the estimator, then fit it on the training subset.
model_baseline = LinearRegression()
model_baseline.fit(X_train, y_train)

# Predictions on the test subset, used to score the baseline.
y_pred_baseline = model_baseline.predict(X_test)

In [7]:
from torcheval.metrics.functional import r2_score

# R^2 of the baseline on the test subset: predictions are passed as `input`,
# ground-truth values as `target`.
r2_score(
    input=torch.tensor(y_pred_baseline),
    target=torch.tensor(y_test.to_numpy()),
)

tensor(0.0189)

In [9]:
# loss fn and NN model
import torch.nn as nn

class Loss(nn.Module):
    """Gaussian negative log-likelihood (additive constant dropped).

    The network output ``g_vec`` carries two values per sample:

    * column 0 is used directly as the predicted mean ``mu``;
    * column 1 is an unconstrained value mapped to a positive standard
      deviation ``sigma`` through softplus, ``log(1 + exp(x))``.

    Per-sample loss: ``log(sigma) + 0.5 * (y - mu)**2 / sigma**2``.

    Args:
        eps: Small positive constant added to ``sigma`` so that neither
            ``log(sigma)`` nor the division by ``sigma**2`` degenerates when
            softplus underflows for very negative inputs.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, g_vec, y_true):
        """Return the mean loss over the batch.

        Args:
            g_vec: Tensor indexed as ``g_vec[:, 0]`` (mean) and
                ``g_vec[:, 1]`` (raw scale), i.e. at least two columns.
            y_true: Target values, one per sample. A trailing singleton
                dimension (shape ``(N, 1)``) is accepted and flattened.

        Returns:
            Scalar tensor holding the batch-averaged loss.
        """
        f_mu = g_vec[:, 0]
        # softplus is the numerically stable form of log(1 + exp(x)):
        # the naive expression overflows to inf for large x.
        f_sigma = nn.functional.softplus(g_vec[:, 1]) + self.eps
        # Flatten the targets so an (N, 1) column does not broadcast against
        # the (N,) mean vector into an (N, N) matrix of residuals.
        residual = y_true.reshape(-1) - f_mu
        L = torch.log(f_sigma) + 0.5 * residual**2 / f_sigma**2
        return L.mean()

class NNModel(nn.Module):
    """Fully connected network producing the two-column vector read by ``Loss``.

    The architecture is ``Linear -> ReLU -> Linear -> ReLU -> Linear``.

    Args:
        in_features: Number of input features per sample. Defaults to 7,
            the number of columns in the feature matrix ``X``.
        hidden_features: Width of both hidden layers.
        out_features: Number of outputs per sample. Defaults to 2 because
            ``Loss`` reads column 0 as the mean and column 1 as the raw
            (pre-softplus) scale.
    """

    def __init__(self, in_features=7, hidden_features=512, out_features=2):
        super().__init__()
        # Flatten leaves (batch, features) input unchanged and collapses any
        # extra trailing dimensions into one feature axis.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Map a batch of feature rows to a ``(batch, out_features)`` tensor."""
        x = self.flatten(x)
        g_vec = self.linear_relu_stack(x)
        return g_vec