In [1]:
# Compare Algorithms
import yellowcab
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, PolynomialFeatures, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn import metrics

In [2]:
# Load the trip dataset through the project's I/O helper.
# 'parquet' is the helper's only argument here -- presumably the file format
# to read; confirm against yellowcab.io.read_all_files.
trip_data = yellowcab.io.read_all_files('parquet')

In [3]:
# Temporary workaround: keep only trips with a strictly positive distance.
# This cell can be removed after the associated issue is fixed.
trip_data = trip_data.loc[trip_data["trip_distance"].gt(0)]

In [4]:
# Target column the models are trained to predict.
predict = ["trip_distance"]

# Columns handed to the training pipeline as numerical inputs.
numerical_features = [
    "passenger_count",
    "start_month",
    "start_day",
    "start_hour",
    "start_week",
    "start_location_long",
    "start_location_lat",
]

# Columns handed to the training pipeline as categorical inputs.
categorical_features = [
    "PULocationID",
    "PUBorough",
    "PUservice_zone",
    "weekend",
    "weekday",
]

# Columns selected from trip_data for modelling. The list still contains the
# target column; it is dropped again when the feature matrix X is built.
known = [
    "passenger_count",
    "trip_distance",
    "PULocationID",
    "start_month",
    "start_day",
    "start_hour",
    "start_week",
    "weekend",
    "weekday",
    "start_location_long",
    "start_location_lat",
    "PUBorough",
    "PUservice_zone",
]

In [5]:
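# Optional quick-run switch: uncomment to work on a 100-row random sample.
# NOTE: the sample is unseeded; pass random_state=... if the subset must be repeatable.
# trip_data = trip_data.sample(100)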

In [6]:
# Feature matrix: the selected columns with the target column removed.
X = trip_data.loc[:, known].drop(predict, axis=1)
# Target, kept as a single-column DataFrame because `predict` is a list.
y = trip_data.loc[:, predict]

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### MLP regressor (MLPR)

In [8]:
# Build the training pipeline around a multi-layer perceptron regressor.
# degree=1 is forwarded to the project's training_pipe helper (presumably the
# polynomial feature degree -- confirm in yellowcab.model).
# random_state is pinned so weight initialisation and batch shuffling are
# repeatable, in line with the seeded train/test split (random_state=42).
mlp = yellowcab.model.training_pipe(
    numerical_features,
    categorical_features,
    degree=1,
    regressor=MLPRegressor(verbose=True, random_state=42),
)

In [9]:
# Wall-clock timestamp taken before the MLP fit; compare with the one printed
# after it to get the training duration.
print(datetime.now())

2021-06-14 00:11:38.302841


In [10]:
# Flatten the single-column target frame to a 1-D array before fitting.
mlp = mlp.fit(X_train, np.ravel(y_train.to_numpy()))

Iteration 1, loss = 5.43363324
Iteration 2, loss = 5.38129239
Iteration 3, loss = 5.37183425
Iteration 4, loss = 5.36633096
Iteration 5, loss = 5.36131347
Iteration 6, loss = 5.35689601
Iteration 7, loss = 5.35381643
Iteration 8, loss = 5.35142261
Iteration 9, loss = 5.34888317
Iteration 10, loss = 5.34758280
Iteration 11, loss = 5.34745270
Iteration 12, loss = 5.34609568
Iteration 13, loss = 5.34510373
Iteration 14, loss = 5.34460938
Iteration 15, loss = 5.34408962
Iteration 16, loss = 5.34386627
Iteration 17, loss = 5.34308738
Iteration 18, loss = 5.34263319
Iteration 19, loss = 5.34257556
Iteration 20, loss = 5.34236386
Iteration 21, loss = 5.34229219
Iteration 22, loss = 5.34209162
Iteration 23, loss = 5.34201297
Iteration 24, loss = 5.34168680
Iteration 25, loss = 5.34160808
Iteration 26, loss = 5.34130070
Iteration 27, loss = 5.34129378
Iteration 28, loss = 5.34141071
Iteration 29, loss = 5.34115927
Iteration 30, loss = 5.34083783
Iteration 31, loss = 5.34084784
Iteration 32, los

In [11]:
# Wall-clock timestamp taken after the MLP fit has finished.
print(datetime.now())

2021-06-14 06:36:40.195518


In [12]:
# Score the fitted MLP pipeline on both the training and the test split.
mlp_result = yellowcab.model.test_regression_model(
    mlp, X_train, X_test, y_train, y_test
)

In [13]:
# Show the metrics table returned by test_regression_model for the MLP.
mlp_result

Unnamed: 0,Training set,Test set
RSME,3.264188,3.250417
MAE,2.012045,2.0121


In [14]:
# Persist the fitted MLP pipeline under the name "s1_trip_distance_mlp".
# Storage location and file format are decided by yellowcab.io.save_model.
yellowcab.io.save_model("s1_trip_distance_mlp", mlp)

### Lasso with degree 4 and alpha 0.001

In [11]:
# Build the training pipeline around a Lasso regressor (alpha=0.001), with
# degree=4 forwarded to the project's training_pipe helper.
lasso = yellowcab.model.training_pipe(
    numerical_features,
    categorical_features,
    degree=4,
    regressor=Lasso(alpha=0.001),
)

In [12]:
# Wall-clock timestamp taken before the Lasso fit.
print(datetime.now())

2021-06-14 14:36:14.669834


In [None]:
# Fit the Lasso pipeline. The target is flattened to a 1-D array, the same way
# the MLP cell passes it, so both models are trained on an identically shaped y.
lasso = lasso.fit(X_train, y_train.to_numpy().ravel())
# Alias for the original (misspelled) name so any cell that still refers to
# `lasoo` keeps working.
lasoo = lasso

In [None]:
# Wall-clock timestamp taken after the Lasso fit has finished.
print(datetime.now())

In [None]:
# Score the fitted Lasso pipeline on both the training and the test split.
# `lasso` is used directly: fit() follows the scikit-learn convention of
# fitting in place and returning the same object, so it is the fitted model.
lasso_result = yellowcab.model.test_regression_model(
    lasso, X_train, X_test, y_train, y_test
)

### Random forest regressor (RFR)

In [None]:
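# NOTE: the random-forest experiment is currently disabled; uncomment this cell
# and the remaining cells of this section to run it.
# random_forest = yellowcab.model.training_pipe(numerical_features, categorical_features, degree=1, regressor=RandomForestRegressor(max_depth=100))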

In [None]:
# print(datetime.now())

In [None]:
# random_forest = random_forest.fit(X_train, y_train)

In [None]:
# print(datetime.now())

In [None]:
# forest_result = yellowcab.model.test_regression_model(random_forest, X_train, X_test, y_train, y_test)

In [None]:
# forest_result