In [43]:
from yellowcab import io, model

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

# Load and enrich data

In [44]:
df_raw = io.read_all_files('parquet')

In [45]:
df_raw = df_raw.sample(1000000)

In [46]:
df_raw = io.add_weather_data(df_raw)
df_raw = io.add_lockdown(df_raw)

In [47]:
X = df_raw[['tip_amount',
             'congestion_surcharge',
             'DOLocationID',
             'PULocationID',
             'total_amount',
             'Temperature',
             'lockdown']]
y = df_raw[['payment_type']].astype(int)

In [48]:
y.groupby(['payment_type']).size()

payment_type
1    728218
2    265427
3      4814
4      1723
dtype: int64

# Preprocessing Pipeline

In [49]:
numeric_features = ['tip_amount', 'congestion_surcharge', 'total_amount', 'Temperature', 'lockdown']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())])

categorical_features = ['DOLocationID', 'PULocationID']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Train-test-split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
y_train = y_train.to_numpy().ravel()

# Perform Grid Search

In [64]:
%%time
model_core = RandomForestClassifier(random_state=123,  n_jobs=-1)
# parameters_old = {'n_estimators': (10, 50, 100, 200, 500, 1000),
#               'max_depth': (1, 2, 5, 10, 20, 50, 100, 200),
#               'min_samples_split': (1, 2, 5, 10, 50, 100, 500, 1000),
#               'min_samples_leaf': (1, 2, 5, 10, 50, 100, 500, 1000),
#               }
parameters = {'n_estimators': (20, 50, 100),
              'max_depth': (50, 100, 200),
              'min_samples_split': (1, 2, 5),
              'min_samples_leaf': (1, 2, 5),
              }
gridsearch_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('classifier', GridSearchCV(model_core,
                                                              parameters,
                                                              n_jobs=-1,
                                                              cv=5,
                                                              verbose=2))])
gridsearch_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Wall time: 30min 10s


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['tip_amount',
                                                   'congestion_surcharge',
                                                   'total_amount',
                                                   'Temperature', 'lockdown']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['DOLocationID',
                                                   'PULocationID']

# Evaluate Best Estimators

In [65]:
p_test = gridsearch_pipe.predict(X_test)
print(classification_report(y_test, p_test))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       1.00      0.96      0.98    145484
           2       0.89      0.99      0.94     53249
           3       0.00      0.00      0.00       950
           4       0.00      0.00      0.00       354

    accuracy                           0.97    200037
   macro avg       0.47      0.49      0.48    200037
weighted avg       0.96      0.97      0.96    200037



# Get best Parameters

In [66]:
# Results from a detailed comparison
gridsearch_pipe['classifier'].best_params_

{'min_samples_leaf': 2}

### Save model in case

In [67]:
io.save_model('payment_type_rf_tuned', gridsearch_pipe)