In [22]:
import pandas as pd
import numpy as np
from py_files.data_manager import get_X_y
from py_files.features import distance, generate_features
from config import data_path

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import time
import xgboost as xgb
# Render floats with six decimal places in every pandas display for this session.
pd.options.display.float_format = '{:.6f}'.format

In [18]:
# Load X and y through the project's get_X_y helper.
# NOTE(review): get_X_y(force_clean=True) is the alternative call used further
# down in this notebook; presumably it forces the cleaning step to run again --
# confirm in py_files.data_manager.
X, y = get_X_y()
# Rebind X to the output of generate_features, which receives both X and y.
X = generate_features(X, y)

In [19]:
# Remove the raw pickup timestamp column before modelling.
# errors='ignore' keeps this cell safe to re-run: X is rebound in place, so a
# second execution would otherwise raise KeyError on the already-dropped column.
X = X.drop(columns=['pickup_datetime'], errors='ignore')

In [20]:
def xgboost_model(X, y, test_size=0.2, random_state=42):
    """Fit an XGBoost regressor on a random split of (X, y) and report hold-out RMSE.

    Parameters
    ----------
    X : table of features accepted by ``train_test_split`` and ``XGBRegressor.fit``.
    y : target values aligned row-for-row with ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int or None, default 42
        Seed for the train/test split so repeated calls give the same score.
        Pass ``None`` to get an unseeded split.

    Returns
    -------
    dict
        ``{'RMSE': <float>}`` computed on the held-out rows, in the units of ``y``.
    """
    # Train test split (seeded so the reported score is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # XGBoost with fixed hyper-parameters
    xgb_model = xgb.XGBRegressor(
        booster='gbtree',
        n_estimators=100,
        learning_rate=0.01,
        max_depth=10,
        alpha=0.1,
    )

    # Fit
    xgb_model.fit(X_train, y_train)

    # Validate
    y_pred = xgb_model.predict(X_test)

    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root explicitly works on every
    # version and is identical for a single-column target.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    return {'RMSE': rmse}

In [23]:
# Fit and score once on the full X and the untransformed y; the cell displays
# the returned {'RMSE': ...} dict.
xgboost_model(X, y)

{'RMSE': 603.7398110737926}

In [3]:
# 80/20 split with a fixed seed. The target is passed through np.log here, so
# y_train / y_test (and any error computed against them) are on the log scale.
# NOTE(review): np.log yields -inf for 0 and NaN for negative values -- assumes
# y is strictly positive; confirm against the cleaning step.
X_train, X_test, y_train, y_test = train_test_split(X, np.log(y), test_size=0.2, random_state=42)

In [5]:
# Subsample the training split down to at most `num_samples` rows.
num_samples = 10000

# A seeded generator draws the same rows on every run (the global np.random
# state is unseeded in this notebook). The size is capped at the rows available
# so the cell does not raise ValueError on a split smaller than `num_samples`.
random_indices = np.random.default_rng(42).choice(
    len(X_train), size=min(num_samples, len(X_train)), replace=False
)

# Positional indexing with the same indices keeps X_train and y_train aligned.
X_train = X_train.iloc[random_indices]
y_train = y_train.iloc[random_indices]

In [6]:
# Sanity check: number of rows currently in X_train.
len(X_train)

10000

In [15]:
# get the X and y, and add the features
X, y = get_X_y(force_clean=True)
feature_X = generate_features(X, y)

# drop the raw pickup datetime column before fitting; the model input is
# expected to be numeric
feature_X = feature_X.drop(columns=['pickup_datetime'])

# get the X and y for training, both re-indexed 0..n-1 so that row labels can be
# used to look up the matching targets after sampling
# NOTE(review): assumes generate_features keeps the row count and order of X --
# confirm, otherwise the two reset indexes no longer line up.
# NOTE(review): y is used untransformed here, whereas the earlier
# train_test_split cell uses np.log(y) -- confirm which target scale is intended.
X_train = feature_X.reset_index(drop=True)
y_train = y.reset_index(drop=True)

# to speed up the grid search, keep only the first `sample_per_class` rows of
# each distinct avg_cluster_duration value (used here as the identifier of a
# cluster-to-cluster pair)
sample_per_class = 3
df = X_train.sort_values(by='avg_cluster_duration')

dfs = []
for _ in range(sample_per_class):
    # a row is the "first" of its group when its value differs from the row above
    firsts = df['avg_cluster_duration'] != df['avg_cluster_duration'].shift(1)
    dfs.append(df.loc[firsts].copy())
    # remove the rows just taken so the next pass exposes each group's next row
    df = df.loc[~firsts].copy()

# combine all of the representative samples
final_df = pd.concat(dfs, axis=0).sort_values('avg_cluster_duration')

# shuffle the data so that it is no longer sorted by avg_cluster_duration;
# the seed keeps the row order (and therefore the CV folds) reproducible
X_train = final_df.sample(frac=1, random_state=42)
y_train = y_train.loc[X_train.index]

In [11]:
# Candidate hyper-parameters: two values for each of five settings (32 combinations)
param_grid = dict(
    booster=['gbtree', 'dart'],
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05],
    max_depth=[10, 20],
    alpha=[0.1, 0.5],
)

# Base estimator left at library defaults; the grid supplies the tuned values
xgb_model = xgb.XGBRegressor()

# Exhaustive search with 3-fold cross-validation, scored by negative RMSE
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the best-scoring combination
print('Best parameters from grid search: ', grid_search.best_params_)


Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best parameters from grid search:  {'alpha': 0.1, 'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
