# XGBoost Kaggle Masters

# Loading Libraries

In [4]:
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# DateTime Library
import datetime as dt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

# Datasets
from sklearn import datasets
from sklearn.datasets import load_diabetes

# Model Metrics
from sklearn.metrics import mean_squared_error as MSE, accuracy_score, confusion_matrix, classification_report, recall_score

#Extreme Gradient Boosting
from xgboost import XGBRegressor, XGBRFRegressor
from xgboost import XGBClassifier, XGBRFClassifier

# Warnings
import warnings

# Timing
import time
import datetime as dt

#### Loading Uber & Lyft Data

In [9]:
df = pd.read_csv('cab_rides.csv', nrows=10000)
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL


In [10]:
# Setting a 'date' column
df['date'] = pd.to_datetime(df['time_stamp']*(10**6))
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223


In [14]:
# Since 'date' is already added, let's build upon it 'month, hour, dayofweek'

# Moth
df['month'] = df['date'].dt.month
# Hour
df['hour'] = df['date'].dt.hour
# Day of the week
df['dayofweek'] = df['date'].dt.dayofweek

In [15]:
# Checking on weekend days

def weekend(row):
    if row['dayofweek'] in [5, 6]:
        return 1
    else:
        return 0

In [16]:
df['weekend'] = df.apply(weekend, axis=1)

In [20]:
# Checking on Rush Hour

def rush_hour(row):
    if (row['hour'] in [6, 7 ,8 ,9, 15, 16, 17, 18]) & (row['weekend'] == 0):
        return 1
    else:
        return 0

In [21]:
df['roush_hour'] = df.apply(rush_hour, axis=1)

In [22]:
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,roush_hour
9995,3.05,Uber,1543504379037,Fenway,North Station,11.5,1.0,934d2fbe-f978-4495-9786-da7b4dd21107,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,2018-11-29 15:12:59.037,11,15,3,0,1
9996,3.05,Uber,1543800477997,Fenway,North Station,26.0,1.0,af8fd57c-fe7c-4584-bd1f-beef1a53ad42,6c84fd89-3f11-4782-9b50-97c468b19529,Black,2018-12-03 01:27:57.997,12,1,0,0,0
9997,3.05,Uber,1543407083241,Fenway,North Station,19.5,1.0,b3c5db97-554b-47bf-908b-3ac880e86103,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-11-28 12:11:23.241,11,12,2,0,0
9998,3.05,Uber,1544896813623,Fenway,North Station,36.5,1.0,fcb35184-9047-43f7-8909-f62a7b17b6cf,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,2018-12-15 18:00:13.623,12,18,5,1,0
9999,2.03,Lyft,1543812781166,Theatre District,Northeastern University,7.0,1.0,7f0e8caf-e057-41eb-bdef-27eb14c88122,lyft_line,Shared,2018-12-03 04:53:01.166,12,4,0,0,0


#### Engineering Frequency Columns

In [23]:
# Let's check frequency
df['cab_type'].value_counts()

cab_type
Uber    5427
Lyft    4573
Name: count, dtype: int64

In [24]:
df['cab_freq'] = df.groupby('cab_type')['cab_type'].transform('count')

In [25]:
df['cab_freq'] = df['cab_freq']/len(df)

In [26]:
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,roush_hour,cab_freq
9995,3.05,Uber,1543504379037,Fenway,North Station,11.5,1.0,934d2fbe-f978-4495-9786-da7b4dd21107,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,2018-11-29 15:12:59.037,11,15,3,0,1,0.5427
9996,3.05,Uber,1543800477997,Fenway,North Station,26.0,1.0,af8fd57c-fe7c-4584-bd1f-beef1a53ad42,6c84fd89-3f11-4782-9b50-97c468b19529,Black,2018-12-03 01:27:57.997,12,1,0,0,0,0.5427
9997,3.05,Uber,1543407083241,Fenway,North Station,19.5,1.0,b3c5db97-554b-47bf-908b-3ac880e86103,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-11-28 12:11:23.241,11,12,2,0,0,0.5427
9998,3.05,Uber,1544896813623,Fenway,North Station,36.5,1.0,fcb35184-9047-43f7-8909-f62a7b17b6cf,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,2018-12-15 18:00:13.623,12,18,5,1,0,0.5427
9999,2.03,Lyft,1543812781166,Theatre District,Northeastern University,7.0,1.0,7f0e8caf-e057-41eb-bdef-27eb14c88122,lyft_line,Shared,2018-12-03 04:53:01.166,12,4,0,0,0,0.4573


#### Mean Encoding - TargetEncoder

In [27]:
from category_encoders.target_encoder import TargetEncoder

In [28]:
# Encoder Initialization
encoder = TargetEncoder()

In [29]:
df['cab_type_mean'] = encoder.fit_transform(df['cab_type'], df['price'])

In [30]:
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,roush_hour,cab_freq,cab_type_mean
9995,3.05,Uber,1543504379037,Fenway,North Station,11.5,1.0,934d2fbe-f978-4495-9786-da7b4dd21107,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,2018-11-29 15:12:59.037,11,15,3,0,1,0.5427,15.743446
9996,3.05,Uber,1543800477997,Fenway,North Station,26.0,1.0,af8fd57c-fe7c-4584-bd1f-beef1a53ad42,6c84fd89-3f11-4782-9b50-97c468b19529,Black,2018-12-03 01:27:57.997,12,1,0,0,0,0.5427,15.743446
9997,3.05,Uber,1543407083241,Fenway,North Station,19.5,1.0,b3c5db97-554b-47bf-908b-3ac880e86103,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-11-28 12:11:23.241,11,12,2,0,0,0.5427,15.743446
9998,3.05,Uber,1544896813623,Fenway,North Station,36.5,1.0,fcb35184-9047-43f7-8909-f62a7b17b6cf,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,2018-12-15 18:00:13.623,12,18,5,1,0,0.5427,15.743446
9999,2.03,Lyft,1543812781166,Theatre District,Northeastern University,7.0,1.0,7f0e8caf-e057-41eb-bdef-27eb14c88122,lyft_line,Shared,2018-12-03 04:53:01.166,12,4,0,0,0,0.4573,16.916357


## Building Non-Correlated Ensembles

#### Range of Models

In [31]:
from sklearn.datasets import load_breast_cancer

In [32]:
# Setting Target & Predictors up
X, y = load_breast_cancer(return_X_y=True)

# Setting consistency through cross-validation
kfold = StratifiedKFold(n_splits=5)

In [33]:
# Classification function set-up
def classification_model(model):
    scores = cross_val_score(model, X, y, cv=kfold)
    return scores.mean()

In [41]:
# Scores with XGBoost
classification_model(XGBClassifier())

0.9701288619779538

In [42]:
# Score with gblinear
classification_model(XGBClassifier(booster='gblinear'))

0.9034777208507997

In [43]:
# Score with 'dart'
classification_model(XGBClassifier(booster='dart'))

0.9701288619779538

In [44]:
# Score with Random Forest
classification_model(RandomForestClassifier(random_state=2))

0.9666356155876418

In [45]:
# Score with Logistic Regression
classification_model(LogisticRegression(max_iter=10000))

0.9507995652848935

In [46]:
classification_model(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1))

0.9683589504735289

#### Correlation

In [56]:
# M.L model finder function
def y_pred(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_pred, y_test)
    print(score)
    return y_pred

In [57]:
# One-fold predictions on
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [58]:
# 'y_pred' on
y_pred_gbtree = y_pred(XGBClassifier())

0.9440559440559441


In [59]:
# 'y_pred' on
y_pred_dart = y_pred(XGBClassifier(booster='dart', one_drop=True))

0.958041958041958


In [60]:
# On Random Forest
y_pred_forest = y_pred(RandomForestClassifier())

0.9440559440559441


In [61]:
# On Logistic Regression
y_pred_logistic = y_pred(LogisticRegression(max_iter=10000))

0.9370629370629371


In [62]:
# On Tune XGBClassifier
y_pred_xgb = y_pred(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1))

0.965034965034965


##### Predictions concatenation

In [63]:
df_pred = pd.DataFrame(data=np.c_[y_pred_gbtree,
                       y_pred_dart,
                       y_pred_forest,
                       y_pred_logistic, 
                       y_pred_xgb], columns=['gbtree', 'dart', 'forest', 'logistic', 'xgb'])

In [64]:
df_pred.corr()

Unnamed: 0,gbtree,dart,forest,logistic,xgb
gbtree,1.0,0.970994,0.942069,0.927777,0.927384
dart,0.970994,1.0,0.971247,0.927777,0.956473
forest,0.942069,0.971247,1.0,0.955936,0.956215
logistic,0.927777,0.927777,0.955936,1.0,0.941715
xgb,0.927384,0.956473,0.956215,0.941715,1.0


### The Voting Classifier Emsemble

In [70]:
from sklearn.ensemble import VotingClassifier

# Initialization
estimators = []

In [71]:
# Model Initialization
logistic_model = LogisticRegression(max_iter=10000)

In [72]:
# Append the model
estimators.append(('logistic', logistic_model))

In [73]:
# Same procedure on XGBClassifier & Random Forest Classifier

xgb_model = XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1)
estimators.append(('xgb', xgb_model))

rf_model = RandomForestClassifier(random_state=2)
estimators.append(('rf', rf_model))

In [74]:
# Invoking Voting Classifier
ensemble = VotingClassifier(estimators)

In [76]:
scores = cross_val_score(ensemble, X, y, cv=kfold)
print(scores.mean())

0.9771619313771154


## Stacking Models Procedure

#### Stacking in Scikit-Learn

In [80]:
from sklearn.ensemble import StackingClassifier

# Base model empty list
base_models = []

In [81]:
# Appending all the 'Base Models' on

# LogisticRegression
base_models.append(('lr', LogisticRegression()))
# XGBClassifier
base_models.append(('xgb', XGBClassifier()))
# RandomForestClassifier
base_models.append(('rf', RandomForestClassifier(random_state=2)))

In [85]:
# Meta-Model
meta_model = LogisticRegression(max_iter=10000)

In [86]:
# Stacking Initalization
clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)

In [87]:
# Applying Cross-Validation
scores = cross_val_score(clf, X, y, cv=kfold)
print(scores.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9806862288464524
