# XGBoost Model Deployment

#### Loading Libraries

In [56]:
# Numerical Computing
import numpy as np
# Data Manipulation
# The alias must be the lowercase `pd`: every later cell calls pd.read_csv,
# pd.DataFrame, pd.options and pd.concat.
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# DateTime Library
import datetime as dt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

# Preprocessing
from sklearn.base import TransformerMixin
# Pipeline
from sklearn.pipeline import Pipeline

# Datasets
from sklearn import datasets
from sklearn.datasets import load_diabetes

# Model Metrics
from sklearn.metrics import mean_squared_error as MSE, accuracy_score, confusion_matrix, classification_report, recall_score

#Extreme Gradient Boosting
from xgboost import XGBRegressor, XGBRFRegressor
from xgboost import XGBClassifier, XGBRFClassifier

# Statistical & Math computing
from scipy.sparse import csr_matrix 
from scipy.sparse import hstack

# Warnings
import warnings

# Timing
import time
import datetime as dt

In [4]:
warnings.filterwarnings('ignore')

#### Loading Data

In [5]:
df = pd.read_csv('student-por.csv')
df.head()

Unnamed: 0,school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0,"GP;NaN;18;""U"";""GT3"";""A"";4;4;""at_home"";""teacher..."
1,"GP;""F"";NaN;""U"";""GT3"";""T"";1;1;""at_home"";""other""..."
2,"GP;""F"";15;""U"";""LE3"";""T"";1;1;""at_home"";""other"";..."
3,"GP;""F"";15;""U"";""GT3"";""T"";4;2;""health"";""services..."
4,"GP;""F"";16;""U"";""GT3"";""T"";3;3;""other"";""other"";""h..."


In [6]:
# The file is semicolon-delimited (not comma-delimited), so reload it with sep=';'
df = pd.read_csv('student-por.csv', sep=';')
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


#### Clearing Null-Values

In [7]:
df.isnull().sum()

school        0
sex           1
age           1
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      1
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [8]:
# Let's check on missing row
df[df.isna().any(axis=1)]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11


In [9]:
pd.options.display.max_columns = None

In [10]:
df[df.isna().any(axis=1)]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,course,,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11


In [11]:
# Setting Numerical Null values (-999.0)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# has no effect on `df` once copy-on-write is enabled.
df['age'] = df['age'].fillna(-999.0)

In [12]:
# Applying 'mode' on categorical columns
# mode() returns a Series, and fillna(<Series>) aligns on index labels, so it
# would only fill a null that happens to sit at label 0. Take the scalar with
# [0] so every null in the column is filled with the most frequent value.
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])
df['guardian'] = df['guardian'].fillna(df['guardian'].mode()[0])

In [13]:
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18.0,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,-999.0,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


#### One-Hot Encoding

In [14]:
# Switching categorical columns
categorical_columns = df.columns[df.dtypes==object].tolist()

In [15]:
# OneHotEncoder Initialization
ohe = OneHotEncoder()

In [16]:
# Fit Transform Method
hot = ohe.fit_transform(df[categorical_columns])

In [17]:
hot_df = pd.DataFrame(hot.toarray())
hot_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [18]:
print(hot)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 25)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 31)	1.0
  (0, 33)	1.0
  (0, 36)	1.0
  (0, 38)	1.0
  (0, 39)	1.0
  (0, 41)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 10)	1.0
  (1, 17)	1.0
  (1, 20)	1.0
  :	:
  (647, 27)	1.0
  (647, 29)	1.0
  (647, 31)	1.0
  (647, 33)	1.0
  (647, 35)	1.0
  (647, 38)	1.0
  (647, 40)	1.0
  (647, 41)	1.0
  (648, 1)	1.0
  (648, 3)	1.0
  (648, 4)	1.0
  (648, 7)	1.0
  (648, 9)	1.0
  (648, 13)	1.0
  (648, 17)	1.0
  (648, 20)	1.0
  (648, 25)	1.0
  (648, 27)	1.0
  (648, 29)	1.0
  (648, 31)	1.0
  (648, 33)	1.0
  (648, 35)	1.0
  (648, 38)	1.0
  (648, 40)	1.0
  (648, 41)	1.0


In [19]:
hot

<649x43 sparse matrix of type '<class 'numpy.float64'>'
	with 11033 stored elements in Compressed Sparse Row format>

#### Combining OHE Matrix & Numerical Columns

In [20]:
cold_df = df.select_dtypes(exclude=["object"])
cold_df.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,18.0,4,4,2,2,0,4,3,4,1,1,3,4,0,11,11
1,-999.0,1,1,1,2,0,5,3,3,1,1,3,2,9,11,11
2,15.0,1,1,1,2,0,4,3,2,2,3,3,6,12,13,12
3,15.0,4,2,1,3,0,3,2,2,1,1,5,0,14,14,14
4,16.0,3,3,1,2,0,4,3,2,1,2,5,0,11,13,13


In [22]:
cold = csr_matrix(cold_df)

In [23]:
final_sparse_matrix = hstack((hot, cold))

In [24]:
# final_sparse_matrix verification
final_df = pd.DataFrame(final_sparse_matrix.toarray())
final_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,18.0,4.0,4.0,2.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0,3.0,4.0,0.0,11.0,11.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,1.0,1.0,1.0,2.0,0.0,5.0,3.0,3.0,1.0,1.0,3.0,2.0,9.0,11.0,11.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,15.0,1.0,1.0,1.0,2.0,0.0,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,15.0,4.0,2.0,1.0,3.0,0.0,3.0,2.0,2.0,1.0,1.0,5.0,0.0,14.0,14.0,14.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,16.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,2.0,5.0,0.0,11.0,13.0,13.0


### Customizing Scikit-learn Transformers

#### Base Model Example

In [39]:
# class YourClass(TransformerMixin):
#     def __init__(self):
#         None
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X, y=None):
#         # Transformation Code Procedure
#         return X

#### Customizing a Mixed Null-Value Imputer

In [42]:
from sklearn.base import TransformerMixin

# Defining the class
class NullValueImputer(TransformerMixin):
    """Impute missing values in a DataFrame with mixed column types.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with the sentinel -999.0.

    The most frequent values are learned in ``fit`` and reused by
    ``transform``, so data transformed later (e.g. a test set) is imputed with
    the statistics of the data the imputer was fitted on. If ``transform`` is
    called on an instance that was never fitted, or on an object column that
    was not seen in ``fit``, the value is computed from the frame being
    transformed.
    """
    def __init__(self):
        pass

    @staticmethod
    def _most_frequent(series):
        """Return the first mode of `series`, or None if it has no non-null value."""
        modes = series.mode()
        # mode() is empty when every entry is null; there is nothing to fill with.
        return modes.iloc[0] if len(modes) else None

    # Fit Method
    def fit(self, X, y=None):
        """Learn the most frequent value of each object-dtype column of X.

        Returns self so calls can be chained (fit(...).transform(...)).
        """
        self.modes_ = {}
        for column in X.columns[X.dtypes == object].tolist():
            value = self._most_frequent(X[column])
            if value is not None:
                self.modes_[column] = value
        return self

    # Transform Method
    def transform(self, X, y=None):
        """Return a copy of X with nulls imputed; X itself is left untouched."""
        # Work on a copy so the caller's frame (often a slice such as X_train)
        # is not modified as a side effect.
        X = X.copy()
        learned_modes = getattr(self, 'modes_', {})
        # Computed once instead of once per column.
        object_columns = set(X.columns[X.dtypes == object].tolist())
        for column in X.columns.tolist():
            if column in object_columns:
                fill_value = learned_modes.get(column)
                if fill_value is None:
                    fill_value = self._most_frequent(X[column])
                # An all-null column with no learned mode is left as is.
                if fill_value is not None:
                    X[column] = X[column].fillna(fill_value)
            else:
                X[column] = X[column].fillna(-999.0)
        return X

##### Resetting the data

In [43]:
df = pd.read_csv('student-por.csv', sep=';')
nvi = NullValueImputer().fit_transform(df)
nvi.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18.0,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,-999.0,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


#### One-Hot Encoding Mixed Data

In [49]:
class SparseMatrix(TransformerMixin):
    """One-hot encode the object-dtype columns of a DataFrame and stack them
    with the remaining columns into a single CSR sparse matrix.

    The encoder is fitted in ``fit`` and reused in ``transform`` so that the
    output column layout is the same for every frame transformed afterwards,
    whatever categories that frame happens to contain. Categories not seen
    during ``fit`` are encoded as all zeros (``handle_unknown='ignore'``).

    Calling ``transform`` on an instance that was never fitted falls back to
    fitting a temporary encoder on the frame being transformed.
    """
    def __init__(self):
        pass

    @staticmethod
    def _fit_encoder(X):
        """Return (categorical column names, OneHotEncoder fitted on them)."""
        # Accessing cat columns
        categorical_columns = X.columns[X.dtypes == object].tolist()
        # Encoder Initialization
        ohe = OneHotEncoder(handle_unknown='ignore')
        ohe.fit(X[categorical_columns])
        return categorical_columns, ohe

    # Fit Method
    def fit(self, X, y=None):
        """Learn the categorical columns of X and fit the one-hot encoder on them."""
        self.categorical_columns_, self.ohe_ = self._fit_encoder(X)
        return self

    # Transform Method
    def transform(self, X, y=None):
        """Return X as a CSR matrix: one-hot columns first, then the other columns."""
        if hasattr(self, 'ohe_'):
            categorical_columns, ohe = self.categorical_columns_, self.ohe_
        else:
            # Never fitted: encode from X itself, without storing any state.
            categorical_columns, ohe = self._fit_encoder(X)
        # cat column into one-hot encode procedure
        hot = ohe.transform(X[categorical_columns])
        # Remaining (numerical) columns, in their original order
        cold_df = X.drop(columns=categorical_columns)
        # From Numerical to sparse df
        cold = csr_matrix(cold_df)
        # combining sparse matrices
        final_sparse_matrix = hstack((hot, cold))
        # Converting on Compressed Sparse Row(CSR)
        return final_sparse_matrix.tocsr()

In [52]:
# Transforming 'nvi' data
sm = SparseMatrix().fit_transform(nvi)
print(sm)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 25)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 31)	1.0
  (0, 33)	1.0
  (0, 36)	1.0
  (0, 38)	1.0
  (0, 39)	1.0
  (0, 41)	1.0
  (0, 43)	18.0
  (0, 44)	4.0
  (0, 45)	4.0
  (0, 46)	2.0
  (0, 47)	2.0
  (0, 49)	4.0
  (0, 50)	3.0
  (0, 51)	4.0
  :	:
  (648, 20)	1.0
  (648, 25)	1.0
  (648, 27)	1.0
  (648, 29)	1.0
  (648, 31)	1.0
  (648, 33)	1.0
  (648, 35)	1.0
  (648, 38)	1.0
  (648, 40)	1.0
  (648, 41)	1.0
  (648, 43)	18.0
  (648, 44)	3.0
  (648, 45)	2.0
  (648, 46)	3.0
  (648, 47)	1.0
  (648, 49)	4.0
  (648, 50)	4.0
  (648, 51)	1.0
  (648, 52)	3.0
  (648, 53)	4.0
  (648, 54)	5.0
  (648, 55)	4.0
  (648, 56)	10.0
  (648, 57)	11.0
  (648, 58)	11.0


In [53]:
sm_df = pd.DataFrame(sm.toarray())
sm_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,18.0,4.0,4.0,2.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0,3.0,4.0,0.0,11.0,11.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,1.0,1.0,1.0,2.0,0.0,5.0,3.0,3.0,1.0,1.0,3.0,2.0,9.0,11.0,11.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,15.0,1.0,1.0,1.0,2.0,0.0,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,15.0,4.0,2.0,1.0,3.0,0.0,3.0,2.0,2.0,1.0,1.0,5.0,0.0,14.0,14.0,14.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,16.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,2.0,5.0,0.0,11.0,13.0,13.0


#### Preprocessing Pipeline

In [54]:
df = pd.read_csv('student-por.csv', sep=';')

In [55]:
# Setting Predictors & Target
# Predictors: every column except the last three (G1, G2, G3).
X = df.iloc[:, :-3]
# Target: the last column (G3).
y = df.iloc[:, -1]

In [57]:
from sklearn.pipeline import Pipeline

# Splitting Procedure
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [58]:
# Building the Pipeline
data_pipeline = Pipeline([('null_imputer', NullValueImputer()), ('sparse', SparseMatrix())])

In [59]:
# Transforming Predictors
X_train_transformed = data_pipeline.fit_transform(X_train)

### Finalizing an XGBoost Model

#### 1st XGBoost Model

In [60]:
y_train.value_counts()

G3
11    82
10    75
13    58
12    53
14    42
15    36
9     29
16    27
8     26
17    24
18    14
0     10
7      7
6      1
19     1
5      1
Name: count, dtype: int64

In [62]:
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [63]:
# Setting Cross-Validation function
def cross_val(model):
    """Return the mean cross-validated RMSE of `model`.

    The model is scored on X_train_transformed / y_train using the shared
    `kfold` splitter.
    """
    fold_scores = cross_val_score(
        model,
        X_train_transformed,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=kfold,
    )
    # The scorer returns negated RMSE values; flip the sign of their mean.
    return -fold_scores.mean()

In [64]:
cross_val(XGBRegressor(missing=-999.0))

2.908377635848431

#### Fine-Tuning The XGBoost Hyperparameters

In [65]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train_transformed, y_train, random_state=2)

In [68]:
# Setting number of estimators function
def n_estimators(model):
    """Fit `model` with early stopping and return its RMSE on the hold-out split.

    The model is trained on X_train_2 / y_train_2 and monitored on
    X_test_2 / y_test_2; training stops once the validation RMSE has not
    improved for 100 consecutive rounds.

    Note: `eval_metric` and `early_stopping_rounds` are set on the estimator
    itself (and stay set on it afterwards). Passing them to fit() is
    deprecated since XGBoost 1.6 and no longer accepted from 2.0.
    """
    eval_set = [(X_test_2, y_test_2)]
    model.set_params(eval_metric="rmse", early_stopping_rounds=100)
    model.fit(X_train_2, y_train_2, eval_set=eval_set)
    y_pred = model.predict(X_test_2)
    rmse = MSE(y_test_2, y_pred)**0.5
    return rmse

In [69]:
# Running our function
n_estimators(XGBRegressor(n_estimators=5000, missing=-999.0))

[0]	validation_0-rmse:3.26707
[1]	validation_0-rmse:3.15484
[2]	validation_0-rmse:3.12696
[3]	validation_0-rmse:3.13076
[4]	validation_0-rmse:3.14379
[5]	validation_0-rmse:3.10464
[6]	validation_0-rmse:3.11326
[7]	validation_0-rmse:3.11758
[8]	validation_0-rmse:3.09789
[9]	validation_0-rmse:3.05602
[10]	validation_0-rmse:3.02894
[11]	validation_0-rmse:3.02499
[12]	validation_0-rmse:3.01871
[13]	validation_0-rmse:3.02014
[14]	validation_0-rmse:3.00191
[15]	validation_0-rmse:3.00476
[16]	validation_0-rmse:2.99178
[17]	validation_0-rmse:3.01080
[18]	validation_0-rmse:3.03240
[19]	validation_0-rmse:3.02538
[20]	validation_0-rmse:3.02407
[21]	validation_0-rmse:3.02788
[22]	validation_0-rmse:3.03996
[23]	validation_0-rmse:3.06268
[24]	validation_0-rmse:3.06337
[25]	validation_0-rmse:3.06214
[26]	validation_0-rmse:3.06271
[27]	validation_0-rmse:3.06177
[28]	validation_0-rmse:3.06308
[29]	validation_0-rmse:3.06381
[30]	validation_0-rmse:3.06354
[31]	validation_0-rmse:3.05531
[32]	validation_0-

2.9917835098025005

In [70]:
# Grid-Search parameters function
def grid_search(params, reg=XGBRegressor(missing=-999.0)):
    """Grid-search `params` for `reg` and print the best parameters and score.

    The search runs on X_train_transformed / y_train with the shared `kfold`
    splitter. The printed score is the square root of the best mean MSE.
    """
    searcher = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)
    searcher.fit(X_train_transformed, y_train)
    # Best Parameters
    print("Best params: ", searcher.best_params_)
    # Best Score: best_score_ is a negated MSE, so negate it before the root
    print("Best score: ", np.sqrt(-searcher.best_score_))

In [72]:
grid_search(params={'max_depth': [1, 2, 3, 4, 6, 7, 8], 'n_estimators': [31]})

Best params:  {'max_depth': 1, 'n_estimators': 31}
Best score:  2.6626365314010423


In [74]:
# Narrowing 'max_depth' from 1 to 3 & adding min child weight
grid_search(params={'max_depth': [1, 2, 3],
                    'min_child_weight': [1, 2, 3, 4, 5],
                    'n_estimators': [31]})

Best params:  {'max_depth': 1, 'min_child_weight': 5, 'n_estimators': 31}
Best score:  2.6619236302372933


In [76]:
# Adding subsample
grid_search(params={'max_depth': [1],
                    'min_child_weight': [5],
                    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
                    'n_estimators': [31]})

Best params:  {'max_depth': 1, 'min_child_weight': 5, 'n_estimators': 31, 'subsample': 0.9}
Best score:  2.6602357702849453


In [77]:
# Adding colsample_bytree
grid_search(params={'max_depth': [1],
                    'min_child_weight': [5],
                    'subsample': [0.9],
                    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
                    'n_estimators': [50]})

Best params:  {'colsample_bytree': 0.7, 'max_depth': 1, 'min_child_weight': 5, 'n_estimators': 50, 'subsample': 0.9}
Best score:  2.6685636273901996


In [78]:
# Adding colsample_bylevel & colsample_bynode
grid_search(params={'max_depth': [1],
                    'min_child_weight': [5],
                    'subsample': [0.9],
                    'colsample_bytree': [0.7],
                    'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
                    'colsample_bynode': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
                    'n_estimators': [50]})

Best params:  {'colsample_bylevel': 0.8, 'colsample_bynode': 0.6, 'colsample_bytree': 0.7, 'max_depth': 1, 'min_child_weight': 5, 'n_estimators': 50, 'subsample': 0.9}
Best score:  2.655251367845663


### Testing Model

In [79]:
# The pipeline was already fitted on X_train; only transform the test set so
# that no preprocessing step is refitted on test data.
X_test_transformed = data_pipeline.transform(X_test)

In [84]:
# Model Initialization
# These hyperparameters are set by hand; they differ from the best
# parameters printed by the grid searches above.
model = XGBRegressor(max_depth=2, min_child_weight=3, subsample=0.9, colsample_bytree=0.8, gamma=2, missing=-999.0)
# Fitting Model
model.fit(X_train_transformed, y_train)

In [85]:
# Model Prediction
y_pred = model.predict(X_test_transformed)

In [86]:
# RMSE on the test set; arguments follow mean_squared_error(y_true, y_pred)
rmse = MSE(y_test, y_pred)**0.5
rmse

2.8566593158444205

In [88]:
# Final adjustment: rebuild the model with the best grid-search parameters
model = XGBRegressor(max_depth = 1,
                    min_child_weight= 5,
                    subsample = 0.9,
                    colsample_bytree = 0.7,
                    colsample_bylevel = 0.8,
                    colsample_bynode = 0.6,
                    n_estimators = 50,
                    missing=-999.0)

# Model Fitting
model.fit(X_train_transformed, y_train)
# Model Prediction
y_pred = model.predict(X_test_transformed)
# RMSE on the test set; arguments follow mean_squared_error(y_true, y_pred)
rmse = MSE(y_test, y_pred)**0.5
rmse

2.786768260005396

### Building a Machine Learning Pipeline

In [90]:
# End-to-end pipeline: null imputation -> sparse one-hot matrix -> tuned regressor
full_pipeline = Pipeline(steps=[
    ('null_imputer', NullValueImputer()),
    ('sparse', SparseMatrix()),
    ('xgb', XGBRegressor(
        n_estimators=50,
        max_depth=1,
        min_child_weight=5,
        subsample=0.9,
        colsample_bytree=0.7,
        colsample_bylevel=0.8,
        colsample_bynode=0.6,
        missing=-999.0,
    )),
])

In [91]:
full_pipeline.fit(X, y)

In [92]:
new_data = X_test
full_pipeline.predict(new_data)

array([13.997908 ,  9.828741 , 12.510665 , 13.765332 , 12.376085 ,
       11.433457 , 13.981382 , 11.71628  , 10.466907 , 12.796446 ,
       13.017861 ,  9.320066 , 12.833034 , 12.375658 , 13.831662 ,
        8.55412  , 10.614889 , 10.395777 , 14.553729 , 10.508732 ,
       12.374157 , 13.279873 ,  7.844789 , 12.735181 ,  7.8177586,
        8.346847 , 10.704345 , 13.606125 , 13.591815 , 12.699286 ,
       11.773319 , 11.953569 , 14.490659 ,  9.611805 , 11.418309 ,
       13.652652 , 11.806082 , 11.490563 ,  8.924844 , 12.924772 ,
       11.597438 , 11.775952 , 12.034565 , 14.044403 , 13.819095 ,
       14.463523 , 12.372666 , 12.587751 , 12.903717 , 12.310608 ,
       13.955001 ,  7.254089 ,  6.7818146, 12.401753 , 13.562242 ,
       10.800904 , 12.664088 ,  9.025816 , 13.332284 , 12.268295 ,
       12.463243 ,  6.8164735,  9.692486 , 12.29682  , 13.999986 ,
       11.02311  , 13.591802 , 13.881598 , 12.39135  , 10.6344795,
       12.934876 , 12.636299 , 12.774744 ,  8.275056 , 13.0518

In [93]:
np.round(full_pipeline.predict(new_data))

array([14., 10., 13., 14., 12., 11., 14., 12., 10., 13., 13.,  9., 13.,
       12., 14.,  9., 11., 10., 15., 11., 12., 13.,  8., 13.,  8.,  8.,
       11., 14., 14., 13., 12., 12., 14., 10., 11., 14., 12., 11.,  9.,
       13., 12., 12., 12., 14., 14., 14., 12., 13., 13., 12., 14.,  7.,
        7., 12., 14., 11., 13.,  9., 13., 12., 12.,  7., 10., 12., 14.,
       11., 14., 14., 12., 11., 13., 13., 13.,  8., 13., 12., 14., 14.,
       12., 10., 12.,  8., 11., 11., 13., 11., 10., 12., 15., 14., 12.,
       10., 14., 13., 12., 13., 13.,  9., 13., 14., 13., 10., 12., 13.,
       13., 14., 13., 11.,  9., 14.,  7., 11., 13., 12., 14., 12., 12.,
       12., 12., 12., 13.,  8., 12., 11., 14., 12., 14., 14., 12., 13.,
       10., 14.,  8., 10., 13., 13., 10., 11., 13., 13., 10., 12., 13.,
       13., 13., 11., 13., 13., 14., 13.,  9., 10., 12.,  8.,  9., 12.,
       14., 13., 13., 11., 12., 12.,  9.], dtype=float32)

In [94]:
# Refit the full pipeline on a fresh load of the data.
# The file is semicolon-delimited, and the predictors/target must be sliced
# from the frame that was just loaded (new_df).
new_df = pd.read_csv('student-por.csv', sep=';')
new_X = new_df.iloc[:, :-3]
new_y = new_df.iloc[:, -1]
new_model = full_pipeline.fit(new_X, new_y)

In [95]:
more_new_data = X_test[:25]
np.round(new_model.predict(more_new_data))

array([14., 10., 13., 14., 12., 11., 14., 12., 10., 13., 13.,  9., 13.,
       12., 14.,  9., 11., 10., 15., 11., 12., 13.,  8., 13.,  8.],
      dtype=float32)

In [96]:
# One more small catch: to predict a single row, stack it on top of the first
# 25 test rows, predict on the combined frame, and keep only the first
# prediction (the one belonging to the single row).
single_row = X_test[:1]
single_row_plus = pd.concat([single_row, X_test[:25]])
print(np.round(new_model.predict(single_row_plus))[:1])

[14.]
