In [80]:
#importing libraries 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
import pickle



# 0. Label encoded original data

### to check feature importance

In [13]:
#reading original label encoded data
df = pd.read_csv("Data/transformed/train_org_lb.csv")
df.head()



Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,50,88,30,80,1,4,4,1,4,0,8,0,0,1,1,1,H
1,32,82,59,63,1,3,3,2,0,1,0,1,1,1,0,0,M
2,50,62,13,33,1,3,3,2,0,1,4,1,0,0,0,0,L
3,60,80,50,40,0,4,11,0,8,0,9,0,1,0,0,1,M
4,70,92,50,7,0,4,4,2,0,1,7,0,1,1,1,1,H


In [14]:
#checking number of rows and columns
df.shape

(408, 17)

In [15]:
#convert feature's string values into numbers to apply models
label_to_key = {}
key_to_label = {}
for i,label in enumerate(sorted(df['Class'].unique())):
    label_to_key[label] = i
    key_to_label[i]= label

In [17]:
df.Class = df.Class.apply(lambda x :label_to_key[x]) # converting Class feature values from str to number

Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,50,88,30,80,1,4,4,1,4,0,8,0,0,1,1,1,0
1,32,82,59,63,1,3,3,2,0,1,0,1,1,1,0,0,2
2,50,62,13,33,1,3,3,2,0,1,4,1,0,0,0,0,1
3,60,80,50,40,0,4,11,0,8,0,9,0,1,0,0,1,2
4,70,92,50,7,0,4,4,2,0,1,7,0,1,1,1,1,0


In [16]:
#checking the conversion
label_to_key

{'H': 0, 'L': 1, 'M': 2}

In [18]:
# extracting input features(X) and output (y)
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [19]:
# to save trained model in pickle file

def save_pkl_model(path,model):
    pickle.dump(model, open(path,'wb'))

In [9]:
# to load trained model 
def load_pkl_model(path):
    return pickle.load(open(path, 'rb'))
    

**Random forest**

to check importance of features 

In [2]:
classifier = RandomForestClassifier()

In [3]:
f1 = make_scorer(f1_score, average='micro')

In [59]:
# setting parameters for grid search 


# n_estimatorsint, default=100
###The number of trees in the forest.

#criterion{“gini”, “entropy”}, default=”gini”
###The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

#bootstrapbool, default=True
###Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 
grid_param = {
    'n_estimators': [int(x) for x in range(200,2000,200)],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [60]:
#  grid search cross fold validation

gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring=f1,
                     cv=5,
                     n_jobs=-1)

In [61]:
# applying random forest with different combinations of hyperparameters 
gd_sr.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400,
                                          1600, 1800]},
             scoring=make_scorer(f1_score, average=micro))

In [62]:
# getting model with best hyperparameters 
best_random = gd_sr.best_estimator_


In [None]:
#pritning best score, estimator and feature importance,  
print (gd_sr.best_score_,best_random,best_random.feature_importances_)

In [64]:
#saving the model 
save_pkl_model("models/rf.pkl",best_random)

In [10]:
#loading the model
best_random = load_pkl_model("models/rf.pkl")


In [63]:
# combining feature importance with column names in descending order to see 

d = {'Stats':X.columns,'FI':best_random.feature_importances_}
df1 = pd.DataFrame(d)

print(df1.sort_values(by=['FI'], ascending=False))



0.8210478771454381 RandomForestClassifier(criterion='entropy', n_estimators=1400) [0.14469198 0.19021877 0.1225293  0.08604307 0.02242389 0.03307895
 0.03152071 0.01396492 0.03192009 0.01654488 0.04661019 0.01087655
 0.0328439  0.03849443 0.02370171 0.15453664]
                       Stats        FI
1           VisITedResources  0.190219
15        StudentAbsenceDays  0.154537
0                raisedhands  0.144692
2          AnnouncementsView  0.122529
3                 Discussion  0.086043
10                     Topic  0.046610
13     ParentAnsweringSurvey  0.038494
5                NationalITy  0.033079
12                  Relation  0.032844
8                    GradeID  0.031920
6               PlaceofBirth  0.031521
14  ParentschoolSatisfaction  0.023702
4                     gender  0.022424
9                  SectionID  0.016545
7                    StageID  0.013965
11                  Semester  0.010877


In [21]:
# storing column names whose feature importance is less than 0.02 to drop 
drop_col = df1.Stats[df1.FI < 0.02]

In [22]:
# function to drop columns by matching names of columns
def func_dropCol(df,drop_col):
    for i in drop_col:
        df = df[df.columns.drop(list(df.filter(regex=i)))]
    return df




# 2. Naive Bays
**using one hot encoded ,bxtransformed data** 
**and important features from above model** 

**Assumptions:**

The biggest and only assumption is the assumption of conditional independence.

**Pros:**

1. Gives high performance when the conditional independence assumption is satisfied.
2. Easy to implement because only probabilities need to be calculated.
3. Works well with high-dimensional data, such as text.
4. Fast for real-time predictions.

**Cons:**

1. If conditional independence does not hold, then is performs poorly.
2. Has the problem of Numerical Stability or Numerical Underflow because of the multiplication of several small digits.
 

In [23]:
# loading classifier 
classifier = GaussianNB()

In [24]:
# reading one hot encoded box transformed data 
df = pd.read_csv("Data/transformed/train_bx_ohe.csv")

In [25]:
df.head()

Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion,gender_M,NationalITy_Iran,NationalITy_Iraq,NationalITy_Jordan,NationalITy_KW,NationalITy_Lybia,...,Topic_Math,Topic_Quran,Topic_Science,Topic_Spanish,Semester_S,Relation_Mum,ParentAnsweringSurvey_Yes,ParentschoolSatisfaction_Good,StudentAbsenceDays_Under-7,Class
0,0.283065,0.915333,-0.046205,1.202617,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,H
1,-0.248,0.787987,0.814707,0.738216,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,M
2,0.283065,0.330558,-0.819927,-0.262502,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,L
3,0.53294,0.744635,0.58067,0.000635,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,M
4,0.760956,0.998122,0.58067,-1.642231,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,H


In [26]:
# checking number of rows(r) and columns(c)  (r,c)
df.shape

(408, 61)

In [28]:
# dropping columns with less importance
df = func_dropCol(df,drop_col)
df.head()

Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion,gender_M,NationalITy_Iran,NationalITy_Iraq,NationalITy_Jordan,NationalITy_KW,NationalITy_Lybia,...,Topic_IT,Topic_Math,Topic_Quran,Topic_Science,Topic_Spanish,Relation_Mum,ParentAnsweringSurvey_Yes,ParentschoolSatisfaction_Good,StudentAbsenceDays_Under-7,Class
0,0.283065,0.915333,-0.046205,1.202617,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,H
1,-0.248,0.787987,0.814707,0.738216,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,M
2,0.283065,0.330558,-0.819927,-0.262502,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L
3,0.53294,0.744635,0.58067,0.000635,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,M
4,0.760956,0.998122,0.58067,-1.642231,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,H


In [29]:
# checking number of rows(r) and columns(c) in (r,c) format
df.shape

(408, 56)

In [30]:
# converting Class feature values from str to number
df.Class = df.Class.apply(lambda x :label_to_key[x]) 
df.head()

Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion,gender_M,NationalITy_Iran,NationalITy_Iraq,NationalITy_Jordan,NationalITy_KW,NationalITy_Lybia,...,Topic_IT,Topic_Math,Topic_Quran,Topic_Science,Topic_Spanish,Relation_Mum,ParentAnsweringSurvey_Yes,ParentschoolSatisfaction_Good,StudentAbsenceDays_Under-7,Class
0,0.283065,0.915333,-0.046205,1.202617,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0
1,-0.248,0.787987,0.814707,0.738216,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2
2,0.283065,0.330558,-0.819927,-0.262502,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.53294,0.744635,0.58067,0.000635,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2
4,0.760956,0.998122,0.58067,-1.642231,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0


In [31]:
# extracting input and target features
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [32]:
f1 = make_scorer(f1_score, average='micro')

In [33]:
# setting range of hyperparameters for grid search

#var_smoothingfloat, default=1e-9
##Portion of the largest variance of all features that is added to variances for calculation stability.
grid_param = {
    'var_smoothing': np.logspace(0,-9, num=10)}

In [34]:
# grid search cross validation
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring=f1,
                     cv=5,
                     n_jobs=-1)

In [35]:
# applying random forest with different combinations of hyperparameters 
gd_sr.fit(X, y)

GridSearchCV(cv=5, estimator=GaussianNB(), n_jobs=-1,
             param_grid={'var_smoothing': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09])},
             scoring=make_scorer(f1_score, average=micro))

In [36]:
print (gd_sr.best_score_, gd_sr.best_params_)

0.6789220114423367 {'var_smoothing': 0.1}


# 3. algorithms with Normalized and one hot encoded data

In [52]:
df = pd.read_csv("Data/transformed/train_mms_ohe.csv")
print("shape " , df.shape)
df = func_dropCol(df,drop_col)

print("shape after removing columns ", df.shape)
df.Class = df.Class.apply(lambda x :label_to_key[x]) # mapping 
df.head()
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

shape  (408, 61)
shape after removing columns  (408, 56)


## 3.1 Decision Trees 
**Assumptions of algorithm** :

1. Initially, whole training data is considered as root.
2. Records are distributed recursively on the basis of the attribute value.

**Pros** :

1. Compared to other algorithms, data preparation requires less time.
2. Doesn’t require data to be normalized.
3. Missing values, to an extent, don’t affect its performance much.
4. Is very intuitive as can be explained as if-else conditions.

**Cons**:

1. Needs a lot of time to train the model.
2. A small change in data can cause a considerably large change in the Decision Tree structure.
3. Comparatively expensive to train.
4. Not good for regression tasks.

https://www.kdnuggets.com/2021/02/machine-learning-assumptions.html 
 

In [89]:
classifier = DecisionTreeClassifier()

In [90]:
# ??
f1 = make_scorer(f1_score, average='micro')

In [91]:
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [92]:
# grid search cross validation
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=params,
                     scoring=f1,
                     cv=5,
                     n_jobs=-1)

In [93]:
# fitting model with different combinations of hyperparameters 
gd_sr.fit(X, y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100]},
             scoring=make_scorer(f1_score, average=micro))

In [43]:
best_random = gd_sr.best_estimator_

In [45]:
save_pkl_model("models/decision_tree.pkl",best_random)

In [44]:
print (gd_sr.best_score_, gd_sr.best_params_)

0.7476663655525445 {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 5}


# 3.2 Random forest

In [69]:
classifier = RandomForestClassifier()

In [70]:
f1 = make_scorer(f1_score, average='micro')

In [71]:
# n_estimatorsint, default=100
###The number of trees in the forest.

#criterion{“gini”, “entropy”}, default=”gini”
###The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

#bootstrapbool, default=True
###Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 
grid_param = {
    'n_estimators': [int(x) for x in range(200,2000,200)],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [72]:


gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring=f1,
                     cv=5,
                     n_jobs=-1)

In [73]:
# fitting model with different combinations of hyperparameters 
gd_sr.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400,
                                          1600, 1800]},
             scoring=make_scorer(f1_score, average=micro))

In [74]:
print (gd_sr.best_score_, gd_sr.best_params_,gd_sr.best_estimator_.feature_importances_)

0.8235170129479072 {'bootstrap': False, 'criterion': 'entropy', 'n_estimators': 1200} [1.39391289e-01 1.67668527e-01 1.16293465e-01 8.92054264e-02
 2.65855691e-02 1.69458306e-03 4.41671792e-03 1.29594073e-02
 1.23105382e-02 1.20747395e-03 1.03705182e-03 5.74252832e-03
 4.92646410e-03 1.76656371e-03 2.57781185e-03 7.49381339e-04
 2.21051158e-03 1.25728733e-04 1.60093140e-03 4.69117540e-03
 1.21717491e-02 1.28142374e-02 1.33674254e-03 1.05384923e-03
 2.91819253e-03 4.45173087e-03 1.38175298e-03 1.75520255e-03
 2.05199687e-03 3.14238993e-03 1.18754610e-04 1.02351943e-02
 4.25232742e-05 6.32379756e-03 1.13565298e-02 1.20480471e-02
 1.02267456e-03 1.29494829e-03 2.78029799e-03 1.66685472e-03
 4.94376520e-03 7.45661870e-03 7.39790453e-03 8.66912799e-03
 7.27651984e-03 5.61289083e-03 1.08285688e-02 5.96464049e-03
 3.76493870e-03 7.61006993e-03 4.22507140e-03 3.45892615e-02
 3.99977490e-02 2.82488949e-02 1.36285366e-01]


In [75]:
save_pkl_model("models/random_forest.pkl",gd_sr.best_estimator_)

In [59]:
d = {'Stats':X.columns,'FI':gd_sr.best_estimator_.feature_importances_}
df1 = pd.DataFrame(d)


In [60]:
df1

Unnamed: 0,Stats,FI
0,raisedhands,0.138884
1,VisITedResources,0.164786
2,AnnouncementsView,0.117418
3,Discussion,0.088904
4,gender_M,0.026567
5,NationalITy_Iran,0.001502
6,NationalITy_Iraq,0.00471
7,NationalITy_Jordan,0.012951
8,NationalITy_KW,0.013042
9,NationalITy_Lybia,0.001386


# 3.3 Logistic regression 

In [76]:
classifier = LogisticRegression()

In [77]:
f1 = make_scorer(f1_score, average='micro')

In [78]:
grid_param =  {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 3),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000]
    }

In [79]:
# grid search cross validation
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring=f1,
                     cv=5,
                     n_jobs=-1)

In [66]:
# fitting model with different combinations of hyperparameters 
gd_sr.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-04, 1.e+00, 1.e+04]),
                         'max_iter': [100, 1000],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag',
                                    'saga']},
             scoring=make_scorer(f1_score, average=micro))

In [67]:
print (gd_sr.best_score_, gd_sr.best_params_,gd_sr.best_estimator_)

0.7622704004817826 {'C': 1.0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'} LogisticRegression()


In [68]:
save_pkl_model("models/logistic_regression.pkl",gd_sr.best_estimator_)

# 3.4 SVM
**Assumptions:**

It assumes data is independent and identically distributed.

**Pros:**

1. Works really well on high dimensional data.
2. Memory efficient.
3. Effective in cases where the number of dimensions is greater than the number of samples.

**Cons:**

1. Not suitable for large datasets.
2. Doesn’t work well when the dataset has noise, i.e., the target classes are overlapping.
3. Slow to train.
4. No probabilistic explanation for classification.

In [81]:
classifier = SVC()

In [82]:
f1 = make_scorer(f1_score, average='micro')

In [83]:
grid_param = { 'C':[0.1,1,100,1000],
'kernel':['rbf','poly','sigmoid','linear'],
'degree':[1,5,6]
}

In [84]:
# grid search cross validation
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring=f1,
                     cv=5,
                     n_jobs=-1)

In [85]:
# fitting model with different combinations of hyperparameters 
gd_sr.fit(X, y)

GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 100, 1000], 'degree': [1, 5, 6],
                         'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
             scoring=make_scorer(f1_score, average=micro))

In [86]:
print (gd_sr.best_score_, gd_sr.best_params_,gd_sr.best_estimator_)

0.7917193616380608 {'C': 1, 'degree': 1, 'kernel': 'rbf'} SVC(C=1, degree=1)


In [87]:
save_pkl_model("models/svm.pkl",gd_sr.best_estimator_)

# 3.5 XGBOOST

**Assumptions:**

It may have an assumption that encoded integer value for each variable has ordinal relation.

**Pros:**

1. Can work in parallell.
2. Can handle missing values.
3. No need for scaling or normalizing data.
4. Fast to interpret.
5. Great execution speed.

**Cons:**

1. Can easily overfit if parameters are not tuned properly.
2. Hard to tune.

In [2]:
pip install --no-cache-dir xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
from xgboost import XGBClassifier

XGBoostError: 
XGBoost Library (libxgboost.so) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ['/home/ubuntu/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so: cannot read file data']


In [21]:
classifier = XGBClassifier()

In [22]:
f1 = make_scorer(f1_score, average='micro')

In [23]:
# grid search cross validation
grid_param  = {
    'n_estimators': [100],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1,2],
    'min_samples_leaf': [1,2],
    'max_leaf_nodes': [4,50,None]
}

In [24]:
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring=f1,
                     cv=5,
                     n_jobs=-1)

In [25]:
# fitting model with different combinations of hyperparameters 
gd_sr.fit(X, y)

Parameters: { "criterion", "max_leaf_nodes", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.

Parameters: { "criterion", "max_leaf_nodes", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "criterion", "max_leaf_nodes", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, o

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_ca...
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                        

In [None]:
print (gd_sr.best_score_, gd_sr.best_params_,gd_sr.best_estimator_)

0.7917193616380608 {'C': 1, 'degree': 1, 'kernel': 'rbf'} SVC(C=1, degree=1)


In [None]:
save_pkl_model("models/xgboost.pkl",gd_sr.best_estimator_)

# 3.6 Neural network 