# Conflict in clinical significance in ClinVar 

Importing libraries

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import os as os 
import time as time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import balanced_accuracy_score,accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

## Reading data 

In [2]:
# main dataset
df = pd.read_csv("finalClin_3.csv", encoding='latin-1')
# Validation dataset
vdf = pd.read_csv("val_df.csv", encoding='latin-1')
#Production (Single submitter) dataset
ss = pd.read_csv("ss_df_2.csv", encoding='latin-1')

print("main data frame" ,df.shape)
print("validation data frame", vdf.shape)
print("single submitter data frame", ss.shape)

main data frame (187493, 50)
validation data frame (28993, 50)
single submitter data frame (690455, 50)


In [3]:
#dataframes concatenation
l = [df,vdf, ss]
df = pd.concat(l, ignore_index = True)
df.shape

(906941, 50)

In [4]:
# dropping columns no longer needed
df = df.drop(['ID', 'Protein_position', 'Codons'], axis = 1).copy()


### defining predictors and response variables

In [5]:
X = df.drop('class', axis = 1).copy()
X.head()

Unnamed: 0,SYMBOL,Allele,CLNVC,Consequence,IMPACT,relativeLocationRatio,ExIntron,TSL,Denisova,SIFT,...,H1.hESC_confidence_value,HUVEC_confidence_value,SiPhy_29way_logOdds,bStatistic,phastCons100way_vertebrate,GM12878_fitCons_rankscore,H1.hESC_fitCons_rankscore,HUVEC_fitCons_rankscore,phyloP30way_mammalian_rankscore,EVE_scores_ASM
0,ISG15,G>A,single_nucleotide_variant,missense_variant,MODERATE,1.0,2.0,,G/G,0.1,...,0.0,0.0,3.488,929.0,0.122,0.09955,0.31042,0.5714,0.04913,
1,ISG15,G>A,single_nucleotide_variant,missense_variant,MODERATE,1.0,2.0,,A/A,0.38,...,0.0,0.0,9.972,929.0,0.0,0.09955,0.31042,0.5714,0.01748,
2,ISG15,G>A,single_nucleotide_variant,missense_variant,MODERATE,1.0,2.0,,G/G,0.13,...,0.0,0.0,6.5244,929.0,0.0,0.09955,0.31042,0.5714,0.1211,
3,ISG15,G>A,single_nucleotide_variant,missense_variant,MODERATE,1.0,2.0,,G/G,0.87,...,0.0,0.0,3.5096,929.0,0.0,0.09955,0.31042,0.5714,0.00043,
4,AGRN,G>C,single_nucleotide_variant,missense_variant,MODERATE,0.027778,1.0,1.0,G/G,0.12,...,0.0,1.0,2.0859,934.0,0.0,0.04544,0.10781,0.0843,0.01906,


In [6]:
y = df['class'].copy()
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: class, dtype: float64

### One-hot encoding

In [7]:
print("One-Hot Encoding")
X_encoded = pd.get_dummies(X, columns=['SYMBOL',
                                       'Allele',
                                       'CLNVC',
                                       'Consequence',
                                       'IMPACT',
                                       'CollectionMethod',
                                       'Denisova',
                                       'TSL',
                                       'firstSubmitter'])
X_encoded.head()

One-Hot Encoding


Unnamed: 0,relativeLocationRatio,ExIntron,SIFT,PolyPhen,submitterNo,CADD_raw_rankscore_hg19,ClinPred_rankscore,DANN_rankscore,Eigen.raw_coding_rankscore,LIST.S2_rankscore,...,firstSubmitter_UNCMolecularGeneticsLaboratoryUnive,firstSubmitter_UWHindbrainMalformationResearchProg,firstSubmitter_UndiagnosedDiseasesNetworkNIH,firstSubmitter_UnitforGeneticEpidemiologicalResear,firstSubmitter_UniversityofWashingtonCenterforMend,firstSubmitter_UniversityofWashingtonDepartmentofL,firstSubmitter_VictorianClinicalGeneticsServicesMu,firstSubmitter_WomensHealthandGeneticsLaboratoryCo,firstSubmitter_WongMitoLabMolecularandHumanGenetic,firstSubmitter_other
0,1.0,2.0,0.1,0.952,2,0.38964,0.15687,0.89085,0.38367,0.33984,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2.0,0.38,0.003,3,0.06002,0.00038,0.15196,0.03563,0.14579,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2.0,0.13,0.115,2,0.15404,0.19137,0.44442,0.16641,0.3465,...,0,0,0,0,0,0,0,0,0,0
3,1.0,2.0,0.87,0.001,2,0.00104,0.00179,0.11246,0.00039,0.12095,...,0,0,0,0,0,0,0,0,0,0
4,0.027778,1.0,0.12,0.0,2,0.12689,0.0005,0.00769,0.02306,0.05533,...,0,0,0,0,0,0,0,0,0,0


### seprating Validation dataset from the X and y

In [8]:
print("orginal validation set dimention", vdf.shape)

# X_val
X_val = pd.DataFrame(X_encoded.loc[187494:216486])
#X_val.head()
print(X_val.shape)

type(y)

y_val = y.iloc[187493:216486]
print(y_val.shape)

#ss dataframe
sX = pd.DataFrame(X_encoded.loc[216486:])
print(sX.shape)

# now reverting X and y to their orginal shape
X_encoded = pd.DataFrame(X_encoded.loc[:187492])
#X_val.head()
print(X_encoded.shape)

y = y.iloc[:187493]
print(y.shape)



orginal validation set dimention (28993, 50)
(28993, 4123)
(28993,)
(690455, 4123)
(187493, 4123)
(187493,)


### Basci Model

In [9]:
# checking precent of diffrent class
sum(y)/len(y)

0.26144442725861766

In [10]:
# so lets stratify by y
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state = 42, stratify=y)


In [11]:
# # verify using stratify worked as expected
sum(y_train)/len(y_train)

0.26144404383475917

In [12]:
sum(y_test)/len(y_test)

0.26144557750565345

In [34]:
# making classifier
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic',
                            seed =42,
                            n_estimators=500)

# define the eval set and metric
eval_set = [(X_test, y_test)]
eval_metric = ["aucpr","error"]

%time clf_xgb.fit(X_train, y_train, verbose = True, early_stopping_rounds = 10, eval_metric= eval_metric, eval_set=eval_set)



[0]	validation_0-aucpr:0.60232	validation_0-error:0.20361
[1]	validation_0-aucpr:0.62628	validation_0-error:0.20084
[2]	validation_0-aucpr:0.62972	validation_0-error:0.19945
[3]	validation_0-aucpr:0.64011	validation_0-error:0.19668
[4]	validation_0-aucpr:0.64502	validation_0-error:0.19653
[5]	validation_0-aucpr:0.64938	validation_0-error:0.19499
[6]	validation_0-aucpr:0.65237	validation_0-error:0.19422
[7]	validation_0-aucpr:0.65512	validation_0-error:0.19358
[8]	validation_0-aucpr:0.65739	validation_0-error:0.19281
[9]	validation_0-aucpr:0.65956	validation_0-error:0.19256
[10]	validation_0-aucpr:0.66400	validation_0-error:0.19228
[11]	validation_0-aucpr:0.66866	validation_0-error:0.19083
[12]	validation_0-aucpr:0.67072	validation_0-error:0.19064
[13]	validation_0-aucpr:0.67312	validation_0-error:0.18955
[14]	validation_0-aucpr:0.67359	validation_0-error:0.18912
[15]	validation_0-aucpr:0.67391	validation_0-error:0.18889
[16]	validation_0-aucpr:0.67581	validation_0-error:0.18812
[17]	va

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [35]:
#Model assessment
pred_test = clf_xgb.predict(X_test)
pred_train = clf_xgb.predict(X_train)
print('Train Accuracy: ', accuracy_score(y_train, pred_train))
print('Test Accuraccy: ', accuracy_score(y_test, pred_test))
print('Classification Report:')
print(classification_report(y_test,pred_test))

Train Accuracy:  0.8227266585596541
Test Accuraccy:  0.8174041046208986
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.94      0.88     34619
           1       0.74      0.47      0.57     12255

    accuracy                           0.82     46874
   macro avg       0.78      0.71      0.73     46874
weighted avg       0.81      0.82      0.80     46874



### Best model performance

In [164]:
# Classifer based on hyperopt library
clf_xgb = xgb.XGBClassifier(seed=42,
                            objective = 'binary:logistic',
                            gamma = 3.65,
                            reg_lambda = 6.219,
                            learning_rate = 0.246,
                            max_depth = 6,
                            scale_pos_weight =  1.22,
                            subsample = 0.9,
                            colsample_bytree = 0.5,
                            n_estimators=500,
                            use_label_encoder=False)
eval_set = [(X_test, y_test)]
eval_metric = ["aucpr","error"]

clf_xgb.fit(X_train,
            y_train,
            verbose = True,
            early_stopping_rounds  =10,
            eval_metric = eval_metric,
            eval_set = eval_set)



[0]	validation_0-aucpr:0.50716	validation_0-error:0.31566
[1]	validation_0-aucpr:0.58651	validation_0-error:0.34213
[2]	validation_0-aucpr:0.60542	validation_0-error:0.30104
[3]	validation_0-aucpr:0.60594	validation_0-error:0.29573
[4]	validation_0-aucpr:0.61156	validation_0-error:0.29707
[5]	validation_0-aucpr:0.62236	validation_0-error:0.28873
[6]	validation_0-aucpr:0.63715	validation_0-error:0.27273
[7]	validation_0-aucpr:0.64076	validation_0-error:0.26654
[8]	validation_0-aucpr:0.64615	validation_0-error:0.25884
[9]	validation_0-aucpr:0.65084	validation_0-error:0.25677
[10]	validation_0-aucpr:0.65593	validation_0-error:0.25831
[11]	validation_0-aucpr:0.65920	validation_0-error:0.25458
[12]	validation_0-aucpr:0.66020	validation_0-error:0.25594
[13]	validation_0-aucpr:0.66300	validation_0-error:0.25082
[14]	validation_0-aucpr:0.66328	validation_0-error:0.25225
[15]	validation_0-aucpr:0.66459	validation_0-error:0.25537
[16]	validation_0-aucpr:0.66580	validation_0-error:0.25748
[17]	va

[139]	validation_0-aucpr:0.69722	validation_0-error:0.23589
[140]	validation_0-aucpr:0.69732	validation_0-error:0.23604
[141]	validation_0-aucpr:0.69737	validation_0-error:0.23593
[142]	validation_0-aucpr:0.69739	validation_0-error:0.23580
[143]	validation_0-aucpr:0.69746	validation_0-error:0.23572
[144]	validation_0-aucpr:0.69746	validation_0-error:0.23553
[145]	validation_0-aucpr:0.69756	validation_0-error:0.23557
[146]	validation_0-aucpr:0.69759	validation_0-error:0.23555
[147]	validation_0-aucpr:0.69773	validation_0-error:0.23548
[148]	validation_0-aucpr:0.69772	validation_0-error:0.23514
[149]	validation_0-aucpr:0.69789	validation_0-error:0.23535
[150]	validation_0-aucpr:0.69793	validation_0-error:0.23540
[151]	validation_0-aucpr:0.69792	validation_0-error:0.23553
[152]	validation_0-aucpr:0.69796	validation_0-error:0.23542
[153]	validation_0-aucpr:0.69802	validation_0-error:0.23538
[154]	validation_0-aucpr:0.69801	validation_0-error:0.23563
[155]	validation_0-aucpr:0.69813	validat

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=3.65, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.246, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=6.219, scale_pos_weight=3, seed=42,
              subsample=0.9, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [165]:
# final model assessment
pred_test = clf_xgb.predict(X_test)
pred_train = clf_xgb.predict(X_train)
print('Train Accuracy: ', accuracy_score(y_train, pred_train))
print('Test Accuraccy: ', accuracy_score(y_test, pred_test))
print('Classification Report:')
print(classification_report(y_test,pred_test))


Train Accuracy:  0.7850148272992981
Test Accuraccy:  0.7648589836583181
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.76      0.83     34619
           1       0.53      0.78      0.63     12255

    accuracy                           0.76     46874
   macro avg       0.72      0.77      0.73     46874
weighted avg       0.81      0.76      0.78     46874



#### performance on validation set

In [38]:
# final model assessment
pred_val = clf_xgb.predict(X_val)
pred_train = clf_xgb.predict(X_train)
print('Train Accuracy: ', accuracy_score(y_train, pred_train))
print('Validation test accuraccy: ', accuracy_score(y_val, pred_val))
print('Classification Report:')
print(classification_report(y_val,pred_val))



Train Accuracy:  0.8247889687737788
Validation test accuraccy:  0.881626599524023
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94     25831
           1       0.35      0.10      0.15      3162

    accuracy                           0.88     28993
   macro avg       0.62      0.54      0.54     28993
weighted avg       0.84      0.88      0.85     28993



In [28]:
# final model assessment final model 3
pred_test = conflict_clf_xgb.predict(X_test)
pred_train = conflict_clf_xgb.predict(X_train)
pred_val = clf_xgb.predict(X_val)
print('Train Accuracy: ', accuracy_score(y_train, pred_train))
print('Test Accuraccy: ', accuracy_score(y_test, pred_test))
print('Validation test accuraccy: ', accuracy_score(y_val, pred_val))

print('Classification Report on test set:')
print(classification_report(y_test,pred_test))

print("")


print('Classification Report on validation set:')
print(classification_report(y_val,pred_val))

Train Accuracy:  0.7446788840768317
Test Accuraccy:  0.7138285616759824
Validation test accuraccy:  0.8846963060048977
Classification Report on test set:
              precision    recall  f1-score   support

           0       0.93      0.66      0.77     34619
           1       0.47      0.86      0.61     12255

    accuracy                           0.71     46874
   macro avg       0.70      0.76      0.69     46874
weighted avg       0.81      0.71      0.73     46874


Classification Report on validation set:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94     25831
           1       0.35      0.07      0.12      3162

    accuracy                           0.88     28993
   macro avg       0.62      0.53      0.53     28993
weighted avg       0.84      0.88      0.85     28993



In [None]:
# developer version final model 2
# Classifer based on hyperopt library
clf_xgb = xgb.XGBClassifier(seed=42,
                            objective = 'binary:logistic',
                            gamma = 3.65,
                            reg_lambda = 6.219,
                            learning_rate = 0.246,
                            max_depth = 6,
                            scale_pos_weight =  3,
                            subsample = 0.9,
                            colsample_bytree = 0.5,
                            n_estimators=500,
                            use_label_encoder=False)
eval_set = [(X_test, y_test)]
eval_metric = ["aucpr","error"]

clf_xgb.fit(X_train,
            y_train,
            verbose = True,
            early_stopping_rounds  =10,
            eval_metric = eval_metric,
            eval_set = eval_set)


In [27]:
# Developer version final model 3 
#Optimized to discover conflictig events weighted 
# Classifer based on hyperopt library
conflict_clf_xgb = xgb.XGBClassifier(seed=42,
                            objective = 'binary:logistic',
                            gamma = 3.65,
                            reg_lambda = 6.219,
                            learning_rate = 0.246,
                            max_depth = 6,
                            scale_pos_weight =  5,
                            subsample = 0.9,
                            colsample_bytree = 0.5,
                            n_estimators=500,
                            use_label_encoder=False)
eval_set = [(X_test, y_test)]
eval_metric = ["aucpr","error"]

conflict_clf_xgb.fit(X_train,
            y_train,
            verbose = True,
            early_stopping_rounds  =10,
            eval_metric = eval_metric,
            eval_set = eval_set)

[0]	validation_0-aucpr:0.49328	validation_0-error:0.49571
[1]	validation_0-aucpr:0.57975	validation_0-error:0.44756
[2]	validation_0-aucpr:0.59637	validation_0-error:0.43796
[3]	validation_0-aucpr:0.59944	validation_0-error:0.42672
[4]	validation_0-aucpr:0.59973	validation_0-error:0.42657
[5]	validation_0-aucpr:0.61234	validation_0-error:0.41953
[6]	validation_0-aucpr:0.63047	validation_0-error:0.40897
[7]	validation_0-aucpr:0.63567	validation_0-error:0.40656
[8]	validation_0-aucpr:0.64164	validation_0-error:0.39418
[9]	validation_0-aucpr:0.64363	validation_0-error:0.40097
[10]	validation_0-aucpr:0.64937	validation_0-error:0.37588
[11]	validation_0-aucpr:0.65458	validation_0-error:0.37180
[12]	validation_0-aucpr:0.65648	validation_0-error:0.37035
[13]	validation_0-aucpr:0.66270	validation_0-error:0.36468
[14]	validation_0-aucpr:0.66332	validation_0-error:0.36351
[15]	validation_0-aucpr:0.66460	validation_0-error:0.36325
[16]	validation_0-aucpr:0.66405	validation_0-error:0.35645
[17]	va

[139]	validation_0-aucpr:0.69449	validation_0-error:0.30307
[140]	validation_0-aucpr:0.69453	validation_0-error:0.30288
[141]	validation_0-aucpr:0.69470	validation_0-error:0.30356
[142]	validation_0-aucpr:0.69469	validation_0-error:0.30339
[143]	validation_0-aucpr:0.69485	validation_0-error:0.30337
[144]	validation_0-aucpr:0.69481	validation_0-error:0.30322
[145]	validation_0-aucpr:0.69475	validation_0-error:0.30339
[146]	validation_0-aucpr:0.69473	validation_0-error:0.30330
[147]	validation_0-aucpr:0.69489	validation_0-error:0.30341
[148]	validation_0-aucpr:0.69508	validation_0-error:0.30334
[149]	validation_0-aucpr:0.69488	validation_0-error:0.30256
[150]	validation_0-aucpr:0.69490	validation_0-error:0.30247
[151]	validation_0-aucpr:0.69498	validation_0-error:0.30239
[152]	validation_0-aucpr:0.69508	validation_0-error:0.30217
[153]	validation_0-aucpr:0.69510	validation_0-error:0.30211
[154]	validation_0-aucpr:0.69514	validation_0-error:0.30209
[155]	validation_0-aucpr:0.69520	validat

[276]	validation_0-aucpr:0.70216	validation_0-error:0.28935
[277]	validation_0-aucpr:0.70235	validation_0-error:0.28892
[278]	validation_0-aucpr:0.70250	validation_0-error:0.28892
[279]	validation_0-aucpr:0.70249	validation_0-error:0.28884
[280]	validation_0-aucpr:0.70250	validation_0-error:0.28856
[281]	validation_0-aucpr:0.70255	validation_0-error:0.28850
[282]	validation_0-aucpr:0.70263	validation_0-error:0.28871
[283]	validation_0-aucpr:0.70268	validation_0-error:0.28865
[284]	validation_0-aucpr:0.70283	validation_0-error:0.28865
[285]	validation_0-aucpr:0.70288	validation_0-error:0.28860
[286]	validation_0-aucpr:0.70287	validation_0-error:0.28856
[287]	validation_0-aucpr:0.70284	validation_0-error:0.28833
[288]	validation_0-aucpr:0.70291	validation_0-error:0.28845
[289]	validation_0-aucpr:0.70286	validation_0-error:0.28843
[290]	validation_0-aucpr:0.70290	validation_0-error:0.28850
[291]	validation_0-aucpr:0.70290	validation_0-error:0.28843
[292]	validation_0-aucpr:0.70315	validat

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=3.65, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.246, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=6.219, scale_pos_weight=5, seed=42,
              subsample=0.9, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [None]:
# final model assessment final model-2
pred_test = clf_xgb.predict(X_test)
pred_train = clf_xgb.predict(X_train)
print('Train Accuracy: ', accuracy_score(y_train, pred_train))
print('Test Accuraccy: ', accuracy_score(y_test, pred_test))
print('Classification Report:')
print(classification_report(y_test,pred_test))

# Finalize the model 

Finalize a model by applying the chosen machine learning procedure on all of the  data. In our case, we will run XGBoost with optimized hyperprametr on all of the data that we got: train + test + validation

In [39]:
# data preparing: providig all data to one hot coidn and y.
df = pd.read_csv("finalClin_3.csv", encoding='latin-1')
vdf = pd.read_csv("val_df.csv", encoding='latin-1')
l = [df,vdf]
df = pd.concat(l, ignore_index = True)

In [40]:
#defining x an y
# dropping columns
df = df.drop(['ID', 'Protein_position', 'Codons'], axis = 1).copy()

print("defining the target variables and predictors")
X = df.drop('class', axis = 1).copy()
X.head()

y = df['class'].copy()
y.head()


print("One-Hot Encoding")
X_encoded = pd.get_dummies(X, columns=['SYMBOL',
                                       'Allele',
                                       'CLNVC',
                                       'Consequence',
                                       'IMPACT',
                                       'CollectionMethod',
                                       'Denisova',
                                       'TSL',
                                       'firstSubmitter'])
X_encoded.head()

defining the target variables and predictors
One-Hot Encoding


Unnamed: 0,relativeLocationRatio,ExIntron,SIFT,PolyPhen,submitterNo,CADD_raw_rankscore_hg19,ClinPred_rankscore,DANN_rankscore,Eigen.raw_coding_rankscore,LIST.S2_rankscore,...,firstSubmitter_UNCMolecularGeneticsLaboratoryUnive,firstSubmitter_UWHindbrainMalformationResearchProg,firstSubmitter_UndiagnosedDiseasesNetworkNIH,firstSubmitter_UnitforGeneticEpidemiologicalResear,firstSubmitter_UniversityofWashingtonCenterforMend,firstSubmitter_UniversityofWashingtonDepartmentofL,firstSubmitter_VictorianClinicalGeneticsServicesMu,firstSubmitter_WomensHealthandGeneticsLaboratoryCo,firstSubmitter_WongMitoLabMolecularandHumanGenetic,firstSubmitter_other
0,1.0,2.0,0.1,0.952,2,0.38964,0.15687,0.89085,0.38367,0.33984,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2.0,0.38,0.003,3,0.06002,0.00038,0.15196,0.03563,0.14579,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2.0,0.13,0.115,2,0.15404,0.19137,0.44442,0.16641,0.3465,...,0,0,0,0,0,0,0,0,0,0
3,1.0,2.0,0.87,0.001,2,0.00104,0.00179,0.11246,0.00039,0.12095,...,0,0,0,0,0,0,0,0,0,0
4,0.027778,1.0,0.12,0.0,2,0.12689,0.0005,0.00769,0.02306,0.05533,...,0,0,0,0,0,0,0,0,0,0


In [41]:
#final mode-1
#fINALIZED Classifer based on hyperopt library
clf_xgb = xgb.XGBClassifier(seed=42,
                            objective = 'binary:logistic',
                            gamma = 3.65,
                            reg_lambda = 6.219,
                            learning_rate = 0.246,
                            max_depth = 6,
                            scale_pos_weight =  1.22,
                            subsample = 0.9,
                            colsample_bytree = 0.5,
                            n_estimators=500,
                            use_label_encoder=False)
# # define the eval set and metric
eval_set = [(X_test, y_test)]
eval_metric = ["aucpr","error"]


clf_xgb.fit(X_encoded,
            y,
            verbose = True,
            early_stopping_rounds  =10, eval_set = eval_set, eval_metric = eval_metric)

[0]	validation_0-aucpr:0.51536	validation_0-error:0.23081
[1]	validation_0-aucpr:0.59439	validation_0-error:0.20280
[2]	validation_0-aucpr:0.62923	validation_0-error:0.19879
[3]	validation_0-aucpr:0.63206	validation_0-error:0.19574
[4]	validation_0-aucpr:0.63300	validation_0-error:0.19420
[5]	validation_0-aucpr:0.63507	validation_0-error:0.19979
[6]	validation_0-aucpr:0.63697	validation_0-error:0.19962
[7]	validation_0-aucpr:0.64675	validation_0-error:0.19273
[8]	validation_0-aucpr:0.64994	validation_0-error:0.19247
[9]	validation_0-aucpr:0.65209	validation_0-error:0.19247
[10]	validation_0-aucpr:0.65361	validation_0-error:0.19196
[11]	validation_0-aucpr:0.65586	validation_0-error:0.19207
[12]	validation_0-aucpr:0.65764	validation_0-error:0.19111
[13]	validation_0-aucpr:0.66448	validation_0-error:0.19028
[14]	validation_0-aucpr:0.66764	validation_0-error:0.18972
[15]	validation_0-aucpr:0.67178	validation_0-error:0.18946
[16]	validation_0-aucpr:0.67417	validation_0-error:0.18925
[17]	va

[139]	validation_0-aucpr:0.72540	validation_0-error:0.17163
[140]	validation_0-aucpr:0.72548	validation_0-error:0.17161
[141]	validation_0-aucpr:0.72574	validation_0-error:0.17146
[142]	validation_0-aucpr:0.72580	validation_0-error:0.17135
[143]	validation_0-aucpr:0.72607	validation_0-error:0.17146
[144]	validation_0-aucpr:0.72614	validation_0-error:0.17155
[145]	validation_0-aucpr:0.72634	validation_0-error:0.17125
[146]	validation_0-aucpr:0.72668	validation_0-error:0.17116
[147]	validation_0-aucpr:0.72678	validation_0-error:0.17112
[148]	validation_0-aucpr:0.72701	validation_0-error:0.17105
[149]	validation_0-aucpr:0.72742	validation_0-error:0.17095
[150]	validation_0-aucpr:0.72762	validation_0-error:0.17095
[151]	validation_0-aucpr:0.72760	validation_0-error:0.17097
[152]	validation_0-aucpr:0.72778	validation_0-error:0.17103
[153]	validation_0-aucpr:0.72784	validation_0-error:0.17114
[154]	validation_0-aucpr:0.72789	validation_0-error:0.17125
[155]	validation_0-aucpr:0.72796	validat

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=3.65, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.246, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=6.219, scale_pos_weight=1.22, seed=42,
              subsample=0.9, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [44]:
#final mode-2
#fINALIZED Classifer based on hyperopt library
clf_xgb_2 = xgb.XGBClassifier(seed=42,
                            objective = 'binary:logistic',
                            gamma = 3.65,
                            reg_lambda = 6.219,
                            learning_rate = 0.246,
                            max_depth = 6,
                            scale_pos_weight =  5,
                            subsample = 0.9,
                            colsample_bytree = 0.5,
                            n_estimators=500,
                            use_label_encoder=False)
# # define the eval set and metric
eval_set = [(X_test, y_test)]
eval_metric = ["aucpr","error"]


clf_xgb_2.fit(X_encoded,
            y,
            verbose = True,
            early_stopping_rounds  =10, eval_set = eval_set, eval_metric = eval_metric)

[0]	validation_0-aucpr:0.48678	validation_0-error:0.47645
[1]	validation_0-aucpr:0.56446	validation_0-error:0.47935
[2]	validation_0-aucpr:0.61225	validation_0-error:0.43982
[3]	validation_0-aucpr:0.61567	validation_0-error:0.43393
[4]	validation_0-aucpr:0.62007	validation_0-error:0.41853
[5]	validation_0-aucpr:0.62280	validation_0-error:0.40210
[6]	validation_0-aucpr:0.62639	validation_0-error:0.40837
[7]	validation_0-aucpr:0.63669	validation_0-error:0.39694
[8]	validation_0-aucpr:0.64454	validation_0-error:0.38714
[9]	validation_0-aucpr:0.64586	validation_0-error:0.39237
[10]	validation_0-aucpr:0.64704	validation_0-error:0.38921
[11]	validation_0-aucpr:0.64867	validation_0-error:0.38988
[12]	validation_0-aucpr:0.64991	validation_0-error:0.38228
[13]	validation_0-aucpr:0.65945	validation_0-error:0.36777
[14]	validation_0-aucpr:0.66125	validation_0-error:0.35517
[15]	validation_0-aucpr:0.66566	validation_0-error:0.35190
[16]	validation_0-aucpr:0.66848	validation_0-error:0.34819
[17]	va

[139]	validation_0-aucpr:0.72239	validation_0-error:0.29076
[140]	validation_0-aucpr:0.72258	validation_0-error:0.29065
[141]	validation_0-aucpr:0.72302	validation_0-error:0.29040
[142]	validation_0-aucpr:0.72310	validation_0-error:0.29044
[143]	validation_0-aucpr:0.72328	validation_0-error:0.29012
[144]	validation_0-aucpr:0.72334	validation_0-error:0.28997
[145]	validation_0-aucpr:0.72381	validation_0-error:0.29012
[146]	validation_0-aucpr:0.72429	validation_0-error:0.28969
[147]	validation_0-aucpr:0.72456	validation_0-error:0.28920
[148]	validation_0-aucpr:0.72474	validation_0-error:0.28899
[149]	validation_0-aucpr:0.72530	validation_0-error:0.28918
[150]	validation_0-aucpr:0.72548	validation_0-error:0.28888
[151]	validation_0-aucpr:0.72557	validation_0-error:0.28884
[152]	validation_0-aucpr:0.72574	validation_0-error:0.28860
[153]	validation_0-aucpr:0.72577	validation_0-error:0.28852
[154]	validation_0-aucpr:0.72623	validation_0-error:0.28833
[155]	validation_0-aucpr:0.72625	validat

[276]	validation_0-aucpr:0.74562	validation_0-error:0.27036
[277]	validation_0-aucpr:0.74598	validation_0-error:0.27017
[278]	validation_0-aucpr:0.74608	validation_0-error:0.27032
[279]	validation_0-aucpr:0.74614	validation_0-error:0.27032
[280]	validation_0-aucpr:0.74636	validation_0-error:0.27004
[281]	validation_0-aucpr:0.74669	validation_0-error:0.26917
[282]	validation_0-aucpr:0.74677	validation_0-error:0.26919
[283]	validation_0-aucpr:0.74682	validation_0-error:0.26915
[284]	validation_0-aucpr:0.74687	validation_0-error:0.26936
[285]	validation_0-aucpr:0.74694	validation_0-error:0.26936
[286]	validation_0-aucpr:0.74701	validation_0-error:0.26887
[287]	validation_0-aucpr:0.74712	validation_0-error:0.26883
[288]	validation_0-aucpr:0.74723	validation_0-error:0.26872
[289]	validation_0-aucpr:0.74742	validation_0-error:0.26851
[290]	validation_0-aucpr:0.74745	validation_0-error:0.26857
[291]	validation_0-aucpr:0.74755	validation_0-error:0.26853
[292]	validation_0-aucpr:0.74764	validat

[413]	validation_0-aucpr:0.76143	validation_0-error:0.25760
[414]	validation_0-aucpr:0.76153	validation_0-error:0.25780
[415]	validation_0-aucpr:0.76160	validation_0-error:0.25778
[416]	validation_0-aucpr:0.76165	validation_0-error:0.25769
[417]	validation_0-aucpr:0.76189	validation_0-error:0.25756
[418]	validation_0-aucpr:0.76189	validation_0-error:0.25744
[419]	validation_0-aucpr:0.76191	validation_0-error:0.25758
[420]	validation_0-aucpr:0.76201	validation_0-error:0.25735
[421]	validation_0-aucpr:0.76202	validation_0-error:0.25731
[422]	validation_0-aucpr:0.76229	validation_0-error:0.25709
[423]	validation_0-aucpr:0.76234	validation_0-error:0.25705
[424]	validation_0-aucpr:0.76238	validation_0-error:0.25692
[425]	validation_0-aucpr:0.76250	validation_0-error:0.25654
[426]	validation_0-aucpr:0.76292	validation_0-error:0.25616
[427]	validation_0-aucpr:0.76294	validation_0-error:0.25613
[428]	validation_0-aucpr:0.76306	validation_0-error:0.25605
[429]	validation_0-aucpr:0.76309	validat

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=3.65, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.246, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=6.219, scale_pos_weight=5, seed=42,
              subsample=0.9, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [145]:
#final mode-3
#fINALIZED Classifer based on hyperopt library
clf_xgb_3 = xgb.XGBClassifier(seed=42,
                            objective = 'binary:logistic',
                            gamma = 3.65,
                            reg_lambda = 6.219,
                            learning_rate = 0.246,
                            max_depth = 6,
                            scale_pos_weight =  3,
                            subsample = 0.9,
                            colsample_bytree = 0.5,
                            n_estimators=500,
                            use_label_encoder=False)
# # define the eval set and metric
eval_set = [(X_test, y_test)]
eval_metric = ["aucpr","error"]


clf_xgb_3.fit(X_encoded,
            y,
            verbose = True,
            early_stopping_rounds  =10, eval_set = eval_set, eval_metric = eval_metric)

[0]	validation_0-aucpr:0.51368	validation_0-error:0.30638
[1]	validation_0-aucpr:0.53315	validation_0-error:0.35506
[2]	validation_0-aucpr:0.60436	validation_0-error:0.28711
[3]	validation_0-aucpr:0.62168	validation_0-error:0.28735
[4]	validation_0-aucpr:0.62489	validation_0-error:0.28673
[5]	validation_0-aucpr:0.63712	validation_0-error:0.27519
[6]	validation_0-aucpr:0.64738	validation_0-error:0.26029
[7]	validation_0-aucpr:0.65717	validation_0-error:0.25831
[8]	validation_0-aucpr:0.65675	validation_0-error:0.25714
[9]	validation_0-aucpr:0.66026	validation_0-error:0.25707
[10]	validation_0-aucpr:0.66148	validation_0-error:0.25831
[11]	validation_0-aucpr:0.66438	validation_0-error:0.25675
[12]	validation_0-aucpr:0.66660	validation_0-error:0.25545
[13]	validation_0-aucpr:0.66764	validation_0-error:0.25445
[14]	validation_0-aucpr:0.67005	validation_0-error:0.25372
[15]	validation_0-aucpr:0.67114	validation_0-error:0.25278
[16]	validation_0-aucpr:0.67310	validation_0-error:0.25200
[17]	va

[139]	validation_0-aucpr:0.73032	validation_0-error:0.21989
[140]	validation_0-aucpr:0.73049	validation_0-error:0.21978
[141]	validation_0-aucpr:0.73073	validation_0-error:0.21987
[142]	validation_0-aucpr:0.73084	validation_0-error:0.21965
[143]	validation_0-aucpr:0.73101	validation_0-error:0.21967
[144]	validation_0-aucpr:0.73128	validation_0-error:0.21923
[145]	validation_0-aucpr:0.73141	validation_0-error:0.21895
[146]	validation_0-aucpr:0.73159	validation_0-error:0.21886
[147]	validation_0-aucpr:0.73203	validation_0-error:0.21854
[148]	validation_0-aucpr:0.73219	validation_0-error:0.21863
[149]	validation_0-aucpr:0.73221	validation_0-error:0.21867
[150]	validation_0-aucpr:0.73234	validation_0-error:0.21863
[151]	validation_0-aucpr:0.73241	validation_0-error:0.21848
[152]	validation_0-aucpr:0.73249	validation_0-error:0.21850
[153]	validation_0-aucpr:0.73254	validation_0-error:0.21869
[154]	validation_0-aucpr:0.73274	validation_0-error:0.21848
[155]	validation_0-aucpr:0.73303	validat

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=3.65, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.246, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=6.219, scale_pos_weight=3, seed=42,
              subsample=0.9, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

# Save the classifier

In [45]:
# number of trees
clf_xgb_2.best_ntree_limit


500

In [146]:
# saving
# save in JSON format
# the following file is the best optimized model
#clf_xgb.save_model("final_model_1.json")
#clf_xgb_2.save_model("final_model_3.json")
#clf_xgb_3.save_model("final_model_2.json")

# saving finalized model
#clf_xgb.save_model("finalized.xgb.json")

#conflict_clf_xgb.save_model("finalized.xgb.json")

