In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
clean = pd.read_csv("data/machinelearning.csv")

In [3]:
X = clean.drop("MinorityDriver", axis = 1)
Y = clean['MinorityDriver']

In [4]:
seed = 7
test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [5]:
rus = RandomUnderSampler(random_state=42, sampling_strategy = (2/3))
X_train_under40, y_train_under40 = rus.fit_sample(X_train, y_train)

In [7]:
xgbs = XGBClassifier(n_estimators=150,
                      max_depth = 8,
                      subsample = 0.95,
                      colsample_bytree = 0.55,
                      min_child_weight = 4,
                      eta = 0.2,
                      alpha = 0.01,
                      gamma = 2, 
                     objective= 'binary:logistic', n_jobs =  -1,
                     eval_metric = 'auc')
model = xgbs.fit(X_train_under40, y_train_under40)
rf_predictions = model.predict(X_test)
rf_probs = model.predict_proba(X_test)[:, 1]
roc_value = roc_auc_score(y_test, rf_probs)
print(roc_value)
print(confusion_matrix(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))

0.7073565030041082
[[1001194  230634]
 [ 190261  148983]]
              precision    recall  f1-score   support

           0       0.84      0.81      0.83   1231828
           1       0.39      0.44      0.41    339244

    accuracy                           0.73   1571072
   macro avg       0.62      0.63      0.62   1571072
weighted avg       0.74      0.73      0.74   1571072



In [8]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
average_precision = average_precision_score(y_test, predictions)

In [9]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

Accuracy: 73.21%
Average precision-recall score: 0.29


In [11]:
from sklearn import metrics
model = XGBClassifier(objective= 'binary:logistic',
                      n_estimators=100,
                      max_depth = 2,
                      scale_pos_weight = 5.5, 
                      subsample = 0.6,
                      colsample_bytree = 0.8,
                      min_child_weight = 5,
                      eta = 0.2,
                      alpha = 0.01,
                      gamma = 2, 
                      reg_lambda = 4.5)
cross_val_score(model, X, Y, cv=10, scoring = 'roc_auc')



array([0.67405925, 0.50567106, 0.36056227, 0.24198612, 0.36527855,
       0.29489776, 0.37007543, 0.5012928 , 0.46711213, 0.67395188])