In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from xgboost import plot_importance

from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import pickle
pd.set_option("display.max_columns", None)


In [None]:
# Read the raw dataset into memory.
# NOTE(review): ".csv" looks like a placeholder file name — confirm the real path.
df = pd.read_csv(filepath_or_buffer=".csv")
# Preview the first five rows (five is also the default for head()).
df.head(n=5)

In [None]:
# Build a working copy without the two unused columns; `df` itself is not modified
# because drop() returns a new frame.
df_v1 = df.drop(columns=["UselessColumn1", "UselessColumn2"])
df_v1.head(n=5)

In [None]:
# Separate the label column from the predictors.
y = df_v1["target"]
# drop() already returns a new frame, so df_v1 is left untouched.
X = df_v1.drop(columns="target")

# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)


In [None]:
# Hyperparameter grid searched exhaustively by GridSearchCV:
# 5 * 5 * 4 * 3 = 300 candidate settings, each evaluated with 5-fold CV
# (1,500 fits, plus one final refit of the best candidate).
cv_params = {
    "max_depth": [4, 5, 6, 7, 8],
    "min_child_weight": [1, 2, 3, 4, 5],
    "learning_rate": [0.05, 0.1, 0.2, 0.3],
    "n_estimators": [75, 100, 125],
}

xgb = XGBClassifier(objective="binary:logistic", random_state=0)

# Use a list rather than a set: GridSearchCV documents str / callable / list /
# tuple / dict for `scoring`, and recent scikit-learn releases reject a set during
# parameter validation. A list also gives the metrics a stable order.
scoring = ["accuracy", "precision", "recall", "f1"]

# With several metrics, `refit` must name the one used to pick and refit the
# final model; here the candidate with the best mean F1 is refit.
xgb_cv = GridSearchCV(xgb, cv_params, scoring=scoring, cv=5, refit="f1")



In [None]:
%%time
xgb_cv.fit(X_train, y_train)

In [None]:
## Pickle & save
# Persist the fitted grid search so the expensive training cell does not need to be re-run.
# `pickle` is already imported in the notebook's import cell, so it is not imported again here.
path = "../Proyectos DS/"
with open(path + "xgb_cv_model_p.pickle", "wb") as to_write:
    pickle.dump(xgb_cv, to_write)

In [None]:
## Pickle load
# Restore the previously saved grid search instead of re-training it.
# The location is declared here as well so this cell runs on a fresh kernel even
# when the training and save cells are skipped (it used to rely on `path` being
# left over from the save cell).
path = "../Proyectos DS/"
# SECURITY: pickle.load can execute arbitrary code embedded in the file — only
# load pickles that you created yourself or otherwise trust.
with open(path + "xgb_cv_model_p.pickle", "rb") as to_read:
    xgb_cv = pickle.load(to_read)

In [None]:
# Empty frame holding the column layout used by the per-model results tables.
results = pd.DataFrame(columns=["Model", "F1", "Recall", "Precision", "Accuracy"])


def make_results(model_name, model_object):
    """Summarise the best cross-validated scores of a fitted search object.

    Parameters
    ----------
    model_name : str
        Label written to the "Model" column of the returned row.
    model_object : object
        Fitted search object exposing ``cv_results_`` with the entries
        ``mean_test_f1``, ``mean_test_recall``, ``mean_test_precision`` and
        ``mean_test_accuracy``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns Model, F1, Recall, Precision and
        Accuracy, taken from the candidate with the highest mean F1.
    """
    cv_results = pd.DataFrame(model_object.cv_results_)

    # idxmax() returns an index *label*, so the row is selected with .loc
    # rather than the positional .iloc.
    best = cv_results.loc[cv_results["mean_test_f1"].idxmax()]

    # Build the row directly: DataFrame.append no longer exists in pandas 2.x.
    # Recall is included so the row matches the `results` column layout.
    return pd.DataFrame(
        [
            {
                "Model": model_name,
                "F1": best["mean_test_f1"],
                "Recall": best["mean_test_recall"],
                "Precision": best["mean_test_precision"],
                "Accuracy": best["mean_test_accuracy"],
            }
        ]
    )

In [None]:
# Collect the cross-validation scores of the best XGBoost candidate into a one-row table.
xgb_cv_results = make_results(model_name="XGBoost CV", model_object=xgb_cv)

In [None]:
# Predict on the held-out test split, then report each metric in turn.
xgb_cv_preds = xgb_cv.predict(X_test)

test_metrics = (
    ("F1 score for test data:", f1_score),
    ("Recall score for test data:", recall_score),
    ("Precision score for test data:", precision_score),
    ("Accuracy score for test data:", accuracy_score),
)
for label, metric in test_metrics:
    print(label, metric(y_test, xgb_cv_preds))

In [None]:
# Reusable helper so the confusion-matrix plot does not have to be rebuilt by hand each time.
def conf_matrix_plot(model, x_data, y_data):
    """Plot the confusion matrix of `model`'s predictions on the given data.

    Parameters
    ----------
    model : object
        Fitted estimator providing ``predict`` and ``classes_``.
    x_data : array-like
        Features passed to ``model.predict``.
    y_data : array-like
        True labels compared against the predictions.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    predictions = model.predict(x_data)
    class_labels = model.classes_
    # Passing the labels fixes the row/column order to the model's own class order.
    matrix = confusion_matrix(y_data, predictions, labels=class_labels)
    ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=class_labels,
    ).plot()
    plt.show()

In [None]:
# Confusion matrix of the tuned model on the held-out test split.
conf_matrix_plot(model=xgb_cv, x_data=X_test, y_data=y_test)
# Feature importances of the refit best estimator; the trailing ';' hides the Axes repr.
plot_importance(booster=xgb_cv.best_estimator_);