In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Load the dataset
data = pd.read_csv('./data/augmented_data.csv')

# Separate the features and the target variable
X = data.drop('success', axis=1)
y = data['success']

In [2]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the number of folds for cross-validation
k = 5

# Create a k-fold cross-validation object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [3]:
# Create the XGBoost model
xgb_model = XGBClassifier(
                     learning_rate =0.01,
                     n_estimators=400,
                     max_depth=4,
                     min_child_weight=6,
                     gamma=0)

# Train the best model on the full training set
xgb_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
best_predictions = xgb_model.predict(X_test)

# Calculate evaluation metrics for the best model
best_report = classification_report(y_test, best_predictions, output_dict=True)
best_auc = roc_auc_score(y_test, best_predictions)

print("Classification Report:")
print(classification_report(y_test, best_predictions))
print("AUC-ROC Score:", best_auc)


Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.83       104
           1       0.85      0.72      0.78        96

    accuracy                           0.81       200
   macro avg       0.81      0.80      0.80       200
weighted avg       0.81      0.81      0.80       200

AUC-ROC Score: 0.8016826923076923


In [4]:
# Load the test dataset
new_data = pd.read_csv('./data/testing_encoded_data.csv')

# Separate the features
X_new = new_data.drop('success', axis=1)
y_true = new_data['success']

# Predict using the best XGBoost model
xgb_predictions = xgb_model.predict(X_new)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'XGB Predictions': xgb_predictions,
                               'Actual': y_true})

# Add a column to indicate incorrect predictions
predictions_df['XGB Correct'] = predictions_df['XGB Predictions'] == predictions_df['Actual']

# Calculate accuracy for XGB model
xgb_accuracy = predictions_df['XGB Correct'].mean()

# Print the comparison table and accuracy
print("Comparison Table:")
print(predictions_df)
print("\nAccuracy:")
print("XGB Accuracy:", xgb_accuracy)


Comparison Table:
     XGB Predictions  Actual  XGB Correct
0                  1       0        False
1                  0       0         True
2                  1       0        False
3                  1       1         True
4                  0       0         True
..               ...     ...          ...
121                1       0        False
122                1       0        False
123                0       0         True
124                0       0         True
125                1       0        False

[126 rows x 3 columns]

Accuracy:
XGB Accuracy: 0.5634920634920635
