In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

# Load the dataset
data = pd.read_csv('./data/augmented_data.csv')

# Separate the features and the target variable
X = data.drop('success', axis=1)
y = data['success']

In [2]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the number of folds for cross-validation
k = 5

In [3]:
# Calculate the correlation between features and the target variable
correlations = X_train.corrwith(y_train)

# Set a threshold for considering high correlation
threshold = 0.1

# Find the highly correlated features
highly_correlated_features = correlations[correlations.abs() >= threshold]

# Print the highly correlated features
print("Highly Correlated Features:")
print(highly_correlated_features)

Highly Correlated Features:
cet_oapr           0.162478
english            0.110551
abstract           0.128883
shs_gpa            0.174932
p2                 0.206944
p5                 0.102112
p9                 0.161773
p14                0.153593
p21                0.104269
p25                0.185681
p26                0.169650
p29                0.119224
p30                0.154880
p33                0.125639
p39                0.115047
s2                -0.103951
s9                 0.136165
s11                0.145717
s18                0.100273
bscs               0.127453
bsit              -0.112728
male               0.104429
class_rank_5       0.137379
class_rank_none   -0.136054
awards_none       -0.112535
income_20k         0.102095
income_10k         0.139542
income_na         -0.135231
dtype: float64


### Classical Model

In [4]:
# Create a k-fold cross-validation object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')
svm_cv_scores = cross_val_score(svm_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
svm_accuracy = svm_cv_scores.mean()

# Decision Tree (DT) model
dt_model = DecisionTreeClassifier()
dt_cv_scores = cross_val_score(dt_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
dt_accuracy = dt_cv_scores.mean()

# Neural Network (NN) model
nn_model = MLPClassifier()
nn_cv_scores = cross_val_score(nn_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
nn_accuracy = nn_cv_scores.mean()

# Train the models on the full training set
svm_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
nn_model.fit(X_train, y_train)

# Make predictions on the test set
svm_predictions = svm_model.predict(X_test)
dt_predictions = dt_model.predict(X_test)
nn_predictions = nn_model.predict(X_test)



In [5]:
# Calculate evaluation metrics
svm_report = classification_report(y_test, svm_predictions, output_dict=True)
svm_auc = roc_auc_score(y_test, svm_predictions)

dt_report = classification_report(y_test, dt_predictions, output_dict=True)
dt_auc = roc_auc_score(y_test, dt_predictions)

nn_report = classification_report(y_test, nn_predictions, output_dict=True)
nn_auc = roc_auc_score(y_test, nn_predictions)

# Calculate confusion matrix for each model
svm_confusion = confusion_matrix(y_test, svm_predictions)
dt_confusion = confusion_matrix(y_test, dt_predictions)
nn_confusion = confusion_matrix(y_test, nn_predictions)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'Model': ['SVM', 'Decision Tree', 'Neural Network'],
    'CV Accuracy': [svm_accuracy, dt_accuracy, nn_accuracy],
    'Accuracy': [svm_model.score(X_test, y_test), dt_model.score(X_test, y_test), nn_model.score(X_test, y_test)],
    'F1-Score': [svm_report['macro avg']['f1-score'], dt_report['macro avg']['f1-score'], nn_report['macro avg']['f1-score']],
    'Precision': [svm_report['macro avg']['precision'], dt_report['macro avg']['precision'], nn_report['macro avg']['precision']],
    'Recall': [svm_report['macro avg']['recall'], dt_report['macro avg']['recall'], nn_report['macro avg']['recall']],
    'AUC': [svm_auc, dt_auc, nn_auc],
    'Confusion Matrix': [svm_confusion, dt_confusion, nn_confusion]
})

# Print the results
print("Evaluation Metrics:")
print(results_df)

Evaluation Metrics:
            Model  CV Accuracy  Accuracy  F1-Score  Precision    Recall  \
0             SVM       0.7225     0.730  0.725359   0.730260  0.724235   
1   Decision Tree       0.7425     0.740  0.736815   0.739086  0.735910   
2  Neural Network       0.8375     0.885  0.882530   0.893974  0.879026   

        AUC      Confusion Matrix  
0  0.724235  [[86, 22], [32, 60]]  
1  0.735910  [[85, 23], [29, 63]]  
2  0.879026  [[103, 5], [18, 74]]  


In [6]:
# Obtain feature importances for linear SVM model
svm_feature_importances = abs(svm_model.coef_[0])

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'SVM Importance': svm_feature_importances
})

# Sort the DataFrame by SVM Importance values in descending order
importances_df = importances_df.sort_values(by='SVM Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (SVM Linear):")
print(top_10_importances)

Top 10 Feature Importances (SVM Linear):
                Feature  SVM Importance
113    school_volunteer        1.377329
73                 bscs        1.344929
97        class_auditor        1.204813
102    school_president        1.178214
110          school_pio        1.121102
142   internet_pre_paid        1.111975
117    father_no_school        1.109185
141  internet_post_paid        0.921944
118           father_na        0.894519
144           s25_tutor        0.872724


In [7]:
# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': dt_model.feature_importances_
})

# Sort the DataFrame by importance values in descending order
importances_df = importances_df.sort_values(by='Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("DT Top 10 Feature Importances:")
print(top_10_importances)

DT Top 10 Feature Importances:
         Feature  Importance
2        reading    0.054963
6        shs_gpa    0.054595
26           p20    0.053145
8             p2    0.052679
5       abstract    0.035324
30           p24    0.034538
42           p36    0.034168
145  s25_friends    0.031799
63           s16    0.030139
62           s15    0.028728


In [8]:
# Obtain feature importances for Neural Network model
nn_feature_importances = abs(nn_model.coefs_[0].mean(axis=0))

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns[:len(nn_feature_importances)],
    'NN Importance': nn_feature_importances
})

# Sort the DataFrame by NN Importance values in descending order
importances_df = importances_df.sort_values(by='NN Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (Neural Network):")
print(top_10_importances)

Top 10 Feature Importances (Neural Network):
   Feature  NN Importance
42     p36       0.031718
19     p13       0.029813
45     p39       0.027653
20     p14       0.023295
21     p15       0.022406
25     p19       0.021561
63     s16       0.019936
71     s24       0.017576
69     s22       0.017562
26     p20       0.017029


### Ensemble Model

In [9]:
# Create the ensemble model using Voting Classifier
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('dt', dt_model), ('nn', nn_model)],
    voting='hard'
)

# Perform k-fold cross-validation on the ensemble model
ensemble_cv_scores = cross_val_score(ensemble_model, X_train, y_train, cv=kf)
ensemble_accuracy = ensemble_cv_scores.mean()

# Train the ensemble model on the full training set
ensemble_model.fit(X_train, y_train)

# Make predictions on the test set
ensemble_predictions = ensemble_model.predict(X_test)

# Calculate evaluation metrics for the ensemble model
ensemble_report = classification_report(y_test, ensemble_predictions, output_dict=True)
ensemble_auc = roc_auc_score(y_test, ensemble_predictions)



In [10]:
# Create the ensemble model using Bagging Classifier
bagging_model = BaggingClassifier(base_estimator=SVC(kernel='linear'), n_estimators=10)
bagging_cv_scores = cross_val_score(bagging_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
bagging_accuracy = bagging_cv_scores.mean()

# Train the model on the full training set
bagging_model.fit(X_train, y_train)

# Make predictions on the test set
bagging_predictions = bagging_model.predict(X_test)

# Calculate evaluation metrics
bagging_report = classification_report(y_test, bagging_predictions, output_dict=True)
bagging_auc = roc_auc_score(y_test, bagging_predictions)



In [11]:
# Create the ensemble model using Stacking Classifier
stacking_model = StackingClassifier(
    estimators=[('SVM', svm_model), ('Decision Tree', dt_model), ('Neural Network', nn_model)],
    final_estimator=SVC(kernel='linear')
)

stacking_cv_scores = cross_val_score(stacking_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
stacking_accuracy = stacking_cv_scores.mean()

# Train the model on the full training set
stacking_model.fit(X_train, y_train)

# Make predictions on the test set
stacking_predictions = stacking_model.predict(X_test)

# Calculate evaluation metrics
stacking_report = classification_report(y_test, stacking_predictions, output_dict=True)
stacking_auc = roc_auc_score(y_test, stacking_predictions)



In [12]:
# Create the XGBoost model
xgb_model = XGBClassifier()

# Perform k-fold cross-validation
xgb_cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=kf)
xgb_accuracy = xgb_cv_scores.mean()

# Train the model on the full training set
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
xgb_predictions = xgb_model.predict(X_test)

# Calculate evaluation metrics
xgb_report = classification_report(y_test, xgb_predictions, output_dict=True)
xgb_auc = roc_auc_score(y_test, xgb_predictions)

In [13]:
# Calculate confusion matrix for each model
ensemble_confusion = confusion_matrix(y_test, ensemble_predictions)
bagging_confusion = confusion_matrix(y_test, bagging_predictions)
stacking_confusion = confusion_matrix(y_test, stacking_predictions)
xgb_confusion = confusion_matrix(y_test, xgb_predictions)

# Create a DataFrame for the ensemble models
ensemble_results = pd.DataFrame({
    'Model': ['Voting', 'Bagging', 'Stacking', 'XGB'],
    'CV Accuracy': [ensemble_accuracy, bagging_accuracy, stacking_accuracy, xgb_accuracy],
    'Accuracy': [ensemble_model.score(X_test, y_test), bagging_model.score(X_test, y_test), stacking_model.score(X_test, y_test), xgb_model.score(X_test, y_test)],
    'F1-Score': [ensemble_report['macro avg']['f1-score'], bagging_report['macro avg']['f1-score'], stacking_report['macro avg']['f1-score'], xgb_report['macro avg']['f1-score']],
    'Precision': [ensemble_report['macro avg']['precision'], bagging_report['macro avg']['precision'], stacking_report['macro avg']['precision'], xgb_report['macro avg']['precision']],
    'Recall': [ensemble_report['macro avg']['recall'], bagging_report['macro avg']['recall'], stacking_report['macro avg']['recall'], xgb_report['macro avg']['recall']],
    'AUC': [ensemble_auc, bagging_auc, stacking_auc, xgb_auc],
    'Confusion Matrix': [ensemble_confusion, bagging_confusion, stacking_confusion, xgb_confusion]
})

# Print the results of the ensemble models
print("Ensemble Model Results:")
print(ensemble_results)

Ensemble Model Results:
      Model  CV Accuracy  Accuracy  F1-Score  Precision    Recall       AUC  \
0    Voting      0.81625     0.845  0.842923   0.846803  0.841184  0.841184   
1   Bagging      0.73875     0.750  0.742560   0.756971  0.741143  0.741143   
2  Stacking      0.83250     0.890  0.888889   0.890828  0.887681  0.887681   
3       XGB      0.83750     0.880  0.878247   0.883415  0.876006  0.876006   

       Confusion Matrix  
0  [[96, 12], [19, 73]]  
1  [[92, 16], [34, 58]]  
2   [[99, 9], [13, 79]]  
3  [[100, 8], [16, 76]]  


### Batch Prediction

In [14]:
# Load the test dataset
new_data = pd.read_csv('./data/testing_encoded_data.csv')

# Separate the features
X_new = new_data.drop('success', axis=1)
y_true = new_data['success']

# Predict using the SVM model
svm_predictions = svm_model.predict(X_new)

# Predict using the Decision Tree model
dt_predictions = dt_model.predict(X_new)

# Predict using the Neural Network model
nn_predictions = nn_model.predict(X_new)

# Predict using the Ensemble(Voting)
ensemble_predictions = ensemble_model.predict(X_new)

# Predict using the Ensemble(Bagging)
bagging_predictions = bagging_model.predict(X_new)

# Predict using the Ensemble(Stacking)
stacking_predictions = stacking_model.predict(X_new)

# Predict using the Ensemble(XGB)
xgb_predictions = stacking_model.predict(X_new)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'SVM Predictions': svm_predictions,
                               'DT Predictions': dt_predictions,
                               'NN Predictions': nn_predictions,
                               'Voting Predictions': ensemble_predictions,
                               'Bagging Predictions': bagging_predictions,
                               'Stacking Predictions': stacking_predictions,
                               'XGB Predictions': xgb_predictions,
                               'Actual': y_true})

# Add a column to indicate incorrect predictions
predictions_df['SVM Correct'] = predictions_df['SVM Predictions'] == predictions_df['Actual']
predictions_df['DT Correct'] = predictions_df['DT Predictions'] == predictions_df['Actual']
predictions_df['NN Correct'] = predictions_df['NN Predictions'] == predictions_df['Actual']
predictions_df['Voting Correct'] = predictions_df['Voting Predictions'] == predictions_df['Actual']
predictions_df['Bagging Correct'] = predictions_df['Bagging Predictions'] == predictions_df['Actual']
predictions_df['Stacking Correct'] = predictions_df['Stacking Predictions'] == predictions_df['Actual']
predictions_df['XGB Correct'] = predictions_df['XGB Predictions'] == predictions_df['Actual']

# Calculate accuracy for each model
svm_accuracy = predictions_df['SVM Correct'].mean()
dt_accuracy = predictions_df['DT Correct'].mean()
nn_accuracy = predictions_df['NN Correct'].mean()
ensemble_accuracy = predictions_df['Voting Correct'].mean()
bagging_accuracy = predictions_df['Bagging Correct'].mean()
stacking_accuracy = predictions_df['Stacking Correct'].mean()
xgb_accuracy = predictions_df['XGB Correct'].mean()

# Print the comparison table and accuracy
print("Comparison Table:")
print(predictions_df)
print("\nAccuracy:")
print("SVM Accuracy:", svm_accuracy)
print("DT Accuracy:", dt_accuracy)
print("NN Accuracy:", nn_accuracy)
print("Voting Accuracy:", ensemble_accuracy)
print("Bagging Accuracy:", bagging_accuracy)
print("Stacking Accuracy:", stacking_accuracy)
print("XGB Accuracy:", xgb_accuracy)

Comparison Table:
    SVM Predictions  DT Predictions  NN Predictions  Voting Predictions  \
0                 0               1               0                   0   
1                 0               0               0                   0   
2                 1               1               0                   1   
3                 0               1               0                   0   
4                 1               1               1                   1   
5                 1               1               1                   1   
6                 1               0               0                   0   
7                 0               0               0                   0   
8                 0               0               0                   0   
9                 0               1               1                   0   
10                0               0               0                   0   
11                1               0               0                   0   
12     