In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix

# Load the dataset
data = pd.read_csv('./data/augmented_data.csv')

# Separate the features and the target variable
X = data.drop('success', axis=1)
y = data['success']

In [2]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the number of folds for cross-validation
k = 5

In [3]:
# Calculate the correlation between features and the target variable
correlations = X_train.corrwith(y_train)

# Set a threshold for considering high correlation
threshold = 0.1

# Find the highly correlated features
highly_correlated_features = correlations[correlations.abs() >= threshold]

# Print the highly correlated features
print("Highly Correlated Features:")
print(highly_correlated_features)

Highly Correlated Features:
abstract                  0.100910
shs_gpa                   0.183404
p2                        0.171578
p21                       0.109439
p25                       0.111669
p30                       0.129419
p33                       0.120059
s9                        0.114212
s19                       0.119713
s20                      -0.124845
bscs                      0.130715
bsit                     -0.121787
class_rank_5              0.110160
class_rank_none          -0.122872
high_honor                0.109804
awards_none              -0.112335
class_president           0.116537
class_none               -0.133929
school_auditor            0.113201
school_grade_level_rep    0.115394
school_none              -0.105792
income_10k                0.139563
dtype: float64


### Classical Model

In [4]:
# Create a k-fold cross-validation object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')
svm_cv_scores = cross_val_score(svm_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
svm_accuracy = svm_cv_scores.mean()

# Decision Tree (DT) model
dt_model = DecisionTreeClassifier()
dt_cv_scores = cross_val_score(dt_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
dt_accuracy = dt_cv_scores.mean()

# Neural Network (NN) model
nn_model = MLPClassifier()
nn_cv_scores = cross_val_score(nn_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
nn_accuracy = nn_cv_scores.mean()

# Train the models on the full training set
svm_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
nn_model.fit(X_train, y_train)

# Make predictions on the test set
svm_predictions = svm_model.predict(X_test)
dt_predictions = dt_model.predict(X_test)
nn_predictions = nn_model.predict(X_test)



In [5]:
# Calculate evaluation metrics
svm_report = classification_report(y_test, svm_predictions, output_dict=True)
svm_auc = roc_auc_score(y_test, svm_predictions)

dt_report = classification_report(y_test, dt_predictions, output_dict=True)
dt_auc = roc_auc_score(y_test, dt_predictions)

nn_report = classification_report(y_test, nn_predictions, output_dict=True)
nn_auc = roc_auc_score(y_test, nn_predictions)

# Calculate confusion matrix for each model
svm_confusion = confusion_matrix(y_test, svm_predictions)
dt_confusion = confusion_matrix(y_test, dt_predictions)
nn_confusion = confusion_matrix(y_test, nn_predictions)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'Model': ['SVM', 'Decision Tree', 'Neural Network'],
    'CV Accuracy': [svm_accuracy, dt_accuracy, nn_accuracy],
    'Accuracy': [svm_model.score(X_test, y_test), dt_model.score(X_test, y_test), nn_model.score(X_test, y_test)],
    'F1-Score': [svm_report['macro avg']['f1-score'], dt_report['macro avg']['f1-score'], nn_report['macro avg']['f1-score']],
    'Precision': [svm_report['macro avg']['precision'], dt_report['macro avg']['precision'], nn_report['macro avg']['precision']],
    'Recall': [svm_report['macro avg']['recall'], dt_report['macro avg']['recall'], nn_report['macro avg']['recall']],
    'AUC': [svm_auc, dt_auc, nn_auc],
    'Confusion Matrix': [svm_confusion, dt_confusion, nn_confusion]
})

# Print the results
print("Evaluation Metrics:")
print(results_df)

Evaluation Metrics:
            Model  CV Accuracy  Accuracy  F1-Score  Precision    Recall  \
0             SVM      0.73625     0.780  0.779205   0.780193  0.778846   
1   Decision Tree      0.74250     0.825  0.824785   0.824692  0.824920   
2  Neural Network      0.86625     0.930  0.929825   0.930349  0.929487   

        AUC      Confusion Matrix  
0  0.778846  [[84, 20], [24, 72]]  
1  0.824920  [[86, 18], [17, 79]]  
2  0.929487    [[98, 6], [8, 88]]  


In [6]:
# Obtain feature importances for linear SVM model
svm_feature_importances = abs(svm_model.coef_[0])

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'SVM Importance': svm_feature_importances
})

# Sort the DataFrame by SVM Importance values in descending order
importances_df = importances_df.sort_values(by='SVM Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (SVM Linear):")
print(top_10_importances)

Top 10 Feature Importances (SVM Linear):
                   Feature  SVM Importance
89           highest_honor        1.821289
98   class_project_manager        1.212668
108         school_auditor        1.130361
92             awards_none        1.114917
73                    bscs        1.062360
101             class_none        1.029394
127              mother_na        0.978955
81                      he        0.853504
65                     s18        0.839842
103  school_vice_president        0.811215


In [7]:
# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': dt_model.feature_importances_
})

# Sort the DataFrame by importance values in descending order
importances_df = importances_df.sort_values(by='Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("DT Top 10 Feature Importances:")
print(top_10_importances)

DT Top 10 Feature Importances:
          Feature  Importance
6         shs_gpa    0.074407
4    quantitative    0.070912
18            p12    0.058292
66            s19    0.054742
21            p15    0.049071
3         science    0.036465
72         stotal    0.031699
136    income_10k    0.031658
67            s20    0.031607
40            p34    0.028183


In [8]:
# Obtain feature importances for Neural Network model
nn_feature_importances = abs(nn_model.coefs_[0].mean(axis=0))

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns[:len(nn_feature_importances)],
    'NN Importance': nn_feature_importances
})

# Sort the DataFrame by NN Importance values in descending order
importances_df = importances_df.sort_values(by='NN Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (Neural Network):")
print(top_10_importances)

Top 10 Feature Importances (Neural Network):
          Feature  NN Importance
79          humss       0.025269
61            s14       0.021012
29            p23       0.020336
89  highest_honor       0.019815
39            p33       0.018705
5        abstract       0.016697
68            s21       0.016521
84         sports       0.015562
11             p5       0.015525
42            p36       0.014538


### Ensemble Model

In [9]:
# Create the ensemble model using Voting Classifier
ensemble_model = VotingClassifier(
    estimators=[('svm', svm_model), ('dt', dt_model), ('nn', nn_model)],
    voting='hard'
)

# Perform k-fold cross-validation on the ensemble model
ensemble_cv_scores = cross_val_score(ensemble_model, X_train, y_train, cv=kf)
ensemble_accuracy = ensemble_cv_scores.mean()

# Train the ensemble model on the full training set
ensemble_model.fit(X_train, y_train)

# Make predictions on the test set
ensemble_predictions = ensemble_model.predict(X_test)

# Calculate evaluation metrics for the ensemble model
ensemble_report = classification_report(y_test, ensemble_predictions, output_dict=True)
ensemble_auc = roc_auc_score(y_test, ensemble_predictions)



In [10]:
# Create the ensemble model using Bagging Classifier
bagging_model = BaggingClassifier(base_estimator=SVC(kernel='linear'), n_estimators=10)
bagging_cv_scores = cross_val_score(bagging_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
bagging_accuracy = bagging_cv_scores.mean()

# Train the model on the full training set
bagging_model.fit(X_train, y_train)

# Make predictions on the test set
bagging_predictions = bagging_model.predict(X_test)

# Calculate evaluation metrics
bagging_report = classification_report(y_test, bagging_predictions, output_dict=True)
bagging_auc = roc_auc_score(y_test, bagging_predictions)



In [11]:
# Create the ensemble model using Stacking Classifier
stacking_model = StackingClassifier(
    estimators=[('SVM', svm_model), ('Decision Tree', dt_model), ('Neural Network', nn_model)],
    final_estimator=SVC(kernel='linear')
)

stacking_cv_scores = cross_val_score(stacking_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
stacking_accuracy = stacking_cv_scores.mean()

# Train the model on the full training set
stacking_model.fit(X_train, y_train)

# Make predictions on the test set
stacking_predictions = stacking_model.predict(X_test)

# Calculate evaluation metrics
stacking_report = classification_report(y_test, stacking_predictions, output_dict=True)
stacking_auc = roc_auc_score(y_test, stacking_predictions)



In [12]:
# Calculate confusion matrix for each model
ensemble_confusion = confusion_matrix(y_test, ensemble_predictions)
bagging_confusion = confusion_matrix(y_test, bagging_predictions)
stacking_confusion = confusion_matrix(y_test, stacking_predictions)

# Create a DataFrame for the ensemble models
ensemble_results = pd.DataFrame({
    'Model': ['Ensemble', 'Bagging', 'Stacking'],
    'CV Accuracy': [ensemble_accuracy, bagging_accuracy, stacking_accuracy],
    'Accuracy': [ensemble_model.score(X_test, y_test), bagging_model.score(X_test, y_test), stacking_model.score(X_test, y_test)],
    'F1-Score': [ensemble_report['macro avg']['f1-score'], bagging_report['macro avg']['f1-score'], stacking_report['macro avg']['f1-score']],
    'Precision': [ensemble_report['macro avg']['precision'], bagging_report['macro avg']['precision'], stacking_report['macro avg']['precision']],
    'Recall': [ensemble_report['macro avg']['recall'], bagging_report['macro avg']['recall'], stacking_report['macro avg']['recall']],
    'AUC': [ensemble_auc, bagging_auc, stacking_auc],
    'Confusion Matrix': [ensemble_confusion, bagging_confusion, stacking_confusion]
})

# Print the results of the ensemble models
print("Ensemble Model Results:")
print(ensemble_results)

Ensemble Model Results:
      Model  CV Accuracy  Accuracy  F1-Score  Precision    Recall       AUC  \
0  Ensemble       0.8425     0.875  0.874293   0.877366  0.873397  0.873397   
1   Bagging       0.7525     0.790  0.789474   0.789843  0.789263  0.789263   
2  Stacking       0.8575     0.905  0.904807   0.905013  0.904647  0.904647   

       Confusion Matrix  
0   [[95, 9], [16, 80]]  
1  [[84, 20], [22, 74]]  
2   [[95, 9], [10, 86]]  


### Batch Prediction

In [13]:
# Load the test dataset
new_data = pd.read_csv('./data/testing_encoded_data.csv')

# Separate the features
X_new = new_data.drop('success', axis=1)
y_true = new_data['success']

# Predict using the SVM model
svm_predictions = svm_model.predict(X_new)

# Predict using the Decision Tree model
dt_predictions = dt_model.predict(X_new)

# Predict using the Neural Network model
nn_predictions = nn_model.predict(X_new)

# Predict using the Ensemble(Voting)
ensemble_predictions = ensemble_model.predict(X_new)

# Predict using the Ensemble(Bagging)
bagging_predictions = bagging_model.predict(X_new)

# Predict using the Ensemble(Stacking)
stacking_predictions = stacking_model.predict(X_new)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'SVM Predictions': svm_predictions,
                               'DT Predictions': dt_predictions,
                               'NN Predictions': nn_predictions,
                               'Voting Predictions': ensemble_predictions,
                               'Bagging Predictions': bagging_predictions,
                               'Stacking Predictions': stacking_predictions,
                               'Actual': y_true})

# Add a column to indicate incorrect predictions
predictions_df['SVM Correct'] = predictions_df['SVM Predictions'] == predictions_df['Actual']
predictions_df['DT Correct'] = predictions_df['DT Predictions'] == predictions_df['Actual']
predictions_df['NN Correct'] = predictions_df['NN Predictions'] == predictions_df['Actual']
predictions_df['Voting Correct'] = predictions_df['Voting Predictions'] == predictions_df['Actual']
predictions_df['Bagging Correct'] = predictions_df['Bagging Predictions'] == predictions_df['Actual']
predictions_df['Stacking Correct'] = predictions_df['Stacking Predictions'] == predictions_df['Actual']

# Calculate accuracy for each model
svm_accuracy = predictions_df['SVM Correct'].mean()
dt_accuracy = predictions_df['DT Correct'].mean()
nn_accuracy = predictions_df['NN Correct'].mean()
ensemble_accuracy = predictions_df['Voting Correct'].mean()
bagging_accuracy = predictions_df['Bagging Correct'].mean()
stacking_accuracy = predictions_df['Stacking Correct'].mean()

# Print the comparison table and accuracy
print("Comparison Table:")
print(predictions_df)
print("\nAccuracy:")
print("SVM Accuracy:", svm_accuracy)
print("DT Accuracy:", dt_accuracy)
print("NN Accuracy:", nn_accuracy)
print("Voting Accuracy:", ensemble_accuracy)
print("Bagging Accuracy:", bagging_accuracy)
print("Stacking Accuracy:", stacking_accuracy)

Comparison Table:
     SVM Predictions  DT Predictions  NN Predictions  Voting Predictions  \
0                  1               1               0                   1   
1                  1               0               0                   0   
2                  0               1               0                   0   
3                  0               0               1                   0   
4                  0               1               1                   0   
..               ...             ...             ...                 ...   
121                0               1               1                   0   
122                1               1               1                   1   
123                0               1               0                   0   
124                1               0               1                   1   
125                0               0               1                   0   

     Bagging Predictions  Stacking Predictions  Actual  SVM Correct  