In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, KFold

# Load the dataset
data = pd.read_csv('./data/augmented_data.csv')

# Separate the features and the target variable
X = data.drop('success', axis=1)
y = data['success']

In [2]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the number of folds for cross-validation
k = 5

# Create a k-fold cross-validation object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')
svm_cv_scores = cross_val_score(svm_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
svm_accuracy = svm_cv_scores.mean()

# Decision Tree (DT) model
dt_model = DecisionTreeClassifier()
dt_cv_scores = cross_val_score(dt_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
dt_accuracy = dt_cv_scores.mean()

# Neural Network (NN) model
nn_model = MLPClassifier()
nn_cv_scores = cross_val_score(nn_model, X_train, y_train, cv=kf)  # Perform k-fold cross-validation
nn_accuracy = nn_cv_scores.mean()

# Train the models on the full training set
svm_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
nn_model.fit(X_train, y_train)

# Make predictions on the test set
svm_predictions = svm_model.predict(X_test)
dt_predictions = dt_model.predict(X_test)
nn_predictions = nn_model.predict(X_test)

# Calculate evaluation metrics
svm_report = classification_report(y_test, svm_predictions, output_dict=True)
svm_auc = roc_auc_score(y_test, svm_predictions)

dt_report = classification_report(y_test, dt_predictions, output_dict=True)
dt_auc = roc_auc_score(y_test, dt_predictions)

nn_report = classification_report(y_test, nn_predictions, output_dict=True)
nn_auc = roc_auc_score(y_test, nn_predictions)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'Model': ['SVM', 'Decision Tree', 'Neural Network'],
    'CV Accuracy': [svm_accuracy, dt_accuracy, nn_accuracy],
    'Accuracy': [svm_model.score(X_test, y_test), dt_model.score(X_test, y_test), nn_model.score(X_test, y_test)],
    'F1-Score': [svm_report['macro avg']['f1-score'], dt_report['macro avg']['f1-score'], nn_report['macro avg']['f1-score']],
    'Precision': [svm_report['macro avg']['precision'], dt_report['macro avg']['precision'], nn_report['macro avg']['precision']],
    'Recall': [svm_report['macro avg']['recall'], dt_report['macro avg']['recall'], nn_report['macro avg']['recall']],
    'AUC': [svm_auc, dt_auc, nn_auc]
})

# Print the results
print("Evaluation Metrics:")
print(results_df)




Evaluation Metrics:
            Model  CV Accuracy  Accuracy  F1-Score  Precision    Recall  \
0             SVM     0.742385  0.818182  0.817717   0.818182  0.821462   
1   Decision Tree     0.746167  0.787879  0.787792   0.792012  0.793939   
2  Neural Network     0.881267  0.934343  0.933964   0.932897  0.936244   

        AUC  
0  0.821462  
1  0.793939  
2  0.936244  




In [3]:
# Obtain feature importances for linear SVM model
svm_feature_importances = abs(svm_model.coef_[0])

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'SVM Importance': svm_feature_importances
})

# Sort the DataFrame by SVM Importance values in descending order
importances_df = importances_df.sort_values(by='SVM Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (SVM Linear):")
print(top_10_importances)

Top 10 Feature Importances (SVM Linear):
                                               Feature  SVM Importance
92   classroom_org_I forgot already, might be just ...        1.676340
166                                       s25_Leave it        1.427534
72                   program_BS Information Technology        1.397573
151                         mothers_employment_Retired        1.340495
108                         school_org_Project Manager        1.312652
157                         income_P50,000 to P100,000        1.267727
102                                 school_org_Auditor        1.239745
145      mothers_employment_Disabled, not able to work        1.233484
139  mothers_education_Not Applicable (no work, not...        1.231842
174                          s26_More than 4 hours/day        1.177111


In [4]:
# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': dt_model.feature_importances_
})

# Sort the DataFrame by importance values in descending order
importances_df = importances_df.sort_values(by='Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("DT Top 10 Feature Importances:")
print(top_10_importances)

DT Top 10 Feature Importances:
     Feature  Importance
65       s19    0.072260
6    shs_gpa    0.057699
7         p1    0.040422
1    english    0.039387
3    science    0.038781
2    reading    0.036118
29       p23    0.035834
61       s15    0.035396
43       p37    0.032524
5   abstract    0.030400


In [5]:
# Obtain feature importances for Neural Network model
nn_feature_importances = abs(nn_model.coefs_[0].mean(axis=0))

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns[:len(nn_feature_importances)],
    'NN Importance': nn_feature_importances
})

# Sort the DataFrame by NN Importance values in descending order
importances_df = importances_df.sort_values(by='NN Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (Neural Network):")
print(top_10_importances)

Top 10 Feature Importances (Neural Network):
                        Feature  NN Importance
49                           s3       0.030415
36                          p30       0.022564
40                          p34       0.020094
67                          s21       0.017994
71  program_BS Computer Science       0.017732
25                          p19       0.017372
34                          p28       0.016591
5                      abstract       0.015354
84            class_rank_Top 20       0.015343
46                          p40       0.014003


In [6]:
# Load the test dataset
new_data = pd.read_csv('./data/testing.csv')

# Separate the features
X_new = new_data.drop('success', axis=1)
y_true = new_data['success']

# Predict using the SVM model
svm_predictions = svm_model.predict(X_new)

# Predict using the Decision Tree model
dt_predictions = dt_model.predict(X_new)

# Predict using the Neural Network model
nn_predictions = nn_model.predict(X_new)

In [7]:
# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'SVM Predictions': svm_predictions,
                               'DT Predictions': dt_predictions,
                               'NN Predictions': nn_predictions,
                               'Actual': y_true})

# Add a column to indicate incorrect predictions
predictions_df['SVM Correct'] = predictions_df['SVM Predictions'] == predictions_df['Actual']
predictions_df['DT Correct'] = predictions_df['DT Predictions'] == predictions_df['Actual']
predictions_df['NN Correct'] = predictions_df['NN Predictions'] == predictions_df['Actual']

# Calculate accuracy for each model
svm_accuracy = predictions_df['SVM Correct'].mean()
dt_accuracy = predictions_df['DT Correct'].mean()
nn_accuracy = predictions_df['NN Correct'].mean()

# Print the comparison table and accuracy
print("Comparison Table:")
print(predictions_df)
print("\nAccuracy:")
print("SVM Accuracy:", svm_accuracy)
print("DT Accuracy:", dt_accuracy)
print("NN Accuracy:", nn_accuracy)

Comparison Table:
   SVM Predictions  DT Predictions  NN Predictions  Actual  SVM Correct  \
0                1               1               1       1         True   
1                0               0               0       0         True   
2                0               0               0       0         True   
3                0               1               1       1        False   
4                0               0               0       0         True   
5                1               0               1       1         True   
6                0               0               0       0         True   
7                1               1               1       1         True   
8                1               1               0       0        False   
9                1               1               1       1         True   

   DT Correct  NN Correct  
0        True        True  
1        True        True  
2        True        True  
3        True        True  
4        True   