In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance

# Load the dataset
data = pd.read_csv('./data/augmented_data.csv')

# Separate the features and the target variable
X = data.drop('success', axis=1)
y = data['success']

In [2]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

# Decision Tree (DT) model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

# Neural Network (NN) model
nn_model = MLPClassifier()
nn_model.fit(X_train, y_train)
nn_predictions = nn_model.predict(X_test)

# Calculate evaluation metrics
svm_report = classification_report(y_test, svm_predictions, output_dict=True)
svm_auc = roc_auc_score(y_test, svm_predictions)

dt_report = classification_report(y_test, dt_predictions, output_dict=True)
dt_auc = roc_auc_score(y_test, dt_predictions)

nn_report = classification_report(y_test, nn_predictions, output_dict=True)
nn_auc = roc_auc_score(y_test, nn_predictions)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'Model': ['SVM', 'Decision Tree', 'Neural Network'],
    'Accuracy': [svm_model.score(X_test, y_test), dt_model.score(X_test, y_test), nn_model.score(X_test, y_test)],
    'F1-Score': [svm_report['macro avg']['f1-score'], dt_report['macro avg']['f1-score'], nn_report['macro avg']['f1-score']],
    'Precision': [svm_report['macro avg']['precision'], dt_report['macro avg']['precision'], nn_report['macro avg']['precision']],
    'Recall': [svm_report['macro avg']['recall'], dt_report['macro avg']['recall'], nn_report['macro avg']['recall']],
    'AUC': [svm_auc, dt_auc, nn_auc]
})

# Print the results
print("Evaluation Metrics:")
print(results_df)

Evaluation Metrics:
            Model  Accuracy  F1-Score  Precision    Recall       AUC
0             SVM     0.765  0.764947   0.767669  0.768720  0.768720
1   Decision Tree     0.815  0.814215   0.813784  0.815016  0.815016
2  Neural Network     0.925  0.924848   0.924642  0.927335  0.927335




In [3]:
# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': dt_model.feature_importances_
})

# Sort the DataFrame by importance values in descending order
importances_df = importances_df.sort_values(by='Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("DT Top 10 Feature Importances:")
print(top_10_importances)

DT Top 10 Feature Importances:
                                               Feature  Importance
5                                             abstract    0.102404
3                                              science    0.061057
64                                                 s18    0.055970
0                                             cet_oapr    0.035806
59                                                 s13    0.033132
13                                                  p7    0.032185
23                                                 p17    0.031270
26                                                 p20    0.030766
45                                                 p39    0.028481
148  mothers_employment_Not employed, looking for work    0.026764


In [4]:
# Obtain feature importances for linear SVM model
svm_feature_importances = abs(svm_model.coef_[0])

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'SVM Importance': svm_feature_importances
})

# Sort the DataFrame by SVM Importance values in descending order
importances_df = importances_df.sort_values(by='SVM Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (SVM Linear):")
print(top_10_importances)

Top 10 Feature Importances (SVM Linear):
                                               Feature  SVM Importance
152                               income_Below P10,000        2.929409
172     s26_More than 30 min. but less than 1 hour/day        1.745363
92                                  classroom_org_Muse        1.443935
122     fathers_education_Some high school, no diploma        1.410439
144      mothers_employment_Disabled, not able to work        1.396722
118  fathers_education_Not Applicable (no work, not...        1.342237
136                  mothers_education_Master’s degree        1.159210
98                             classroom_org_Secretary        1.144344
164                      s25_Ask my teachers at school        1.098102
174                           s26_Up to 30 minutes/day        1.082468


In [5]:
# Obtain feature importances for Neural Network model
nn_feature_importances = abs(nn_model.coefs_[0].mean(axis=0))

# Create a DataFrame for feature importances
importances_df = pd.DataFrame({
    'Feature': X_train.columns[:len(nn_feature_importances)],
    'NN Importance': nn_feature_importances
})

# Sort the DataFrame by NN Importance values in descending order
importances_df = importances_df.sort_values(by='NN Importance', ascending=False)

# Display the top 10 feature importances
top_10_importances = importances_df.head(10)
print("Top 10 Feature Importances (Neural Network):")
print(top_10_importances)

Top 10 Feature Importances (Neural Network):
                       Feature  NN Importance
66                         s20       0.033163
80               strand_TVL-HE       0.027310
17                         p11       0.025010
94           classroom_org_PIO       0.022386
2                      reading       0.022123
75              strand_ABM/BAM       0.021315
89  academic_awards_With Honor       0.019799
51                          s5       0.018868
73                  sex_Female       0.017524
9                           p3       0.017156


In [6]:
# Load the test dataset
new_data = pd.read_csv('./data/testing.csv')

# Separate the features
X_new = new_data.drop('success', axis=1)
y_true = new_data['success']

# Predict using the SVM model
svm_predictions = svm_model.predict(X_new)

# Predict using the Decision Tree model
dt_predictions = dt_model.predict(X_new)

# Predict using the Neural Network model
nn_predictions = nn_model.predict(X_new)

In [7]:
# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'SVM Predictions': svm_predictions,
                               'DT Predictions': dt_predictions,
                               'NN Predictions': nn_predictions,
                               'Actual': y_true})

# Add a column to indicate incorrect predictions
predictions_df['SVM Correct'] = predictions_df['SVM Predictions'] == predictions_df['Actual']
predictions_df['DT Correct'] = predictions_df['DT Predictions'] == predictions_df['Actual']
predictions_df['NN Correct'] = predictions_df['NN Predictions'] == predictions_df['Actual']

# Calculate accuracy for each model
svm_accuracy = predictions_df['SVM Correct'].mean()
dt_accuracy = predictions_df['DT Correct'].mean()
nn_accuracy = predictions_df['NN Correct'].mean()

# Print the comparison table and accuracy
print("Comparison Table:")
print(predictions_df)
print("\nAccuracy:")
print("SVM Accuracy:", svm_accuracy)
print("DT Accuracy:", dt_accuracy)
print("NN Accuracy:", nn_accuracy)

Comparison Table:
   SVM Predictions  DT Predictions  NN Predictions  Actual  SVM Correct  \
0                1               1               1       1         True   
1                0               0               0       0         True   
2                0               0               0       0         True   
3                1               1               1       1         True   
4                0               0               0       0         True   
5                1               1               1       1         True   
6                0               0               0       0         True   
7                1               1               1       1         True   
8                0               0               0       0         True   
9                1               1               1       1         True   

   DT Correct  NN Correct  
0        True        True  
1        True        True  
2        True        True  
3        True        True  
4        True   