# Data Cleaning

In [1]:
FILE_PATH = "dataset/ILPD/ilpd.csv"
RESULT_CSV_PATH = "output/result_model_comp.csv"
TARGET_COL = "class"
DROP_COLS = []

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [3]:
def load_and_clean_data(file_path, drop_cols):
    data_frame = pd.read_csv(file_path)

    # Drop the columns that are not required
    if len(drop_cols) > 0:
        data_frame.drop(drop_cols, axis=1, inplace=True)
    
    # Clean the data by removing any missing values and duplicated rows
    data_frame.dropna(inplace=True)
    data_frame.drop_duplicates(inplace=True)
    
    return data_frame


# Data Preprocessing

## Preprocess Function
- Label Encoders
- Train Test Split
- Standardization

In [4]:
from imblearn.over_sampling import ADASYN

def preprocess_data(file_path, target_col, drop_cols):

    data_frame = load_and_clean_data(file_path, drop_cols)

    # Find categorical columns, encode them, and create a dictionary of LabelEncoders
    label_encoders = {}
    for col in data_frame.columns:
        if data_frame[col].dtype == 'object':
            le = LabelEncoder()
            data_frame[col] = le.fit_transform(data_frame[col])
            label_encoders[col] = le
    
    # Split the dataset into training and testing sets
    X = data_frame.drop(target_col, axis=1)
    y = data_frame[target_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    # Apply ADASYN for oversampling
    ada = ADASYN(random_state=42)
    X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)
    
    # Standardize the data
    scaler = StandardScaler()
    X_train_resampled = pd.DataFrame(scaler.fit_transform(X_train_resampled), columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
    return X_train_resampled, X_test, y_train_resampled, y_test, label_encoders, data_frame



## Get Label Mapping

In [5]:
def get_mapping(column_name, label_encoders):
    if column_name in label_encoders:
        mapping = dict(zip(label_encoders[column_name].classes_, label_encoders[column_name].transform(label_encoders[column_name].classes_)))
        return mapping
    else:
        return None


# Call Clean and Preprocess Functions

In [6]:
# Preprocess the data
X_train, X_test, y_train, y_test, label_encoders, df = preprocess_data(file_path=FILE_PATH, 
                                                                       target_col=TARGET_COL,
                                                                       drop_cols=DROP_COLS)

In [7]:
# Get the key-value mapping for a specific column
# column_name = "Student ID"
# mapping = get_mapping(column_name, label_encoders)
# print(mapping)

## Original Data Info

In [8]:
# display(load_and_clean_data(FILE_PATH, DROP_COLS).head())
# display(load_and_clean_data(FILE_PATH, DROP_COLS).info())
# display(load_and_clean_data(FILE_PATH, DROP_COLS).describe())

## Preprocessed Data Info

In [9]:
# display(df.head())
# display(df.info())
# display(df.describe())

# Feature Selection

In [10]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder

def get_feature_rank(data_frame, target_col):
    X = data_frame.drop(target_col, axis=1)
    y = data_frame[target_col]
    
    # Encode the target column if it contains categorical values
    if y.dtype == 'object':
        y = LabelEncoder().fit_transform(y)
    
    # Calculate feature scores using different methods
    info_gain = pd.Series(mutual_info_classif(X, y), index=X.columns)
    chi2_test = pd.Series(chi2(X, y)[0], index=X.columns)
    mutual_info = pd.Series(mutual_info_classif(X, y), index=X.columns)
    
    # Scale the relief scores using min-max scaling
    scaler = MinMaxScaler()
    relief_scaled = scaler.fit_transform(SelectKBest(score_func=f_classif, k='all').fit(X, y).scores_.reshape(-1, 1))
    relief = pd.Series(relief_scaled.flatten(), index=X.columns)
    
    # Create a random forest classifier to calculate Gini importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    gini_importance = pd.Series(rf.feature_importances_, index=X.columns)
    
    # Combine the scores and calculate the final rank
    feature_ranks = pd.DataFrame({
        'Information Gain': info_gain,
        'Chi-Square Test': chi2_test,
        'Mutual Information': mutual_info,
        'Relief': relief,
        'Gini Importance': gini_importance
    })
    
    # Normalize each rank before averaging
    feature_ranks_normalized = feature_ranks.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
    
    # Calculate the final rank by averaging the normalized ranks across methods
    feature_ranks_normalized['Average Rank'] = feature_ranks_normalized.mean(axis=1)
    
    # Sort the features based on the average rank in descending order
    feature_rank_df = feature_ranks_normalized.sort_values(by='Average Rank', ascending=False).reset_index()
    feature_rank_df.columns = ['feature', 'Information Gain', 'Chi-Square Test', 'Mutual Information', 'Relief',
                               'Gini Importance', 'Average Rank']
    
    return feature_rank_df


In [11]:
get_feature_rank(df, target_col=TARGET_COL)

Unnamed: 0,feature,Information Gain,Chi-Square Test,Mutual Information,Relief,Gini Importance,Average Rank
0,sgot,1.0,1.0,0.907976,0.334108,0.934166,0.83525
1,sgpt,0.979483,0.611121,1.0,0.387015,0.836652,0.762854
2,db,0.672985,0.019088,0.858298,1.0,0.575508,0.625176
3,tb,0.670242,0.033733,0.498733,0.785387,0.644386,0.526496
4,alkphos,0.249641,0.404922,0.122663,0.524236,1.0,0.460292
5,age,0.502199,0.005965,0.716144,0.25666,0.816605,0.459515
6,ag_ratio,0.0,0.000165,0.392722,0.439355,0.482128,0.262874
7,alb,0.160036,0.000297,0.038788,0.406172,0.568255,0.23471
8,tp,0.0,0.0,0.0,0.0,0.546276,0.109255
9,gender,0.11379,6.9e-05,0.105891,0.071303,0.0,0.058211


# Models

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_selection import RFECV
from mlxtend.feature_selection import SequentialFeatureSelector

def evaluate_model(file_path, target_col, drop_cols, model):

    # Preprocess data
    X_train, X_test, y_train, y_test, label_encoders, df = preprocess_data(file_path, target_col, drop_cols)

    # Get feature rank
    feature_rank_df = get_feature_rank(df, target_col=target_col)
    
    # Perform 10-fold cross-validation
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_accuracy = cross_val_score(model, X_train, y_train, scoring='accuracy')
    
    # Train the model and evaluate on test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    
    # Calculate evaluation criteria for all features
    cv_f1_score = cross_val_score(model, X_train, y_train, scoring='f1_macro')
    test_f1_score = f1_score(y_test, y_pred, average='macro')
    cv_precision = cross_val_score(model, X_train, y_train, scoring='precision_macro')
    test_precision = precision_score(y_test, y_pred, average='macro')
    cv_recall = cross_val_score(model, X_train, y_train, scoring='recall_macro')
    test_recall = recall_score(y_test, y_pred, average='macro')
    
    # Store the results for all features
    all_features_result = {
        'configuration_name': f"{model.__class__.__name__} all features",
        'cv_accuracy': np.round(np.mean(cv_accuracy), decimals=2),
        'test_accuracy': np.round(test_accuracy, decimals=2),
        'cv_f1_score': np.round(np.mean(cv_f1_score), decimals=2),
        'test_f1_score': np.round(test_f1_score, decimals=2),
        'cv_precision': np.round(np.mean(cv_precision), decimals=2),
        'test_precision': np.round(test_precision, decimals=2),
        'cv_recall': np.round(np.mean(cv_recall), decimals=2),
        'test_recall': np.round(test_recall, decimals=2),
        'list_of_features': X_train.columns.tolist(),
        'len_of_features': len(X_train.columns),
        'total_features': len(X_train.columns)
    }
    
    results = [all_features_result]
    
    # Forward Selection with feature rank (ADRA method)
    best_features_adra = feature_rank_df['feature'].tolist()
    selected_features_adra = []
    best_accuracy_adra = 0
    
    for feature in best_features_adra:
        selected_features_adra.append(feature)
        X_train_fs_adra = X_train[selected_features_adra]
        X_test_fs_adra = X_test[selected_features_adra]
        
        cv_accuracy_fs_adra = cross_val_score(model, X_train_fs_adra, y_train, scoring='accuracy')
        
        if np.mean(cv_accuracy_fs_adra) > best_accuracy_adra:
            best_accuracy_adra = np.mean(cv_accuracy_fs_adra)
        else:
            selected_features_adra.remove(feature)
    
    # Train the model with selected features (ADRA) and evaluate on test set
    model.fit(X_train_fs_adra, y_train)
    y_pred_fs_adra = model.predict(X_test_fs_adra)
    test_accuracy_fs_adra = accuracy_score(y_test, y_pred_fs_adra)
    
    # Calculate evaluation criteria for ADRA method
    cv_f1_score_fs_adra = cross_val_score(model, X_train_fs_adra, y_train, scoring='f1_macro')
    test_f1_score_fs_adra = f1_score(y_test, y_pred_fs_adra, average='macro')
    cv_precision_fs_adra = cross_val_score(model, X_train_fs_adra, y_train, scoring='precision_macro')
    test_precision_fs_adra = precision_score(y_test, y_pred_fs_adra, average='macro')
    cv_recall_fs_adra = cross_val_score(model, X_train_fs_adra, y_train, scoring='recall_macro')
    test_recall_fs_adra = recall_score(y_test, y_pred_fs_adra, average='macro')
    
    # Store the results for ADRA method
    result_adra = {
        'configuration_name': f"{model.__class__.__name__} selected features with ADRA",
        'cv_accuracy': np.round(np.mean(cv_accuracy_fs_adra), decimals=2),
        'test_accuracy': np.round(test_accuracy_fs_adra, decimals=2),
        'cv_f1_score': np.round(np.mean(cv_f1_score_fs_adra), decimals=2),
        'test_f1_score': np.round(test_f1_score_fs_adra, decimals=2),
        'cv_precision': np.round(np.mean(cv_precision_fs_adra), decimals=2),
        'test_precision': np.round(test_precision_fs_adra, decimals=2),
        'cv_recall': np.round(np.mean(cv_recall_fs_adra), decimals=2),
        'test_recall': np.round(test_recall_fs_adra, decimals=2),
        'list_of_features': selected_features_adra,
        'len_of_features': len(selected_features_adra),
        'total_features': len(X_train.columns)
    }
    
    results.append(result_adra)
    
    # Recursive Feature Elimination (RFE)
    rfe = RFECV(estimator=DecisionTreeClassifier(random_state=42), cv=cv)
    rfe.fit(X_train, y_train)
    selected_features_rfe = X_train.columns[rfe.support_].tolist()
    
    # Train the model with RFE-selected features and evaluate on test set
    X_train_rfe = X_train[selected_features_rfe]
    X_test_rfe = X_test[selected_features_rfe]
    model.fit(X_train_rfe, y_train)
    y_pred_rfe = model.predict(X_test_rfe)
    test_accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
    
    # Calculate evaluation criteria for RFE method
    cv_f1_score_rfe = cross_val_score(model, X_train_rfe, y_train, scoring='f1_macro')
    test_f1_score_rfe = f1_score(y_test, y_pred_rfe, average='macro')
    cv_precision_rfe = cross_val_score(model, X_train_rfe, y_train, scoring='precision_macro')
    test_precision_rfe = precision_score(y_test, y_pred_rfe, average='macro')
    cv_recall_rfe = cross_val_score(model, X_train_rfe, y_train, scoring='recall_macro')
    test_recall_rfe = recall_score(y_test, y_pred_rfe, average='macro')
    
    # Store the results for RFE method
    result_rfe = {
        'configuration_name': f"{model.__class__.__name__} RFE",
        'cv_accuracy': np.round(np.mean(cv_accuracy), decimals=2),
        'test_accuracy': np.round(test_accuracy_rfe, decimals=2),
        'cv_f1_score': np.round(np.mean(cv_f1_score_rfe), decimals=2),
        'test_f1_score': np.round(test_f1_score_rfe, decimals=2),
        'cv_precision': np.round(np.mean(cv_precision_rfe), decimals=2),
        'test_precision': np.round(test_precision_rfe, decimals=2),
        'cv_recall': np.round(np.mean(cv_recall_rfe), decimals=2),
        'test_recall': np.round(test_recall_rfe, decimals=2),
        'list_of_features': selected_features_rfe,
        'len_of_features': len(selected_features_rfe),
        'total_features': len(X_train.columns)
    }
    
    results.append(result_rfe)
    
    # Forward Selection (without feature rank)
    sfs_forward = SequentialFeatureSelector(estimator=model, k_features='best', forward=True, cv=cv)
    sfs_forward.fit(X_train, y_train)
    selected_features_forward = X_train.columns[list(sfs_forward.k_feature_idx_)].tolist()
    
    # Train the model with forward-selected features and evaluate on test set
    X_train_forward = X_train[selected_features_forward]
    X_test_forward = X_test[selected_features_forward]
    model.fit(X_train_forward, y_train)
    y_pred_forward = model.predict(X_test_forward)
    test_accuracy_forward = accuracy_score(y_test, y_pred_forward)
    
    # Calculate evaluation criteria for forward selection
    cv_f1_score_forward = cross_val_score(model, X_train_forward, y_train, scoring='f1_macro')
    test_f1_score_forward = f1_score(y_test, y_pred_forward, average='macro')
    cv_precision_forward = cross_val_score(model, X_train_forward, y_train, scoring='precision_macro')
    test_precision_forward = precision_score(y_test, y_pred_forward, average='macro')
    cv_recall_forward = cross_val_score(model, X_train_forward, y_train, scoring='recall_macro')
    test_recall_forward = recall_score(y_test, y_pred_forward, average='macro')
    
    # Store the results for forward selection
    result_forward = {
        'configuration_name': f"{model.__class__.__name__} Forward Selection",
        'cv_accuracy': np.round(np.mean(cv_accuracy), decimals=2),
        'test_accuracy': np.round(test_accuracy_forward, decimals=2),
        'cv_f1_score': np.round(np.mean(cv_f1_score_forward), decimals=2),
        'test_f1_score': np.round(test_f1_score_forward, decimals=2),
        'cv_precision': np.round(np.mean(cv_precision_forward), decimals=2),
        'test_precision': np.round(test_precision_forward, decimals=2),
        'cv_recall': np.round(np.mean(cv_recall_forward), decimals=2),
        'test_recall': np.round(test_recall_forward, decimals=2),
        'list_of_features': selected_features_forward,
        'len_of_features': len(selected_features_forward),
        'total_features': len(X_train.columns)
    }
    
    results.append(result_forward)
    
    # Backward Elimination (without feature rank)
    sfs_backward = SequentialFeatureSelector(estimator=model, k_features='best', forward=False, cv=cv)
    sfs_backward.fit(X_train, y_train)
    selected_features_backward = X_train.columns[list(sfs_backward.k_feature_idx_)].tolist()
    
    # Train the model with backward-selected features and evaluate on test set
    X_train_backward = X_train[selected_features_backward]
    X_test_backward = X_test[selected_features_backward]
    model.fit(X_train_backward, y_train)
    y_pred_backward = model.predict(X_test_backward)
    test_accuracy_backward = accuracy_score(y_test, y_pred_backward)
    
    # Calculate evaluation criteria for backward elimination
    cv_f1_score_backward = cross_val_score(model, X_train_backward, y_train, scoring='f1_macro')
    test_f1_score_backward = f1_score(y_test, y_pred_backward, average='macro')
    cv_precision_backward = cross_val_score(model, X_train_backward, y_train, scoring='precision_macro')
    test_precision_backward = precision_score(y_test, y_pred_backward, average='macro')
    cv_recall_backward = cross_val_score(model, X_train_backward, y_train, scoring='recall_macro')
    test_recall_backward = recall_score(y_test, y_pred_backward, average='macro')
    
    # Store the results for backward elimination
    result_backward = {
        'configuration_name': f"{model.__class__.__name__} Backward Elimination",
        'cv_accuracy': np.round(np.mean(cv_accuracy), decimals=2),
        'test_accuracy': np.round(test_accuracy_backward, decimals=2),
        'cv_f1_score': np.round(np.mean(cv_f1_score_backward), decimals=2),
        'test_f1_score': np.round(test_f1_score_backward, decimals=2),
        'cv_precision': np.round(np.mean(cv_precision_backward), decimals=2),
        'test_precision': np.round(test_precision_backward, decimals=2),
        'cv_recall': np.round(np.mean(cv_recall_backward), decimals=2),
        'test_recall': np.round(test_recall_backward, decimals=2),
        'list_of_features': selected_features_backward,
        'len_of_features': len(selected_features_backward),
        'total_features': len(X_train.columns)
    }
    
    results.append(result_backward)
    
    # Create a dataframe to store the results
    results_df = pd.DataFrame(results)
    results_df = results_df[['configuration_name', 'cv_accuracy', 'test_accuracy', 'cv_f1_score', 'test_f1_score',
                             'cv_precision', 'test_precision', 'cv_recall', 'test_recall', 'len_of_features',
                             'total_features', 'list_of_features']]
    
    return results_df


In [13]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
results_df_DecisionTreeClassifier = evaluate_model(file_path=FILE_PATH, target_col=TARGET_COL, drop_cols=DROP_COLS, model=model)
results_df_DecisionTreeClassifier

Unnamed: 0,configuration_name,cv_accuracy,test_accuracy,cv_f1_score,test_f1_score,cv_precision,test_precision,cv_recall,test_recall,len_of_features,total_features,list_of_features
0,DecisionTreeClassifier all features,0.71,0.6,0.71,0.55,0.71,0.55,0.71,0.55,10,10,"[age, gender, tb, db, alkphos, sgpt, sgot, tp,..."
1,DecisionTreeClassifier selected features with ...,0.7,0.57,0.7,0.51,0.71,0.52,0.7,0.52,8,10,"[sgpt, sgot, tb, db, age, alkphos, ag_ratio, tp]"
2,DecisionTreeClassifier RFE,0.71,0.6,0.71,0.55,0.71,0.55,0.71,0.55,10,10,"[age, gender, tb, db, alkphos, sgpt, sgot, tp,..."
3,DecisionTreeClassifier Forward Selection,0.71,0.6,0.71,0.55,0.71,0.55,0.71,0.55,10,10,"[age, gender, tb, db, alkphos, sgpt, sgot, tp,..."
4,DecisionTreeClassifier Backward Elimination,0.71,0.6,0.71,0.55,0.71,0.55,0.71,0.55,10,10,"[age, gender, tb, db, alkphos, sgpt, sgot, tp,..."


In [14]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
results_df_GaussianNB = evaluate_model(file_path=FILE_PATH, target_col=TARGET_COL, drop_cols=DROP_COLS, model=model)
results_df_GaussianNB

Unnamed: 0,configuration_name,cv_accuracy,test_accuracy,cv_f1_score,test_f1_score,cv_precision,test_precision,cv_recall,test_recall,len_of_features,total_features,list_of_features
0,GaussianNB all features,0.66,0.54,0.64,0.54,0.75,0.69,0.68,0.68,10,10,"[age, gender, tb, db, alkphos, sgpt, sgot, tp,..."
1,GaussianNB selected features with ADRA,0.66,0.54,0.64,0.54,0.75,0.69,0.68,0.68,8,10,"[sgot, sgpt, tb, db, alkphos, age, ag_ratio, tp]"
2,GaussianNB RFE,0.66,0.54,0.64,0.54,0.75,0.69,0.68,0.68,10,10,"[age, gender, tb, db, alkphos, sgpt, sgot, tp,..."
3,GaussianNB Forward Selection,0.66,0.54,0.65,0.54,0.75,0.69,0.68,0.68,8,10,"[age, gender, db, alkphos, sgpt, tp, alb, ag_r..."
4,GaussianNB Backward Elimination,0.66,0.54,0.65,0.54,0.75,0.69,0.68,0.68,8,10,"[age, gender, db, alkphos, sgpt, tp, alb, ag_r..."


In [15]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(random_state=42, max_iter=100, hidden_layer_sizes=(32,16,))
results_df_MLPClassifier = evaluate_model(file_path=FILE_PATH, target_col=TARGET_COL, drop_cols=DROP_COLS, model=model)
results_df_MLPClassifier



In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
results_df_RandomForestClassifier = evaluate_model(file_path=FILE_PATH, target_col=TARGET_COL, drop_cols=DROP_COLS, model=model)
results_df_RandomForestClassifier

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,configuration_name,cv_accuracy,test_accuracy,cv_f1_score,test_f1_score,cv_precision,test_precision,cv_recall,test_recall,len_of_features,total_features,list_of_features
0,RandomForestClassifier all features,0.55,0.52,0.52,0.52,0.51,0.55,0.54,0.52,32,32,"[STUDENTID, AGE, GENDER, HS_TYPE, SCHOLARSHIP,..."
1,RandomForestClassifier selected features with ...,0.64,0.59,0.63,0.55,0.64,0.64,0.64,0.57,6,32,"[STUDENTID, CUML_GPA, SCHOLARSHIP, FATHER_JOB,..."
2,RandomForestClassifier RFE,0.55,0.55,0.53,0.55,0.55,0.57,0.54,0.55,25,32,"[STUDENTID, HS_TYPE, SCHOLARSHIP, WORK, ACTIVI..."
3,RandomForestClassifier Forward Selection,0.55,0.55,0.57,0.54,0.58,0.58,0.59,0.54,14,32,"[AGE, GENDER, WORK, TRANSPORT, LIVING, FATHER_..."
4,RandomForestClassifier Backward Elimination,0.55,0.72,0.66,0.73,0.7,0.81,0.67,0.71,12,32,"[STUDENTID, SCHOLARSHIP, WORK, ACTIVITY, KIDS,..."


In [None]:
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(n_estimators=100, random_state=42)
results_df_BaggingClassifier = evaluate_model(file_path=FILE_PATH, target_col=TARGET_COL, drop_cols=DROP_COLS, model=model)
results_df_BaggingClassifier

Unnamed: 0,configuration_name,cv_accuracy,test_accuracy,cv_f1_score,test_f1_score,cv_precision,test_precision,cv_recall,test_recall,len_of_features,total_features,list_of_features
0,BaggingClassifier all features,0.5,0.45,0.48,0.44,0.49,0.52,0.5,0.43,32,32,"[STUDENTID, AGE, GENDER, HS_TYPE, SCHOLARSHIP,..."
1,BaggingClassifier selected features with ADRA,0.64,0.48,0.64,0.49,0.66,0.53,0.65,0.48,9,32,"[STUDENTID, CUML_GPA, EXP_GPA, COURSE ID, WORK..."
2,BaggingClassifier RFE,0.5,0.41,0.54,0.41,0.55,0.45,0.56,0.4,25,32,"[STUDENTID, HS_TYPE, SCHOLARSHIP, WORK, ACTIVI..."
3,BaggingClassifier Forward Selection,0.5,0.48,0.6,0.48,0.61,0.5,0.62,0.48,7,32,"[WORK, TRANSPORT, STUDY_HRS, ATTEND_DEPT, LIST..."
4,BaggingClassifier Backward Elimination,0.5,0.59,0.59,0.57,0.59,0.59,0.6,0.58,10,32,"[STUDENTID, ACTIVITY, SALARY, MOTHER_EDU, KIDS..."


In [None]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=42)
results_df_AdaBoostClassifier = evaluate_model(file_path=FILE_PATH, target_col=TARGET_COL, drop_cols=DROP_COLS, model=model)
results_df_AdaBoostClassifier

Unnamed: 0,configuration_name,cv_accuracy,test_accuracy,cv_f1_score,test_f1_score,cv_precision,test_precision,cv_recall,test_recall,len_of_features,total_features,list_of_features
0,AdaBoostClassifier all features,0.43,0.48,0.42,0.49,0.49,0.5,0.42,0.48,32,32,"[STUDENTID, AGE, GENDER, HS_TYPE, SCHOLARSHIP,..."
1,AdaBoostClassifier selected features with ADRA,0.53,0.55,0.53,0.56,0.56,0.59,0.53,0.55,6,32,"[STUDENTID, CUML_GPA, IMPACT, NOTES, ATTEND, F..."
2,AdaBoostClassifier RFE,0.43,0.55,0.56,0.56,0.58,0.59,0.56,0.55,25,32,"[STUDENTID, HS_TYPE, SCHOLARSHIP, WORK, ACTIVI..."
3,AdaBoostClassifier Forward Selection,0.43,0.41,0.41,0.41,0.44,0.43,0.42,0.41,19,32,"[STUDENTID, GENDER, HS_TYPE, ACTIVITY, SALARY,..."
4,AdaBoostClassifier Backward Elimination,0.43,0.55,0.43,0.56,0.45,0.56,0.43,0.56,21,32,"[STUDENTID, AGE, HS_TYPE, WORK, PARTNER, SALAR..."


In [None]:
result_student_performance_prediction_western_oc2_lab = pd.concat([results_df_DecisionTreeClassifier, 
                                                                    results_df_GaussianNB, 
                                                                    results_df_MLPClassifier, 
                                                                    results_df_RandomForestClassifier, 
                                                                    results_df_BaggingClassifier, 
                                                                    results_df_AdaBoostClassifier])
result_student_performance_prediction_western_oc2_lab.to_csv(RESULT_CSV_PATH, index=False)