# Template to properly conduct the baseline modeling:

First, download the `data_reduced.csv()` from the Github. Then, branch out from the main branch and conduct the modeling in this template:

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer

In [2]:
# Importing the csv file
df_reduced = pd.read_csv('data_reduced.csv')
df_reduced = df_reduced.drop(columns=['Unnamed: 0'])
df_reduced.head()

Unnamed: 0,Marital status,Application mode,Course,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,...,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Target
0,single,Other,Tourism,160.0,Secondary Education - 12th Year of Schooling o...,Higher Education - Degree,Intermediate Level Technicians and Professions,Intermediate Level Technicians and Professions,142.5,yes,...,no,male,no,19,no,0,6,6,14.0,Graduate
1,single,1st phase - general contingent,Communication Design,122.0,Basic education 1st cycle (4th/5th year) or eq...,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,Unskilled Workers,124.8,yes,...,no,male,no,19,no,0,6,0,0.0,Dropout
2,single,2nd phase - general contingent,Journalism and Communication,122.0,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Basic education 1st cycle (4th/5th year) or eq...,"Personal Services, Security and Safety Workers...",Intermediate Level Technicians and Professions,119.6,yes,...,yes,female,no,20,no,0,6,6,13.428571,Graduate
3,married,Over 23 years old,Social Service (evening attendance),100.0,Basic education 1st cycle (4th/5th year) or eq...,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Unskilled Workers,Unskilled Workers,141.5,no,...,yes,female,no,45,no,0,6,5,12.333333,Graduate
4,married,Over 23 years old,Management (evening attendance),133.1,Basic education 1st cycle (4th/5th year) or eq...,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,"Skilled Workers in Industry, Construction and ...",114.8,no,...,yes,male,no,50,no,0,5,5,11.857143,Graduate


Now, that it is imported let's split this up into X and Y components. Let's also identify the numerical feature columns and categorical feature columns:

In [3]:
# Split data into X and Y components
X = df_reduced.drop(columns=['Target'])
Y = df_reduced['Target']

# Identify column types
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['number']).columns
categorical_columns, numerical_columns

(Index(['Marital status', 'Application mode', 'Course',
        'Mother's qualification', 'Father's qualification',
        'Mother's occupation', 'Father's occupation', 'Displaced',
        'Educational special needs', 'Debtor', 'Tuition fees up to date',
        'Gender', 'Scholarship holder', 'International'],
       dtype='object'),
 Index(['Previous qualification (grade)', 'Admission grade',
        'Age at enrollment', 'Curricular units 1st sem (credited)',
        'Curricular units 1st sem (enrolled)',
        'Curricular units 1st sem (approved)',
        'Curricular units 1st sem (grade)'],
       dtype='object'))

Now we go about the modeling. Below is my implementation of Naive Bayes but obviously replace it with your model of choice.

In [4]:
# Importing necessary libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
# Initialize the Stratefied K-Fold
strat_kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 88)

# Storing the metrics
results = []

for fold, (train_idx, test_idx) in enumerate(strat_kf.split(X, Y), start = 1):
    # Split the data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Properly transform the data depending on data type
    preprocessors = ColumnTransformer([
    ('num_scaler', StandardScaler(), numerical_columns),
    ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])
    X_train_transformed = preprocessors.fit_transform(X_train)
    X_test_transformed = preprocessors.transform(X_test)

    # Ensure X's are dense and not sparse data type
    # This is necessary because of the OneHotEncoding 
    # making more columns.
    X_train_transformed = (X_train_transformed.todense()).A
    X_test_transformed = (X_test_transformed.todense()).A

    # Train a baseline Naive Bayes Model
    base_model = GaussianNB()
    base_model.fit(X_train_transformed, Y_train)
    
    # Predict
    Y_pred = base_model.predict(X_test_transformed)

    # Compute metrics
    # Using weighted average for some metrics due to class inbalance
    results.append([
        fold,
        precision_score(Y_test, Y_pred, labels=["Graduate"], average=None)[0],
        precision_score(Y_test, Y_pred, labels=["Enrolled"], average=None)[0],
        precision_score(Y_test, Y_pred, labels=["Dropout"], average=None)[0],
        recall_score(Y_test, Y_pred, labels=["Graduate"], average=None)[0],
        recall_score(Y_test, Y_pred, labels=["Enrolled"], average=None)[0],
        recall_score(Y_test, Y_pred, labels=["Dropout"], average=None)[0],
        f1_score(Y_test, Y_pred, labels=["Graduate"], average=None)[0],
        f1_score(Y_test, Y_pred, labels=["Enrolled"], average=None)[0],
        f1_score(Y_test, Y_pred, labels=["Dropout"], average=None)[0]])
    

To show the results in a pretty way, I'll create a dataframe:

In [11]:
df_results = pd.DataFrame(results, columns=["Fold", "Precision_Graduate", "Precision_Enrolled", "Precision_Dropout",
                                                "Recall_Graduate", "Recall_Enrolled", "Recall_Dropout",
                                                "F1_Graduate", "F1_Enrolled", "F1_Dropout"])
df_results.loc["Average"] = df_results.mean(numeric_only=True)  # Add row for average
print(f"\nModel: GaussianNB")
print(df_results)  # Display results for each model
print("\n")


Model: GaussianNB
         Fold  Precision_Graduate  Precision_Enrolled  Precision_Dropout  \
0         1.0            0.760309            0.301818           0.731183   
1         2.0            0.751256            0.310924           0.727700   
2         3.0            0.741772            0.279070           0.729592   
3         4.0            0.767624            0.289552           0.763359   
4         5.0            0.700935            0.239024           0.688372   
Average   3.0            0.744379            0.284078           0.728041   

         Recall_Graduate  Recall_Enrolled  Recall_Dropout  F1_Graduate  \
0               0.690867         0.542484        0.505576     0.723926   
1               0.700234         0.483660        0.576208     0.724848   
2               0.686183         0.470588        0.531599     0.712895   
3               0.688525         0.629870        0.373134     0.725926   
4               0.704225         0.320261        0.550186     0.702576   
Aver

That's all there is to it!