# Template to properly conduct the baseline modeling:

First, download `data_reduced.csv` from the GitHub repository. Then create a new branch off the main branch and conduct your modeling in this template:

In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer

In [28]:
# Read the reduced dataset and discard the 'Unnamed: 0' column
# (presumably a row index that was saved along with the CSV -- confirm).
df_reduced = pd.read_csv('data_reduced.csv').drop(columns=['Unnamed: 0'])

# Preview the first few rows
df_reduced.head()

Unnamed: 0,Marital status,Application mode,Course,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Target
0,single,2nd phase - general contingent,Animation and Multimedia Design,122.0,Basic Education 3rd Cycle (9th/10th/11th Year)...,Other,"Personal Services, Security and Safety Workers...",Unskilled Workers,127.3,yes,no,no,yes,male,no,20,no,0,0,0,0.0,Dropout
1,single,Other,Tourism,160.0,Secondary Education - 12th Year of Schooling o...,Higher Education - Degree,Intermediate Level Technicians and Professions,Intermediate Level Technicians and Professions,142.5,yes,no,no,no,male,no,19,no,0,6,6,14.0,Graduate
2,single,1st phase - general contingent,Communication Design,122.0,Basic education 1st cycle (4th/5th year) or eq...,Basic education 1st cycle (4th/5th year) or eq...,Unskilled Workers,Unskilled Workers,124.8,yes,no,no,no,male,no,19,no,0,6,0,0.0,Dropout
3,single,2nd phase - general contingent,Journalism and Communication,122.0,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Basic education 1st cycle (4th/5th year) or eq...,"Personal Services, Security and Safety Workers...",Intermediate Level Technicians and Professions,119.6,yes,no,no,yes,female,no,20,no,0,6,6,13.428571,Graduate
4,married,Over 23 years old,Social Service (evening attendance),100.0,Basic education 1st cycle (4th/5th year) or eq...,Basic Education 2nd Cycle (6th/7th/8th Year) o...,Unskilled Workers,Unskilled Workers,141.5,no,no,no,yes,female,no,45,no,0,6,5,12.333333,Graduate


Now that the data is imported, let's split it into X and Y components. Let's also identify the numerical feature columns and the categorical feature columns:

In [29]:
# Separate the label column ('Target') from the feature columns
Y = df_reduced['Target']
X = df_reduced.drop(columns=['Target'])

# Group the feature columns by dtype: numeric columns are scaled later,
# object (string) columns are one-hot encoded later.
numerical_columns = X.select_dtypes(include=['number']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Display both groups as the cell output
categorical_columns, numerical_columns

(Index(['Marital status', 'Application mode', 'Course',
        'Mother's qualification', 'Father's qualification',
        'Mother's occupation', 'Father's occupation', 'Displaced',
        'Educational special needs', 'Debtor', 'Tuition fees up to date',
        'Gender', 'Scholarship holder', 'International'],
       dtype='object'),
 Index(['Previous qualification (grade)', 'Admission grade',
        'Age at enrollment', 'Curricular units 1st sem (credited)',
        'Curricular units 1st sem (enrolled)',
        'Curricular units 1st sem (approved)',
        'Curricular units 1st sem (grade)'],
       dtype='object'))

Now we move on to the modeling. Below is my implementation of Naive Bayes; replace it with your model of choice.

In [30]:
# Importing necessary libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [31]:
# Initialize the Stratified K-Fold splitter.
# shuffle + fixed random_state keeps the folds reproducible across runs.
strat_kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 88)

# Storing the metrics: one [fold, accuracy, precision, recall, f1] row per fold
results = []

# Split on the X / Y defined in the cell above so that this cell also works
# on a fresh kernel (Restart & Run All).
for fold, (train_idx, test_idx) in enumerate(strat_kf.split(X, Y), start = 1):
    # Split the data. train_idx / test_idx are integer positions, so both
    # X and Y are indexed positionally with .iloc (label-based Y[...] would
    # only be correct when Y has a default 0..n-1 index).
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Properly transform the data depending on data type:
    # numerical columns are standardized, categorical columns are one-hot
    # encoded (categories unseen during fit are encoded as all zeros).
    # sparse_threshold=0 makes the ColumnTransformer always return a dense
    # array, regardless of how many columns the one-hot encoding creates,
    # so no sparse-to-dense conversion is needed afterwards.
    preprocessors = ColumnTransformer(
        [
            ('num_scaler', StandardScaler(), numerical_columns),
            ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ],
        sparse_threshold=0,
    )

    # Fit the preprocessing on the training fold only, then apply the same
    # fitted transformation to the test fold.
    X_train_transformed = preprocessors.fit_transform(X_train)
    X_test_transformed = preprocessors.transform(X_test)

    # Train a baseline Naive Bayes Model
    model = GaussianNB()
    model.fit(X_train_transformed, Y_train)

    # Predict
    Y_pred = model.predict(X_test_transformed)

    # Compute metrics
    # Using weighted average for some metrics due to class imbalance.
    # zero_division=0 is passed consistently so an undefined score is
    # reported as 0 without emitting a warning.
    results.append([
        fold,
        accuracy_score(Y_test, Y_pred),
        precision_score(Y_test, Y_pred, zero_division=0, average = 'weighted'),
        recall_score(Y_test, Y_pred, zero_division=0, average = 'weighted'),
        f1_score(Y_test, Y_pred, zero_division=0, average = 'weighted')
    ])

To show the results in a pretty way, I'll create a dataframe:

In [32]:
# Convert results to DataFrame for tabular display
metric_columns = ["Accuracy", "Precision", "Recall", "F1-score"]
df_results = pd.DataFrame(results, columns=["Fold", *metric_columns])

# Add a row with the average of each metric across folds.
# Only the metric columns are averaged: the mean of the fold numbers is not
# meaningful, so "Fold" is left empty (NaN) in the "Average" row.
df_results.loc["Average"] = df_results[metric_columns].mean()

# Show results as a table
df_results

Unnamed: 0,Fold,Accuracy,Precision,Recall,F1-score
0,1.0,0.583051,0.643628,0.583051,0.601863
1,2.0,0.583051,0.645294,0.583051,0.599858
2,3.0,0.60226,0.633084,0.60226,0.612751
3,4.0,0.611299,0.643069,0.611299,0.609419
4,5.0,0.441176,0.689279,0.441176,0.469581
Average,3.0,0.564167,0.650871,0.564167,0.578694


That's all there is to it!