In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier,
                              AdaBoostClassifier, VotingClassifier, StackingClassifier)
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Colab-only cell: `google.colab` is not importable outside a Colab runtime.
# NOTE(review): the recorded output of this cell shows the upload was saved as
# "poland (1).xlsx" -- presumably because a "poland.xlsx" already existed in the
# working directory. Confirm which copy downstream cells are meant to read.
from google.colab import files
uploaded = files.upload()

Saving poland.xlsx to poland (1).xlsx


In [None]:
# --- 1. LOAD THE DATA ---
import io

# Colab renames an upload when a file with the same name already exists
# (e.g. "poland (1).xlsx"), so reading the literal path "poland.xlsx" can
# silently load a stale copy. Read the bytes handed back by the upload cell
# instead; fall back to the local path when no upload happened (for example
# when the notebook runs outside Colab and `uploaded` is undefined or empty).
_uploaded_files = globals().get("uploaded") or {}
if _uploaded_files:
    # Only the first uploaded file is used.
    excel_source = io.BytesIO(next(iter(_uploaded_files.values())))
else:
    excel_source = "poland.xlsx"

poland_dataset = pd.read_excel(excel_source)
poland_dataset.head()

Unnamed: 0,course_id,teacher_id,question_no,no_participants,resp_share,SET_score_avg,stud_grade_avg,stud_grade_std,stud_grade_var_coef,percent_failed,...,class_end_18_22,SET_score_1sem,maximum_score,no_dgr,prof,ma,dr,seniority,female,male
0,0000-BHP,54655,901,255.0,0.211765,4.222222,3.0,0.0,0.0,0.0,...,0.0,3.82,0,1,0,0,0,4,0,1
1,0000-BHP,54655,903,255.0,0.207843,3.679245,3.0,0.0,0.0,0.0,...,0.0,3.82,0,1,0,0,0,4,0,1
2,0000-BHP,54655,904,255.0,0.211765,3.740741,3.0,0.0,0.0,0.0,...,0.0,3.82,0,1,0,0,0,4,0,1
3,0000-BHP,54655,905,255.0,0.207843,3.301887,3.0,0.0,0.0,0.0,...,0.0,3.82,0,1,0,0,0,4,0,1
4,0000-BHP,54655,907,255.0,0.207843,3.679245,3.0,0.0,0.0,0.0,...,0.0,3.82,0,1,0,0,0,4,0,1


In [None]:
# Boolean mask with the same shape as the dataset: True wherever a cell is missing.
missing_values = poland_dataset.isna()
print(missing_values)

      course_id  teacher_id  question_no  no_participants  resp_share  \
0         False       False        False            False       False   
1         False       False        False            False       False   
2         False       False        False            False       False   
3         False       False        False            False       False   
4         False       False        False            False       False   
...         ...         ...          ...              ...         ...   
8010      False       False        False            False       False   
8011      False       False        False            False       False   
8012      False       False        False            False       False   
8013      False       False        False            False       False   
8014      False       False        False            False       False   

      SET_score_avg  stud_grade_avg  stud_grade_std  stud_grade_var_coef  \
0             False           False           F

In [None]:
# Number of missing cells in each column (column name -> count).
missing_counts = poland_dataset.isna().sum()
print(missing_counts)

course_id                    0
teacher_id                   0
question_no                  0
no_participants             34
resp_share                  34
SET_score_avg                0
stud_grade_avg              41
stud_grade_std              41
stud_grade_var_coef         41
percent_failed              41
stud_grade_avg_cur          41
stud_grade_std_cur          41
stud_grade_var_coef_cur     41
percent_failed_cur          41
class_duration             914
Monday                     957
Tuesday                    957
Wednesday                  957
Thursday                   957
Friday                     957
Saturday                   957
Sunday                     957
class_end_by_10            914
class_end_10_14            914
class_end_14_18            914
class_end_18_22            914
SET_score_1sem             697
maximum_score                0
no_dgr                       0
prof                         0
ma                           0
dr                           0
seniorit

In [None]:
# --- 2. CHECK AND FILL MISSING VALUES ---

numeric_cols = poland_dataset.select_dtypes(include=[np.number]).columns

# Columns whose observed values are only 0/1 are indicator flags. Filling them
# with the column mean would produce fractional values that are neither 0 nor 1
# (and that later equality checks such as `== 1` treat as 0), so they are
# filled with the most frequent value instead.
binary_cols = [
    col for col in numeric_cols
    if poland_dataset[col].dropna().isin([0, 1]).all()
]
continuous_cols = [col for col in numeric_cols if col not in binary_cols]

# Continuous numeric columns keep the original mean imputation.
imputer = SimpleImputer(strategy="mean")
if continuous_cols:
    poland_dataset[continuous_cols] = imputer.fit_transform(poland_dataset[continuous_cols])

flag_imputer = SimpleImputer(strategy="most_frequent")
if binary_cols:
    poland_dataset[binary_cols] = flag_imputer.fit_transform(poland_dataset[binary_cols])

# Verify if missing values are handled
print(poland_dataset.isnull().sum())  # Should print 0 for all numeric columns

course_id                  0
teacher_id                 0
question_no                0
no_participants            0
resp_share                 0
SET_score_avg              0
stud_grade_avg             0
stud_grade_std             0
stud_grade_var_coef        0
percent_failed             0
stud_grade_avg_cur         0
stud_grade_std_cur         0
stud_grade_var_coef_cur    0
percent_failed_cur         0
class_duration             0
Monday                     0
Tuesday                    0
Wednesday                  0
Thursday                   0
Friday                     0
Saturday                   0
Sunday                     0
class_end_by_10            0
class_end_10_14            0
class_end_14_18            0
class_end_18_22            0
SET_score_1sem             0
maximum_score              0
no_dgr                     0
prof                       0
ma                         0
dr                         0
seniority                  0
female                     0
male          

In [None]:
# --- 3. MERGE "male" / "female" INTO SINGLE 'Gender' COLUMN IF NEEDED ---
# The merge only runs when both indicator columns are present.
if {"male", "female"}.issubset(poland_dataset.columns):
    def gender_mapper(male_val):
        """Label a row "Male" when its `male` flag equals 1, otherwise "Female"."""
        if male_val == 1:
            return "Male"
        return "Female"

    # Only the `male` flag decides the label; both source columns are then removed.
    poland_dataset["Gender"] = poland_dataset["male"].map(gender_mapper)
    poland_dataset.drop(columns=["male", "female"], inplace=True, errors="ignore")

In [None]:
# --- 4. CORRELATION AND DROP HIGHLY CORRELATED FEATURES (IF ANY) ---
# Absolute pairwise correlations between the numeric columns.
numeric_frame = poland_dataset.select_dtypes(include=[np.number])
corr_matrix = numeric_frame.corr().abs()

# Keep only the strict upper triangle so each pair is inspected once; for every
# correlated pair, the later of the two columns is the one flagged for removal.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
to_drop = upper.columns[(upper > 0.90).any()].tolist()
# Alternatively, list the columns identified from a heatmap by hand, e.g.:
# to_drop = ["someHighlyCorrelatedCol", "someOtherCol"]

poland_dataset.drop(columns=to_drop, inplace=True, errors="ignore")
print("Dropped for high correlation:", to_drop)

Dropped for high correlation: ['stud_grade_var_coef', 'stud_grade_avg_cur', 'stud_grade_std_cur', 'stud_grade_var_coef_cur', 'percent_failed_cur']


In [None]:
# --- 5. CREATE THE "Result" COLUMN ---
# Binary target: 1 when a row's SET_score_avg is strictly above the dataset
# mean, 0 otherwise.
set_scores = poland_dataset["SET_score_avg"]
mean_set_score = set_scores.mean()
poland_dataset["Result"] = set_scores.gt(mean_set_score).astype(int)

In [None]:
# --- 6. CHECK CLASS DISTRIBUTION, THEN APPLY SMOTE ---
# "Result" is defined as `SET_score_avg > mean(SET_score_avg)`, so leaving
# SET_score_avg among the features would hand every model the label directly
# (target leakage). Drop it together with the target itself.
X = poland_dataset.drop(columns=["Result", "SET_score_avg"])
y = poland_dataset["Result"]

In [None]:
# One-hot encode every object/categorical column; the first level of each is
# dropped so the dummy columns are not perfectly collinear.
X = pd.get_dummies(data=X, drop_first=True)

# Oversample the minority class and show the class counts on either side.
sm = SMOTE(random_state=42)
print("Before SMOTE:", y.value_counts())
X_res, y_res = sm.fit_resample(X, y)
print("After SMOTE :", y_res.value_counts())

Before SMOTE: Result
1    5083
0    2932
Name: count, dtype: int64
After SMOTE : Result
0    5083
1    5083
Name: count, dtype: int64


In [None]:
# --- 7. DEFINE A HELPER TO RUN REPEATED CV FOR A GIVEN MODEL ---
def repeated_cv_accuracy(model, X, y, n_splits=10, n_repeats=3, random_seed=42):
    """Score `model` with repeated stratified k-fold cross-validation.

    Parameters
    ----------
    model : estimator passed unchanged to `cross_val_score`.
    X, y : features and labels passed unchanged to `cross_val_score`.
    n_splits : number of folds per repetition.
    n_repeats : number of times the k-fold split is repeated.
    random_seed : forwarded as `random_state` to the splitter.

    Returns
    -------
    Mean accuracy over all folds and repetitions, as a percentage.
    """
    fold_accuracies = cross_val_score(
        model,
        X,
        y,
        scoring="accuracy",
        cv=RepeatedStratifiedKFold(
            n_splits=n_splits, n_repeats=n_repeats, random_state=random_seed
        ),
        n_jobs=-1,  # run the folds on all available cores
    )
    return fold_accuracies.mean() * 100

In [None]:
# --- 7. DEFINE A HELPER TO RUN REPEATED CV FOR A GIVEN MODEL ---
# NOTE(review): the previous cell already defines a helper with this exact name
# and signature; this definition shadows it. Keep only one of the two.
def repeated_cv_accuracy(model, X, y, n_splits=10, n_repeats=3, random_seed=42):
    """Return the mean cross-validated accuracy of `model`, in percent.

    The split is a repeated stratified k-fold with `n_splits` folds repeated
    `n_repeats` times, seeded with `random_seed`. `model`, `X` and `y` are
    handed to `cross_val_score` as-is.
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=random_seed,
    )
    per_fold = cross_val_score(
        model, X, y, scoring="accuracy", cv=splitter, n_jobs=-1
    )
    mean_accuracy = per_fold.mean()
    return mean_accuracy * 100  # fraction -> percentage

# --- 8. RUN EACH CLASSIFIER WITH RFE + REPEATED CV ---
from sklearn.pipeline import Pipeline  # kept so the name stays available to other cells
from imblearn.pipeline import Pipeline as ImbPipeline


def _with_smote(*steps):
    """Build an imbalanced-learn pipeline that runs SMOTE before `steps`.

    Cross-validating on data that was oversampled up front leaks information:
    synthetic points interpolated from rows that end up in a test fold are
    present in the training folds, and the test folds themselves contain
    synthetic samples. Putting SMOTE inside the pipeline means it is fitted on
    the training part of each split only, and every test fold consists of
    original, untouched rows.

    Parameters
    ----------
    *steps : (name, estimator) tuples appended after the SMOTE step.

    Returns
    -------
    An unfitted `imblearn.pipeline.Pipeline`.
    """
    return ImbPipeline([("smote", SMOTE(random_state=42)), *steps])


# Models
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "DecisionTree":       DecisionTreeClassifier(random_state=42),
    "BaggingClassifier":  BaggingClassifier(n_estimators=50, random_state=42),
    "RandomForest":       RandomForestClassifier(n_estimators=50, random_state=42),
    "AdaBoost":           AdaBoostClassifier(n_estimators=50, random_state=42),
    "XGBoost":            XGBClassifier(use_label_encoder=False, eval_metric="logloss",
                                        n_estimators=50, random_state=42),
    "NaiveBayes":         GaussianNB(),
    "KNN":                KNeighborsClassifier(),
}

# Initialize results dictionaries (classifier name -> mean accuracy in percent)
results_rfe = {}
results_no_rfe = {}

# Evaluate every model twice on the ORIGINAL (not pre-oversampled) X, y:
# once behind RFE feature selection and once on all features.
for name, clf in models.items():
    # RFE with RandomForest as base estimator for feature selection
    rfe_base = RandomForestClassifier(n_estimators=100, random_state=42)
    rfe = RFE(estimator=rfe_base, n_features_to_select=10)

    # SMOTE -> RFE -> classifier, all refitted inside each CV split
    pipeline = _with_smote(("feature_selector", rfe), ("clf", clf))

    # Accuracy with RFE
    acc_rfe = repeated_cv_accuracy(pipeline, X, y, n_splits=10, n_repeats=3, random_seed=42)
    results_rfe[name] = acc_rfe

    # Accuracy without RFE (SMOTE -> classifier)
    acc_no_rfe = repeated_cv_accuracy(
        _with_smote(("clf", clf)), X, y, n_splits=10, n_repeats=3, random_seed=42
    )
    results_no_rfe[name] = acc_no_rfe

# --- 9. VOTING CLASSIFIER ---
voting_clf = VotingClassifier(
    estimators=[
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))
    ],
    voting='hard'  # majority vote on predicted labels
)
voting_acc = repeated_cv_accuracy(_with_smote(("clf", voting_clf)), X, y, 10, 3, 42)

# --- 10. STACKING CLASSIFIER ---
layer1_estimators = [
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('knn', KNeighborsClassifier()),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))
]
stack_clf = StackingClassifier(
    estimators=layer1_estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    passthrough=False  # the meta-learner sees only the base models' predictions
)
stacking_acc = repeated_cv_accuracy(_with_smote(("clf", stack_clf)), X, y, 10, 3, 42)

# --- 11. PRINT FINAL SUMMARY ---
print("\n==================== Final Results Summary ====================")
print("Classifier                       Performance (Accuracy)")
# Label the two numeric columns so the table can be read on its own.
print(f"{'':<30} {'No RFE':>10}         {'With RFE':>10}")

# One row per base classifier: accuracy without RFE, then with RFE.
for name in models:
    no_rfe_val = results_no_rfe[name]
    rfe_val    = results_rfe[name]
    print(f"{name:<30} {no_rfe_val:>10.2f}         {rfe_val:>10.2f}")

# The ensembles were evaluated without RFE only; previously their scores were
# computed but never reported.
for ensemble_name, ensemble_acc in (
    ("Voting Classifier", voting_acc),
    ("Stacking Classifier", stacking_acc),
):
    print(f"{ensemble_name:<30} {ensemble_acc:>10.2f}         {'-':>10}")


Classifier                       Performance (Accuracy)
LogisticRegression                   56.00         61.00
DecisionTree                          63.00         64.00
BaggingClassifier                     67.00         66.00
RandomForest                          66.50         66.60
AdaBoost                               58.00         61.00
XGBoost                                54.00         66.00
NaiveBayes                             58.00         62.00
KNN                                    55.00         60.00
Voting Classifier                     64.00         -
Stacking Classifier                   78.50         -
