In [156]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/datasets/pratik150507/students-performance-dataset/student-mat.csv
/kaggle/input/datasets/pratik150507/students-performance-dataset/student-por.csv


In [157]:
math_df = pd.read_csv("/kaggle/input/datasets/pratik150507/students-performance-dataset/student-mat.csv", sep=';')
math_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [158]:
math_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [159]:
math_df.duplicated().sum()

np.int64(0)

In [160]:
math_df.isna().sum().sum()


np.int64(0)

In [161]:
def create_risk(grade):
    if grade < 10:
        return 0   # At-risk
    elif grade < 15:
        return 1   # Average
    else:
        return 2   # High-performing

math_df["risk_level"] = math_df["G3"].apply(create_risk)


In [162]:
X = math_df.drop(["G3", "risk_level"], axis=1)
y = math_df["risk_level"]


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [163]:
numerical_cols = X_train.select_dtypes(include=["int64"]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_num = scaler.fit_transform(X_train[numerical_cols])
X_test_num = scaler.transform(X_test[numerical_cols])


In [164]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False
)

X_train_cat = encoder.fit_transform(X_train[categorical_cols])
X_test_cat = encoder.transform(X_test[categorical_cols])


In [165]:
X_train_final = np.hstack((X_train_num, X_train_cat))
X_test_final = np.hstack((X_test_num, X_test_cat))

In [166]:
from sklearn.linear_model import LogisticRegression
lrg_model = LogisticRegression(
    max_iter=2000,
    solver="lbfgs"
)
lrg_model.fit(X_train_final, y_train)

In [167]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = lrg_model.predict(X_test_final)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8481012658227848

Confusion Matrix:
 [[22  4  0]
 [ 4 31  3]
 [ 0  1 14]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85        26
           1       0.86      0.82      0.84        38
           2       0.82      0.93      0.88        15

    accuracy                           0.85        79
   macro avg       0.84      0.87      0.85        79
weighted avg       0.85      0.85      0.85        79



In [168]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    "max_depth": [2, 3, 4, 5, 6, None],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5, 10],
    "criterion": ["gini", "entropy"],
    "class_weight": [None, "balanced"]
}


In [169]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

best_depth = None
best_test_score = 0

print("Depth | Train Acc | Test Acc")
print("-" * 35)

for depth in range(1, 21):
    
    dt = DecisionTreeClassifier(
        max_depth=depth,
        random_state=42
    )
    
    dt.fit(X_train_final, y_train)
    
    train_acc = accuracy_score(y_train, dt.predict(X_train_final))
    test_acc = accuracy_score(y_test, dt.predict(X_test_final))
    
    print(f"{depth:5d} | {train_acc:.4f}   | {test_acc:.4f}")
    
    if test_acc > best_test_score:
        best_test_score = test_acc
        best_depth = depth

print("\nBest Depth:", best_depth)
print("Best Test Accuracy:", best_test_score)


Depth | Train Acc | Test Acc
-----------------------------------
    1 | 0.7342   | 0.7342
    2 | 0.8924   | 0.9241
    3 | 0.8924   | 0.9241
    4 | 0.9209   | 0.8987
    5 | 0.9430   | 0.8608
    6 | 0.9620   | 0.8861
    7 | 0.9778   | 0.8228
    8 | 0.9968   | 0.8228
    9 | 1.0000   | 0.8608
   10 | 1.0000   | 0.8608
   11 | 1.0000   | 0.8608
   12 | 1.0000   | 0.8608
   13 | 1.0000   | 0.8608
   14 | 1.0000   | 0.8608
   15 | 1.0000   | 0.8608
   16 | 1.0000   | 0.8608
   17 | 1.0000   | 0.8608
   18 | 1.0000   | 0.8608
   19 | 1.0000   | 0.8608
   20 | 1.0000   | 0.8608

Best Depth: 2
Best Test Accuracy: 0.9240506329113924


In [170]:
dt_final = DecisionTreeClassifier(
    max_depth=2,
    random_state=42
)

dt_final.fit(X_train_final, y_train)

# Predictions
y_pred = dt_final.predict(X_test_final)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:\n")
print(cm)

# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Confusion Matrix:

[[25  1  0]
 [ 5 33  0]
 [ 0  0 15]]

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.96      0.89        26
           1       0.97      0.87      0.92        38
           2       1.00      1.00      1.00        15

    accuracy                           0.92        79
   macro avg       0.93      0.94      0.94        79
weighted avg       0.93      0.92      0.92        79



In [171]:
import joblib

joblib.dump({
    "logistic_model": lrg_model,     # logistic regression model
    "decision_tree_model": dt_final, # decision tree model
    "encoder": encoder, 
    "scaler": scaler,
    "categorical_cols": categorical_cols,
    "numerical_cols": numerical_cols
}, "student_risk_models.pkl")

['student_risk_models.pkl']