In [17]:
import numpy as np 
import pandas as pd 
import os
# Enumerate every file found beneath /kaggle/input and echo its full path,
# one per line, in the order os.walk visits the directories.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

/kaggle/input/datasets/pratik150507/students-performance-dataset/student-mat.csv
/kaggle/input/datasets/pratik150507/students-performance-dataset/student-por.csv


In [19]:
# Read the semicolon-delimited student-mat.csv file into a DataFrame and
# preview its first rows as the cell output.
MATH_CSV_PATH = "/kaggle/input/datasets/pratik150507/students-performance-dataset/student-mat.csv"
math_df = pd.read_csv(MATH_CSV_PATH, sep=';')
math_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [20]:
# Print the frame's index range plus each column's non-null count and dtype.
math_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [21]:
# Count rows that exactly repeat an earlier row (all columns compared).
math_df.duplicated().sum()

np.int64(0)

In [22]:
# Total number of missing cells: per-column NaN counts, then summed over columns.
math_df.isna().sum().sum()


np.int64(0)

In [23]:
def create_risk(grade, at_risk_below=10, high_from=15):
    """Map a grade to an ordinal risk class.

    Parameters
    ----------
    grade : int or float
        The grade to bucket.
    at_risk_below : int or float, default 10
        Grades strictly below this value are labelled at-risk (0).
    high_from : int or float, default 15
        Grades at or above this value are labelled high-performing (2).

    Returns
    -------
    int
        0 for at-risk, 1 for average, 2 for high-performing.

    Raises
    ------
    ValueError
        If ``grade`` is NaN, or if ``at_risk_below`` exceeds ``high_from``.
    """
    if at_risk_below > high_from:
        raise ValueError(
            "at_risk_below must not exceed high_from "
            f"(got {at_risk_below!r} > {high_from!r})"
        )
    # NaN is the only value that is not equal to itself. It also compares False
    # against every threshold, so without this guard a missing grade would fall
    # through to the last branch and be labelled high-performing.
    if grade != grade:
        raise ValueError("grade is NaN; cannot assign a risk level")

    if grade < at_risk_below:
        return 0   # At-risk
    if grade < high_from:
        return 1   # Average
    return 2       # High-performing

# Derive the target label by bucketing each G3 value with create_risk,
# then attach it to the frame as a new column.
risk_levels = math_df["G3"].apply(create_risk)
math_df["risk_level"] = risk_levels


In [24]:
# Target: the derived risk class. Features: every other column except G3,
# the grade the class was computed from.
# NOTE(review): G1 and G2 remain in X — confirm they are available at prediction time.
y = math_df["risk_level"]
X = math_df.drop(columns=["G3", "risk_level"])


from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class mix
# comparable in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [25]:
# Numeric features: select every numeric dtype rather than int64 alone, so a
# float or narrower-integer column is scaled instead of silently left out.
numerical_cols = X_train.select_dtypes(include="number").columns
# Categorical features: the object-dtype (string) columns, to be one-hot encoded.
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# Only these two groups are stacked into the final design matrix, so a column
# that matches neither would vanish from the model without any error.
# Fail loudly instead.
unassigned_cols = X_train.columns.difference(numerical_cols.union(categorical_cols))
if len(unassigned_cols) > 0:
    raise ValueError(
        "Columns with an unhandled dtype would be dropped from the features: "
        f"{list(unassigned_cols)}"
    )

from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns. The scaler is fitted on the training rows
# only and then reused unchanged on the test rows.
scaler = StandardScaler()

train_numeric = X_train[numerical_cols]
test_numeric = X_test[numerical_cols]

X_train_num = scaler.fit_transform(train_numeric)
X_test_num = scaler.transform(test_numeric)


In [26]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding of the categorical columns, fitted on the training
# rows only and reused unchanged on the test rows. The first level of each
# feature is dropped.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category not
# seen during fit is encoded as all zeros, which is indistinguishable from the
# dropped first level — confirm that is acceptable for new data.
encoder = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")

train_categorical = X_train[categorical_cols]
test_categorical = X_test[categorical_cols]

X_train_cat = encoder.fit_transform(train_categorical)
X_test_cat = encoder.transform(test_categorical)


In [27]:
# Join the two feature blocks column-wise: scaled numeric columns first,
# one-hot columns second. Later feature-name lists rely on this order.
X_train_final = np.concatenate([X_train_num, X_train_cat], axis=1)
X_test_final = np.concatenate([X_test_num, X_test_cat], axis=1)

In [28]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier (lbfgs solver, iteration cap of 2000),
# trained on the combined numeric + one-hot training matrix.
model = LogisticRegression(solver="lbfgs", max_iter=2000)
model.fit(X_train_final, y_train)

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out rows, then report overall accuracy, the confusion
# matrix and the per-class precision/recall/F1 table.
y_pred = model.predict(X_test_final)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


Accuracy: 0.8481012658227848

Confusion Matrix:
 [[22  4  0]
 [ 4 31  3]
 [ 0  1 14]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85        26
           1       0.86      0.82      0.84        38
           2       0.82      0.93      0.88        15

    accuracy                           0.85        79
   macro avg       0.84      0.87      0.85        79
weighted avg       0.85      0.85      0.85        79



In [None]:
# Feature labels: numeric columns first, then the encoder's expanded one-hot
# names. This is the same order in which the two blocks were stacked.
cat_feature_names = encoder.get_feature_names_out(categorical_cols)
feature_names = [*numerical_cols, *cat_feature_names]

# Transpose the fitted coefficient matrix so each feature becomes a row and
# each of the three risk classes a column.
coefs = model.coef_
class_labels = ["At-risk (0)", "Average (1)", "High-performing (2)"]
importance_df = pd.DataFrame(coefs.T, index=feature_names, columns=class_labels)

# Features with the largest positive weight toward the at-risk class
at_risk_ranking = importance_df.sort_values(by="At-risk (0)", ascending=False)
print("\nTop Features Increasing At-risk:")
print(at_risk_ranking.head(10))

# Overall strength: sum of absolute weights across the three class columns.
# This is computed before the new column exists, so it is not included in
# its own sum.
importance_df["Total_Importance"] = importance_df.abs().sum(axis=1)

overall_ranking = importance_df.sort_values(by="Total_Importance", ascending=False)
print("\nOverall Most Important Features:")
print(overall_ranking.head(15))


Top Features Increasing At-risk:
                At-risk (0)  Average (1)  High-performing (2)
age                0.476990    -0.210908            -0.266081
activities_yes     0.429012    -0.159697            -0.269314
absences           0.424318     0.133252            -0.557569
guardian_other     0.398105    -0.306924            -0.091181
nursery_yes        0.388532     0.419856            -0.808388
Fjob_health        0.353246    -0.178199            -0.175047
Fjob_services      0.349976    -0.213390            -0.136586
goout              0.349223    -0.051144            -0.298079
sex_M              0.341559    -0.473363             0.131804
Dalc               0.338007     0.178023            -0.516030

Overall Most Important Features:
                At-risk (0)  Average (1)  High-performing (2)  \
G2                -3.863880     0.126781             3.737099   
G1                -1.898938    -0.051930             1.950868   
nursery_yes        0.388532     0.419856            -0.

In [31]:
import joblib

# Persist everything needed to score new rows in one file: the fitted model,
# the fitted encoder and scaler, and the column lists that say which input
# columns feed each preprocessing step.
# NOTE(review): this is a pickle-based artifact — load it only from a trusted
# source, and presumably with the same library versions used here; confirm.
joblib.dump({
    "model": model,
    "encoder": encoder,
    "scaler": scaler,
    "categorical_cols": categorical_cols,
    "numerical_cols": numerical_cols
}, "student_risk_model.pkl")


['student_risk_model.pkl']