In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
# Load the dataset
file_path = "/Users/lakshmimahadevan/Downloads/student+performance/student/student-mat.csv"
df = pd.read_csv(file_path, sep=";")

In [29]:
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [9]:
# Feature selection
# selecting a subset of relevant features
features = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 
            'schoolsup', 'famsup', 'paid', 'activities', 'higher', 'internet', 
            'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

In [11]:
X = df[features]

In [13]:
# Target variable--> (dropout risk: 1 if G3 < 10, otherwise 0)
y = np.where(df['G3'] < 10, 1, 0)

In [15]:
# Converting categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

In [17]:
# Splitign the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Standardize the feature variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [23]:
# Model predictions
y_pred = model.predict(X_test)

In [25]:
# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[47  5]
 [17 10]]

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.90      0.81        52
           1       0.67      0.37      0.48        27

    accuracy                           0.72        79
   macro avg       0.70      0.64      0.64        79
weighted avg       0.71      0.72      0.70        79



In [27]:
# Feature importance
importance = pd.Series(model.coef_[0], index=X.columns)
importance = importance.sort_values(ascending=False)
print("\nFeature Importance:")
print(importance)



Feature Importance:
failures          0.583612
goout             0.484266
famsup_yes        0.424205
age               0.338017
schoolsup_yes     0.242641
activities_yes    0.135994
absences          0.073103
health            0.054026
romantic_yes      0.047444
Dalc              0.044204
traveltime       -0.035221
Fedu             -0.075613
internet_yes     -0.096919
Medu             -0.136664
famrel           -0.145652
studytime        -0.152358
Walc             -0.156893
freetime         -0.177438
higher_yes       -0.238169
paid_yes         -0.245309
dtype: float64
