Connected to causal_test (Python 3.9.19)

In [None]:
# Goals: 
# - Show how to use correlation and association to understand students who fail a subject.
# - Use modelling to draw conclusions and develop experiments.

# Questions: 
# - What influences whether a student will fail a subject?
# - What can be done to decrease students failing?

# This scenario is a pre-treatment analysis.

In [1]:
import pandas as pd
import numpy as np
import pytimetk as tk
import re
import math
from missingno import matrix

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.families.links import logit
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import graphviz
import networkx as next
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw student CSV from the relative `data/` directory.
por_raw_df = pd.read_csv('data/student-por_raw.csv')

In [3]:
# Work on an independent deep copy so the raw frame is never modified.
por_df = por_raw_df.copy(deep=True)

In [4]:
# Display the working copy (pandas truncates the rendered rows and columns).
por_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


In [None]:
# Per-student mean of the three grade columns G1, G2 and G3.
por_df['grade_avg'] = por_df.loc[:, ['G1', 'G2', 'G3']].mean(axis='columns')

In [None]:
# Binary outcome: 0 when the final grade G3 is below 10, otherwise 1.
por_df['passed'] = por_df['G3'].lt(10).map({True: 0, False: 1})

In [None]:
# Class balance of the outcome: raw counts per class and their share of all rows.
total_students = por_df.shape[0]
pass_fail_counts = por_df['passed'].value_counts()
pass_fail_proportions = pass_fail_counts.div(total_students)

In [None]:
# Report the class balance. Both classes are looked up with `.get(label, 0)` so
# that a class with no students prints 0 instead of raising KeyError (the
# original used plain indexing for the "passed" class only).
print(f"Number of students who passed: {pass_fail_counts.get(1, 0)}")
print(f"Number of students who failed: {pass_fail_counts.get(0, 0)}")
print(f"Proportion of students who passed: {pass_fail_proportions.get(1, 0):.3f}")
print(f"Proportion of students who failed: {pass_fail_proportions.get(0, 0):.3f}")

In [None]:
# Overview of the frame via the `glimpse` accessor (presumably registered on
# DataFrames by the pytimetk import — confirm against the pytimetk docs).
por_df.glimpse()

In [None]:
# Remove the individual grade columns; `grade_avg` and `passed` were derived
# from them in earlier cells and are kept.
por_df = por_df.drop(['G1', 'G2', 'G3'], axis='columns')

In [None]:
# Binarize every column with the `binarize` accessor (presumably pytimetk:
# numeric columns binned, categoricals one-hot encoded — TODO confirm). The
# `passed__0` target used by the correlate step suggests `<column>__<level>` names.
por_binarized_df = por_df.binarize()

In [None]:
# Inspect the binarized columns so the target column name can be read off.
por_binarized_df.glimpse()

In [None]:
# Correlate the binarized features against `passed__0` — presumably the
# indicator for passed == 0, i.e. students who failed (TODO confirm naming).
por_correlated_df = por_binarized_df.correlate(target="passed__0")

In [None]:
# Display the correlation table computed against the `passed__0` target.
por_correlated_df

In [None]:
# Plot the correlations as a funnel chart (presumably ordered by strength of
# association with the target — confirm against the pytimetk docs).
por_correlated_df.plot_correlation_funnel()

In [None]:
# Modelling frame: everything except `grade_avg`, which was computed from the
# same grades as the `passed` outcome.
por_subset_df = por_df.drop('grade_avg', axis='columns')

In [None]:
# Display the modelling frame (no `grade_avg` column).
por_subset_df

In [None]:
# Two-column frame for the single-predictor model: outcome plus `studytime`.
study_df = por_subset_df.loc[:, ['passed', 'studytime']].copy()

In [None]:
# Display the outcome / studytime pair used for the first logit model.
study_df

In [None]:
# One-hot encode `studytime`, leaving out the first (lowest) level as the
# reference category. A full set of dummies sums to 1 in every row, which is
# perfectly collinear with the intercept added by sm.add_constant and leaves
# the logit coefficients unidentified (the dummy-variable trap). Each remaining
# coefficient is a log-odds difference relative to the omitted level.
studytime_dummies = pd.get_dummies(study_df['studytime'], prefix='studytime', drop_first=True)

In [None]:
# Cast the response and the dummy-encoded predictors to float for statsmodels.
y = study_df['passed'].astype('float64')
X = studytime_dummies.astype('float64')


In [None]:
# Add an intercept column to the design matrix (statsmodels does not add one
# automatically for array/DataFrame inputs).
X = sm.add_constant(X)

In [None]:
# Fit a logistic regression of pass/fail on the studytime design matrix.
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()

In [None]:
# Print the statsmodels summary for the studytime-only logit fit.
print(result.summary())

In [None]:
# Odds ratios: exponentiate the fitted log-odds coefficients of the most recent
# logit fit. Computing them from `result.params` shows every coefficient at
# once (a cell only displays its last expression, so the first of two bare
# expressions is silently discarded) and cannot go stale the way values copied
# by hand from the printed summary can.
np.exp(result.params)

In [None]:
# Two-column frame for the single-predictor model: outcome plus `health`.
health_df = por_subset_df.loc[:, ['passed', 'health']].copy()

In [None]:
# One-hot encode `health`, leaving out the first level as the reference
# category: a full set of dummies sums to 1 in every row and would be perfectly
# collinear with the intercept added by sm.add_constant (dummy-variable trap).
health_dummies = pd.get_dummies(health_df['health'], prefix='health', drop_first=True)
# Cast the response and predictors to float for statsmodels.
y = health_df['passed'].astype(float)
X = health_dummies.astype(float)

In [None]:
# Add the intercept, then fit a logistic regression of pass/fail on health.
X = sm.add_constant(X)
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()

In [None]:
# Print the statsmodels summary for the health-only logit fit.
print(result.summary())

In [None]:
# Two-column frame for the single-predictor model: outcome plus `sex`.
sex_df = por_subset_df.loc[:, ['passed', 'sex']].copy()

In [None]:
# Display the outcome / sex pair used for the next logit model.
sex_df

In [None]:
# One-hot encode `sex`, leaving out the first level as the reference category:
# with both indicator columns present they sum to 1 in every row and are
# perfectly collinear with the intercept added by sm.add_constant
# (dummy-variable trap). The single remaining coefficient is the log-odds
# difference relative to the omitted level.
sex_dummies = pd.get_dummies(sex_df['sex'], prefix='sex', drop_first=True)
# Cast the response and predictors to float for statsmodels.
y = sex_df['passed'].astype(float)
X = sex_dummies.astype(float)

In [None]:
# Add the intercept, then fit a logistic regression of pass/fail on sex.
X = sm.add_constant(X)
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()

In [None]:
# Print the statsmodels summary for the sex-only logit fit.
print(result.summary())

In [None]:
# The single-variable models above do not give an accurate picture of the problem, so we need to account for confounders.

In [None]:
# Multivariable model: regress pass/fail on all remaining features at once with
# an L1-penalised (lasso) logistic GLM, then score it on a held-out split.

# One-hot encode the categorical columns; drop_first leaves one reference level
# per variable out so the dummies are not collinear with the intercept.
df = pd.get_dummies(por_subset_df.copy(), drop_first=True)

# Depending on the pandas version the dummies come back as bool; cast every
# bool column to 0/1 integers.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column: int for column in bool_columns})

# Reproducible 80/20 split; the test set is every row not sampled for training.
train_df = df.sample(frac=0.8, random_state=123)
test_df = df.drop(train_df.index)

# `passed` is already a numeric 0/1 column, so get_dummies leaves it untouched
# and no `passed_1` dummy is ever created (looking that name up raised KeyError).
target_column = 'passed'

X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

# has_constant='add' forces the intercept column even when some feature happens
# to be constant within a split; the default ('skip') would silently omit it
# and leave the train and test design matrices with different columns.
X_train_const = sm.add_constant(X_train, has_constant='add')

model = GLM(y_train, X_train_const, family=Binomial(link=logit()))
result = model.fit_regularized(method='elastic_net', alpha=0.01, L1_wt=1.0)  # L1_wt=1.0 for Lasso

X_test = sm.add_constant(test_df.drop(columns=[target_column]), has_constant='add')
y_test = test_df[target_column]

predicted_probabilities = result.predict(X_test)

# Discrimination on the held-out rows.
roc_auc = roc_auc_score(y_test, predicted_probabilities)

# Coefficient table labelled by the design-matrix columns, intercept included.
# (The original indexed by X_train.columns, which has no 'const' entry, so the
# intercept was left out of the table.)
params_df = pd.DataFrame(
    {'Coefficient': np.asarray(result.params)},
    index=X_train_const.columns,
)

# Show full tables from here on so the coefficient listing is not truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [None]:
# Display the ROC AUC of the predicted probabilities on the held-out test rows.
roc_auc

In [None]:
# Display the coefficient table from the regularized GLM fit.
params_df