# Reweighing
The algorithm selected for transforming the original dataset is Reweighing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import Reweighing


pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [2]:
# Load the dataset
ogf = pd.read_excel("student-por.xlsx")

# Convert 'sex' column to binary values (0 for Female, 1 for Male)
ogf["sex"] = ogf["sex"].map({"F": 0, "M": 1})

# Ensure all categorical columns are properly encoded
ogf = ogf.apply(lambda col: col.astype('category').cat.codes if col.dtypes == 'object' else col)

# Convert the dataset into an AIF360-compatible format
dataset = StandardDataset(
    ogf,
    label_name="G3",  # Outcome variable
    favorable_classes=[ogf["G3"].max()],  # Favorable outcome (highest grade)
    protected_attribute_names=["sex"],  # Protected attribute
    privileged_classes=[[1]],  # Male is privileged group
)

# Apply the Reweighing algorithm
reweighing = Reweighing(unprivileged_groups=[{"sex": 0}], privileged_groups=[{"sex": 1}])
reweighed_dataset = reweighing.fit_transform(dataset)

# Add weights to the original dataset
ogf["weights"] = reweighed_dataset.instance_weights

# Reverse-map sex column: 0 → F, 1 → M
ogf["sex"] = ogf["sex"].map({0: "F", 1: "M"})

# Save the transformed dataset
ogf.to_excel("reweighed_dataset.xlsx", index=False)

## Statistical Parity and Disparate Impact Reweighed
### Calculating Sex vs. Grades (First Period, Second Period, Final Grade)

In [3]:
df = pd.read_excel("reweighed_dataset.xlsx")
grades = ['G1', 'G2', 'G3']

In [4]:
results = []
for grade in grades:
    df['pass'] = (df[grade] >= 10).astype(int)
    pass_rates = df.groupby('sex')['pass'].mean()
    
    pass_f = pass_rates['F']
    pass_m = pass_rates['M']

    stat_par = pass_f - pass_m
    dis_imp = pass_f / pass_m if pass_m > 0 else float('inf')

    results.append({
        'Grade': grade,
        'Predicted Pass Rate (Female)': round(pass_f, 3),
        'Predicted Pass Rate (Male)': round(pass_m, 3),
        'Statistical Parity Diff': round(stat_par, 3),
        'Disparate Impact (F/M)': round(dis_imp, 3)
    })
print(pd.DataFrame(results))

  Grade  Predicted Pass Rate (Female)  Predicted Pass Rate (Male)  \
0    G1                         0.789                       0.714   
1    G2                         0.812                       0.726   
2    G3                         0.869                       0.812   

   Statistical Parity Diff  Disparate Impact (F/M)  
0                    0.074                   1.104  
1                    0.086                   1.119  
2                    0.057                   1.071  


### Calculating Age vs. Grades (First Period, Second Period, Final Grade)

In [14]:
age_grade_results = []
df['age_group'] = pd.cut(df['age'], bins = [14,17,df['age'].max()], labels=['15-17', '18-22'])

for grade in grades:
    ## Redefining passing threshold
    df['pass'] = (df[grade] >= 10).astype(int)
    pass_rates = df.groupby('age_group', observed=True)['pass'].mean()

    older = pass_rates['18-22']
    younger = pass_rates['15-17']

    ## Privileged group is younger, unprivileged group is older
    stat_par = older - younger
    dis_imp = older / younger if younger > 0 else float('inf')

    ## Perfect fairness threshold for statistical parity difference is 0
    age_grade_results.append({
        'Grade': grade,
        'Predicted Pass Rate (18+)': round(older, 3),
        'Predicted Pass Rate (15-17)': round(younger, 3),
        'Statistical Parity Diff': round(stat_par, 3),
        'Disparate Impact': round(dis_imp, 3)
    })

print(pd.DataFrame(age_grade_results))

  Grade  Predicted Pass Rate (18+)  Predicted Pass Rate (15-17)  \
0    G1                      0.652                        0.799   
1    G2                      0.696                        0.808   
2    G3                      0.790                        0.868   

   Statistical Parity Diff  Disparate Impact  
0                   -0.147             0.816  
1                   -0.112             0.862  
2                   -0.077             0.911  


# Splitting the Original Dataset
To assure that the reweighing reduced bias, the dataset will be split into testing and training data. 
1. The classifer will be used on the original dataset.
2. The testing dataset will be used to compute the fairness metrics 'statistical parity' and 'disparate impact'. 

In [6]:
og_df = pd.read_excel('student-por.xlsx')
X = og_df.drop(columns=['G3'])
y = (og_df['G3'] >= 10).astype(int)
X_coded = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X_coded, y, test_size=0.5, random_state=42)

## Training Classifier on Original Dataset
Dependent variable selected: G3 (Final Grade)

In [7]:
clf = RandomForestClassifier(random_state=42)

X_full_code = pd.get_dummies(X, drop_first=True)

clf.fit(X_train, y_train)

og_df['predicted_label'] = clf.predict(X_full_code)
y_pred_test = clf.predict(X_test)
test_df = X_test.copy()
test_df['pass'] = y_test
test_df['predicted_label'] = y_pred_test
test_df['sex'] = og_df.loc[X_test.index, 'sex']
test_df['age'] = og_df.loc[X_test.index, 'age']
test_df['G3'] = og_df.loc[X_test.index, 'G3']

## Compute Fairness Metrics with Testing Dataset
## Statistical Parity and Disparate Impact
### Calculating Sex vs. Grades (Final Grade)

In [8]:
sex_test_results = []
test_df['pass'] = (test_df['G3'] >= 10).astype(int)
pass_rates = test_df.groupby('sex')['predicted_label'].mean()
    
pass_f = pass_rates['F']
pass_m = pass_rates['M']

stat_par = pass_f - pass_m
dis_imp = pass_f / pass_m if pass_m > 0 else float('inf')

sex_test_results.append({
    'Grade': grade,
    'Predicted Pass Rate (Female)': round(pass_f, 3),
    'Predicted Pass Rate (Male)': round(pass_m, 3),
    'Statistical Parity Diff': round(stat_par, 3),
    'Disparate Impact (F/M)': round(dis_imp, 3)
})
print(pd.DataFrame(sex_test_results))

  Grade  Predicted Pass Rate (Female)  Predicted Pass Rate (Male)  \
0    G3                         0.883                       0.854   

   Statistical Parity Diff  Disparate Impact (F/M)  
0                    0.029                   1.034  


In [9]:
age_test_results = []
test_df['age_group'] = pd.cut(test_df['age'], bins = [14,17,test_df['age'].max()], labels=['15-17', '18-22'])
test_df['pass'] = (test_df['G3'] >= 10).astype(int)
pass_rates = test_df.groupby('age_group', observed=True)['pass'].mean()
    
pass_18 = pass_rates['18-22']
pass_15 = pass_rates['15-17']

stat_par = pass_18 - pass_15
dis_imp = pass_18 / pass_15 if pass_15 > 0 else float('inf')

age_test_results.append({
    'Grade': grade,
    'Predicted Pass Rate (18-22)': round(pass_18, 3),
    'Predicted Pass Rate (15-17)': round(pass_15, 3),
    'Statistical Parity Diff': round(stat_par, 3),
    'Disparate Impact': round(dis_imp, 3)
})
print(pd.DataFrame(age_test_results))

  Grade  Predicted Pass Rate (18-22)  Predicted Pass Rate (15-17)  \
0    G3                        0.809                         0.89   

   Statistical Parity Diff  Disparate Impact  
0                   -0.081             0.909  


# Splitting the Transformed Dataset
To assure that the reweighing reduced bias, the dataset will be split into testing and training data. 
1. The classifer will be used on the transformed dataset.
2. The testing dataset will be used to compute the fairness metrics 'statistical parity' and 'disparate impact'. 

In [10]:
reweighed_X = df.drop(columns=['G3'])
reweighed_y = (df['G3'] >= 10).astype(int)
re_X_code = pd.get_dummies(reweighed_X, drop_first=True)

re_X_train, re_X_test, re_y_train, re_y_test = train_test_split(re_X_code, reweighed_y, test_size=0.5, random_state=42)

In [11]:
re_X_full = pd.get_dummies(reweighed_X, drop_first=True)

clf.fit(re_X_train, re_y_train)

df['predicted_label'] = clf.predict(re_X_full)
y_pred_weighed = clf.predict(re_X_test)
weighed_df = re_X_test.copy()
weighed_df['pass'] = y_test
weighed_df['predicted_label'] = y_pred_weighed
weighed_df['sex'] = og_df.loc[X_test.index, 'sex']
weighed_df['age'] = og_df.loc[X_test.index, 'age']
weighed_df['G3'] = og_df.loc[X_test.index, 'G3']

## Compute Fairness Metrics with Testing Dataset
## Statistical Parity and Disparate Impact
### Calculating Sex vs. Grades (Final Grade)

In [12]:
sex_weighed_results = []
weighed_df['pass'] = (weighed_df['G3'] >= 10).astype(int)
pass_rates = weighed_df.groupby('sex')['predicted_label'].mean()
    
pass_f = pass_rates['F']
pass_m = pass_rates['M']

stat_par = pass_f - pass_m
dis_imp = pass_f / pass_m if pass_m > 0 else float('inf')

sex_weighed_results.append({
    'Grade': grade,
    'Predicted Pass Rate (Female)': round(pass_f, 3),
    'Predicted Pass Rate (Male)': round(pass_m, 3),
    'Statistical Parity Diff': round(stat_par, 3),
    'Disparate Impact (F/M)': round(dis_imp, 3)
})
print(pd.DataFrame(sex_weighed_results))

  Grade  Predicted Pass Rate (Female)  Predicted Pass Rate (Male)  \
0    G3                         0.894                       0.832   

   Statistical Parity Diff  Disparate Impact (F/M)  
0                    0.062                   1.074  


In [17]:
age_weighed_results = []
weighed_df['age_group'] = pd.cut(weighed_df['age'], bins = [14,17,weighed_df['age'].max()], labels=['15-17', '18-22'])
weighed_df['pass'] = (weighed_df['G3'] >= 10).astype(int)
weighed_pass_rates = weighed_df.groupby('age_group', observed=True)['pass'].mean()
    
pass_18 = weighed_pass_rates['18-22']
pass_15 = weighed_pass_rates['15-17']

stat_par = pass_18 - pass_15
dis_imp = pass_18 / pass_15 if pass_15 > 0 else float('inf')

age_weighed_results.append({
    'Grade': grade,
    'Predicted Pass Rate (18-22)': round(pass_18, 3),
    'Predicted Pass Rate (15-17)': round(pass_15, 3),
    'Statistical Parity Diff': round(stat_par, 3),
    'Disparate Impact': round(dis_imp, 3)
})
print(pd.DataFrame(age_weighed_results))

  Grade  Predicted Pass Rate (18-22)  Predicted Pass Rate (15-17)  \
0    G3                        0.809                         0.89   

   Statistical Parity Diff  Disparate Impact  
0                   -0.081             0.909  
