In [24]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def above_threshold(query_results, T, epsilon):
    T_hat = T + np.random.laplace(loc=0, scale = 2/epsilon)
    
    for idx, q in enumerate(query_results):
        nu_i = np.random.laplace(loc=0, scale = 4/epsilon)
        if q + nu_i >= T_hat:
            return idx
    return None

import requests
from io import StringIO

url = 'https://raw.githubusercontent.com/kierajclarke/Final-Data-Project/main/student_math_clean.csv'
response = requests.get(url)
data = pd.read_csv(StringIO(response.text))


# Preprocessing

First we select columns related to academic metrics and sex and preprocess the data. Since our aim is to analyze the data of highschool students, we want to limit the age range of the students so that outlying ages do not skew accuracy.

In [9]:
# Filter the dataset for ages between 15 and 18, including selected columns
data = data.loc[(data['age'] >= 15) & (data['age'] <= 18), ['age', 'sex', 'grade_1', 'grade_2', 'final_grade']]


# Convert grades to percentages
# Multiply by 5 to convert from 0-20 to 0-100
data[['grade_1', 'grade_2', 'final_grade']] = data[['grade_1', 'grade_2', 'final_grade']] * 5  

# Display the first few rows with grades converted to percentages
print(data.head())

   age sex  grade_1  grade_2  final_grade
0   18   F       25       30           30
1   17   F       25       25           30
2   15   F       35       40           50
3   15   F       75       70           75
4   16   F       30       50           50


# Correlations

Now we want to examine the correlation between male and female

In [10]:
# Separate the data by gender
female_data = data[data['sex'] == 'F']
male_data = data[data['sex'] == 'M']

# Calculate the correlations for females
female_correlation = female_data[['grade_1', 'grade_2', 'final_grade']].corr()
print("Correlation for Female students:")
print(female_correlation)

# Calculate the correlations for males
male_correlation = male_data[['grade_1', 'grade_2', 'final_grade']].corr()
print("\nCorrelation for Males students:")
print(male_correlation)


Correlation for Female students:
              grade_1   grade_2  final_grade
grade_1      1.000000  0.839361      0.78435
grade_2      0.839361  1.000000      0.89311
final_grade  0.784350  0.893110      1.00000

Correlation for Males students:
              grade_1   grade_2  final_grade
grade_1      1.000000  0.857281     0.825429
grade_2      0.857281  1.000000     0.928189
final_grade  0.825429  0.928189     1.000000


In terms of differential privacy, can we get the same results

In [13]:
# Example of adding Laplace noise to sex-academic metrics correlations
epsilon = 1.0 
sensitivity = 1.0  

noisy_female_correlation = female_correlation.apply(
    lambda x: laplace_mech(x, sensitivity, epsilon)
)

print(noisy_female_correlation)

noisy_male_correlation = male_correlation.apply(
    lambda x: laplace_mech(x, sensitivity, epsilon)
)

print(noisy_male_correlation)


              grade_1   grade_2  final_grade
grade_1      1.030977  0.599287     1.373324
grade_2      0.870338  0.759926     1.482084
final_grade  0.815327  0.653037     1.588974
              grade_1   grade_2  final_grade
grade_1      4.011367  0.396288     1.334592
grade_2      3.868648  0.539007     1.437352
final_grade  3.836796  0.467196     1.509163


# Mean Grades

Computing actual means for Male and Female students

In [22]:
# Calculate means for each sex
mean_grade_male = male_data['final_grade'].mean()
mean_grade_female = female_data['final_grade'].mean()

print("Mean final grade for Male students:", mean_grade_male)
print("Mean final grade for Female students:", mean_grade_female)


Mean final grade for Male students: 55.46242774566474
Mean final grade for Female students: 50.284974093264246


Now using Laplace Mechanism to create noise 

In [25]:
def laplace_mean_grade(df, epsilon):
    def create_query(b):
        return df.clip(lower=0, upper=b).sum() - df.clip(lower=0, upper=b+1).sum()
    
    bs = list(range(0, 100, 1))

    # Construct the stream of queries
    queries = [create_query(b) for b in bs]
    
    # Run AboveThreshold, using 1/3 of the privacy budget, to find a good upper clipping parameter/sensitivity
    epsilon_svt = epsilon / 3
    final_b = bs[above_threshold(queries, 0, epsilon_svt)]

    # Compute the noisy sum and noisy count, using 1/3 of the privacy budget for each
    epsilon_sum = epsilon / 3
    epsilon_count = epsilon / 3
    
    noisy_sum = laplace_mech(df.clip(lower=0, upper=final_b).sum(), final_b, epsilon_sum)
    noisy_count = laplace_mech(len(df), 1, epsilon_count)
    
    return noisy_sum / noisy_count

# Apply Laplace noise to counts and sums
noisy_mean_grade_male = laplace_mean_grade(male_data['final_grade'], 1.0)
noisy_mean_grade_female = laplace_mean_grade(female_data['final_grade'], 1.0)

print("Noisy mean final grade for Male students:", noisy_mean_grade_male)
print("Noisy mean final grade for Female students:", noisy_mean_grade_female)


Noisy mean final grade for Male students: 54.038424033831554
Noisy mean final grade for Female students: 49.06521589141111


Now using Gaussian Mechanism to create noise

In [16]:
# Apply Gaussian noise to means for male and female grades
delta = 1e-5
    
noisy_mean_grade_male = gaussian_mech(mean_grade_male, sensitivity, epsilon, delta)
noisy_mean_grade_female = gaussian_mech(mean_grade_female, sensitivity, epsilon, delta)

print("Noisy mean grades for Male students:")
print(noisy_mean_grade_male)

print("\nNoisy mean grades for Female students:")
print(noisy_mean_grade_female)

Noisy mean grades for Male students:
647.3883269193497

Noisy mean grades for Female students:
-181.66161156564064
