In [1]:
import requests
import json
import pandas as pd
import numpy as np

## Generating data points

In [2]:
# Value pools from which each synthetic applicant's attributes are drawn.

# potential schools for datapoints
schools = ['Brown University', 'Columbia University', 'SUNY Binghamton University', 'SUNY New Paltz',
           'Providence College','Rhode Island School of Design', 'Bentley University', 'Colgate University']

# potential GPAs for datapoints: one-decimal strings '1.2', '1.3', ..., '4.0'.
# Built from integer tenths rather than a float-step np.arange, whose element
# count (and therefore whether the end value sneaks in) depends on rounding.
gpa = [f"{tenths / 10:.1f}" for tenths in range(12, 41)]

# degree options for datapoints
degrees = ['Bachelors','Masters','Phd']

# potential locations for datapoints
locations = ['Providence','Boston','New York City','Los Angeles','Miami','Chicago', 'Detroit', 'Washington D.C.']

# potential genders
gender = ['M', 'F', 'N/A']

# potential veteran status
veteran = ['0','1','N/A']

# potential work authorization
work_ath = ['0','1']

# potential values for disability
disability = ['0','1','N/A']

# potential ethnicities
ethnicity = ['0','1','2','3','4']

# potential roles ('N/A' marks the absence of a role)
roles = ['Junior SWE', 'Senior SWE','Data scientist','Lawyer','ML Engineer','N/A', 'Chef','Bus Driver']

# master list of all attribute pools, excluding roles; the order matches the
# first nine entries of COLUMNS below
candidate_atts = [schools, gpa, degrees, locations, gender, veteran, work_ath, disability, ethnicity]

# columns for dataframe
COLUMNS = ['School Name','GPA','Degree','Location','Gender',
        'Veteran status','Work authorization','Disability','Ethnicity',
        'Role 1', 'Start 1', 'End 1','Role 2','Start 2','End 2','Role 3','Start 3','End 3']


In [3]:
import numpy as np
import pandas as pd

MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
COLUMNS = ['School Name', 'GPA', 'Degree', 'Location', 'Gender',
           'Veteran status', 'Work authorization', 'Disability', 'Ethnicity',
           'Role 1', 'Start 1', 'End 1', 'Role 2', 'Start 2', 'End 2', 'Role 3', 'Start 3', 'End 3']

# Inclusive two-digit year bounds for generated employment dates.
MIN_YEAR = 10
MAX_YEAR = 23
# Number of (role, start, end) triples per candidate; matches the 'Role N' columns in COLUMNS.
N_ROLES = 3
# Probability that the first listed role gets 'N/A' as its end date.
OPEN_ENDED_PROB = 0.1


def _random_period(rng):
    """Draw a ('M/YY' start, 'M/YY' end) pair with the end strictly after the start and within MAX_YEAR."""
    start_month = MONTHS[rng.randint(0, len(MONTHS))]
    start_year = rng.randint(MIN_YEAR, MAX_YEAR + 1)

    # December of the last year leaves no room for a later end date inside the
    # allowed range, so pull the start back one month.
    if start_year == MAX_YEAR and start_month == MONTHS[-1]:
        start_month = MONTHS[-2]

    # randint's upper bound is exclusive: +1 makes MAX_YEAR reachable as an end year.
    end_year = start_year + rng.randint(0, MAX_YEAR - start_year + 1)
    end_month = start_month  # kept when the end falls in a later year

    if end_year == start_year:
        if start_month == MONTHS[-1]:
            # No later month in this year: roll over to January of the next
            # (start_year < MAX_YEAR here, so this stays within bounds).
            end_month = MONTHS[0]
            end_year += 1
        else:
            end_month = rng.randint(start_month + 1, MONTHS[-1] + 1)

    return f"{start_month}/{start_year}", f"{end_month}/{end_year}"


def generate_data(n_samples: int, candidate_atts: list, roles: list, seed: "int | None" = None) -> pd.DataFrame:
    """
    Generate a DataFrame of synthetic job applicants.

    Parameters:
    - n_samples (int): Number of applicants to generate.
    - candidate_atts (list): List of value pools, one per attribute column; one value
      is drawn uniformly from each. Must contain as many pools as there are
      non-role columns in COLUMNS (nine), in the same order.
    - roles (list): Pool of role names. Drawing 'N/A' ends the job history: that
      slot and every later role/start/end slot are filled with 'N/A'.
    - seed (int | None): If given, a dedicated RandomState seeded with this value is
      used so the output is reproducible. If None, the global np.random state is
      used, as before.

    Returns:
    - pd.DataFrame: One row per applicant with columns ['Applicant ID'] + COLUMNS.
      Applicant IDs run from 1 to n_samples. Dates are 'M/YY' strings between
      MIN_YEAR and MAX_YEAR; the first role's end date is 'N/A' with
      probability OPEN_ENDED_PROB.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    history_len = 3 * N_ROLES  # role, start and end for each slot

    all_candidates = []
    for i in range(n_samples):
        candidate = [i + 1]  # Applicant ID

        # One uniformly random value from each attribute pool
        for attribute in candidate_atts:
            candidate.append(attribute[rng.randint(0, len(attribute))])

        cand_roles = [roles[rng.randint(0, len(roles))] for _ in range(N_ROLES)]
        role_history = []

        for index, role in enumerate(cand_roles):
            if role == 'N/A':
                # Pad the remaining slots and stop: no roles are listed after an 'N/A'
                role_history.extend(['N/A'] * (history_len - len(role_history)))
                break

            start, end = _random_period(rng)
            if index == 0 and rng.rand() < OPEN_ENDED_PROB:
                end = 'N/A'
            role_history.extend([role, start, end])

        candidate.extend(role_history)
        all_candidates.append(candidate)

    columns_with_id = ['Applicant ID'] + COLUMNS
    return pd.DataFrame(all_candidates, columns=columns_with_id)

# Build the synthetic applicant table
df = generate_data(4000, candidate_atts, roles)

# Round-trip through JSON text to get a list holding one plain dict per row
dataset_json = json.loads(df.to_json(orient='records'))

In [4]:
# Preview the first rows of the generated applicant table
df.head()

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,1,Rhode Island School of Design,2.1,Bachelors,Chicago,M,1.0,1,0.0,4,Bus Driver,12/21,1/22,Senior SWE,1/10,1/12,Bus Driver,6/17,6/20
1,2,Providence College,2.0,Masters,Providence,F,1.0,0,,1,,,,,,,,,
2,3,Rhode Island School of Design,1.2,Masters,Detroit,F,,0,1.0,1,Senior SWE,5/10,5/13,Junior SWE,12/15,12/18,,,
3,4,Brown University,3.4,Masters,Chicago,,,1,,0,Senior SWE,1/11,,Chef,6/12,6/21,ML Engineer,5/23,8/23
4,5,Providence College,1.2,Phd,Los Angeles,M,1.0,0,1.0,2,Lawyer,1/14,1/18,Data scientist,12/11,12/17,Senior SWE,5/14,9/14


## Sending requests to API

In [5]:
# call the first API - resume scorer
dataset = json.dumps(dataset_json)  # all applicant records as one JSON string

url = 'https://jennjwang.pythonanywhere.com'
headers = {'Content-Type': 'application/json'}

# Without a timeout, requests waits indefinitely if the service never answers
response = requests.post(url, data=dataset, headers=headers, timeout=120)
# Surface HTTP errors here rather than as a confusing JSON/KeyError in a later cell
response.raise_for_status()

In [6]:
# Merge the resume scorer's output back into dataset_json, keyed by applicant id
response_data = json.loads(response.text)
# The 'prediction' field is itself JSON text and needs a second decode
predictions = json.loads(response_data['prediction'])
score_map = dict((item['applicant_id'], item['score']) for item in predictions)

for applicant in dataset_json:
    applicant_id = str(applicant['Applicant ID'])  # looked up in string form
    if applicant_id not in score_map:
        continue  # no score returned for this applicant; record left unchanged
    applicant['Resume score'] = score_map[applicant_id]

In [7]:
# call the second API - candidate scorer
# NOTE(review): `dataset` was serialized in the resume-scorer cell, before
# 'Resume score' was added to dataset_json, so this payload does not contain
# resume scores. Re-serialize dataset_json here if the candidate scorer is
# meant to receive them -- confirm against the API's expected schema.

url = 'https://heonlee.pythonanywhere.com'
headers = {'Content-Type': 'application/json'}

# Without a timeout, requests waits indefinitely if the service never answers
response = requests.post(url, data=dataset, headers=headers, timeout=120)
# Surface HTTP errors here rather than as a confusing JSON/KeyError in a later cell
response.raise_for_status()

In [8]:
# final dataset: attach the candidate scorer's prediction to each applicant record
final_response_data = json.loads(response.text)
# The 'prediction' field is itself JSON text and needs a second decode
final_predictions = json.loads(final_response_data['prediction'])
final_score_map = dict((item['applicant_id'], item['prediction']) for item in final_predictions)

for applicant in dataset_json:
    applicant_id = str(applicant['Applicant ID'])  # looked up in string form
    if applicant_id not in final_score_map:
        continue  # no prediction returned for this applicant; record left unchanged
    applicant['Interview prediction'] = final_score_map[applicant_id]

In [9]:
# Rebuild the table from the records that now carry the 'Resume score' and
# 'Interview prediction' fields; this rebinds `df`, replacing the frame
# created in the generation cell.
df = pd.DataFrame(dataset_json)
df

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score,Interview prediction
0,1,Rhode Island School of Design,2.1,Bachelors,Chicago,M,1,1,0,4,...,12/21,1/22,Senior SWE,1/10,1/12,Bus Driver,6/17,6/20,9.98,0
1,2,Providence College,2.0,Masters,Providence,F,1,0,,1,...,,,,,,,,,8.27,0
2,3,Rhode Island School of Design,1.2,Masters,Detroit,F,,0,1,1,...,5/10,5/13,Junior SWE,12/15,12/18,,,,8.35,1
3,4,Brown University,3.4,Masters,Chicago,,,1,,0,...,1/11,,Chef,6/12,6/21,ML Engineer,5/23,8/23,4.11,0
4,5,Providence College,1.2,Phd,Los Angeles,M,1,0,1,2,...,1/14,1/18,Data scientist,12/11,12/17,Senior SWE,5/14,9/14,6.39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3996,Providence College,3.8,Phd,Washington D.C.,F,1,1,,3,...,4/11,,Senior SWE,11/14,11/20,Data scientist,9/17,9/21,0.99,0
3996,3997,Providence College,3.1,Bachelors,Detroit,F,1,0,,0,...,8/22,9/22,Lawyer,6/11,6/14,Senior SWE,7/12,9/12,9.72,0
3997,3998,Bentley University,1.7,Phd,New York City,,0,1,0,0,...,1/14,1/17,Senior SWE,8/12,8/14,Senior SWE,4/17,4/21,7.96,0
3998,3999,Brown University,3.4,Phd,Washington D.C.,M,,1,0,4,...,,,,,,,,,2.59,0


## Fairness metrics to evaluate the model

In [10]:
# Statistical Parity Difference
def spd(sensitive_attribute, dataset, predicted_labels, majority_class, minority_class):
    """
    Calculate the Statistical Parity Difference (SPD) between majority and minority classes based on predicted labels.

    SPD = P(favorable | minority) - P(favorable | majority), where each probability
    is the mean of the predicted labels within that group.

    Parameters:
    - sensitive_attribute (str): Name of the column representing the sensitive attribute.
    - dataset (pd.DataFrame): The dataset containing the sensitive attribute column.
    - predicted_labels (pd.Series or list-like): Predicted labels, one per row of
      `dataset`; converted with pd.to_numeric. A Series is aligned to
      `dataset.index` by label, any other sequence by position.
    - majority_class: Value representing the majority class in the sensitive attribute.
    - minority_class: Value representing the minority class in the sensitive attribute.

    Returns:
    - spd (float): Statistical Parity Difference between majority and minority classes.
      NaN if either group has no rows.
    """
    predicted_labels = pd.to_numeric(predicted_labels)
    predicted_labels_series = pd.Series(predicted_labels, index=dataset.index)

    # Select group members with positional boolean masks. Selecting by index
    # labels would also pick up rows of the other group whenever the dataset
    # index contains duplicate labels (e.g. after pd.concat).
    group_values = dataset[sensitive_attribute]
    majority_mask = (group_values == majority_class).to_numpy()
    minority_mask = (group_values == minority_class).to_numpy()

    p_majority = predicted_labels_series[majority_mask].mean()
    p_minority = predicted_labels_series[minority_mask].mean()

    spd_val = p_minority - p_majority
    return spd_val

In [11]:
# Disparate Impact
def di(sensitive_attribute, dataset, predicted_labels, majority_class, minority_class):
    """
    Calculate the Disparate Impact (DI) between majority and minority classes based on predicted labels.

    DI = P(favorable | minority) / P(favorable | majority), where each probability
    is the mean of the predicted labels within that group.

    Parameters:
    - sensitive_attribute (str): Name of the column representing the sensitive attribute.
    - dataset (pd.DataFrame): The dataset containing the sensitive attribute column.
    - predicted_labels (pd.Series or list-like): Predicted labels, one per row of
      `dataset`; converted with pd.to_numeric. A Series is aligned to
      `dataset.index` by label, any other sequence by position.
    - majority_class: Value representing the majority class in the sensitive attribute.
    - minority_class: Value representing the minority class in the sensitive attribute.

    Returns:
    - di (float): Disparate Impact between majority and minority classes.
      If the majority rate is 0 the ratio is undefined: returns inf when the
      minority rate is positive and NaN otherwise. NaN if either group has no rows.
    """
    predicted_labels = pd.to_numeric(predicted_labels)
    predicted_labels_series = pd.Series(predicted_labels, index=dataset.index)

    # Select group members with positional boolean masks. Selecting by index
    # labels would also pick up rows of the other group whenever the dataset
    # index contains duplicate labels (e.g. after pd.concat).
    group_values = dataset[sensitive_attribute]
    majority_mask = (group_values == majority_class).to_numpy()
    minority_mask = (group_values == minority_class).to_numpy()

    p_majority = predicted_labels_series[majority_mask].mean()
    p_minority = predicted_labels_series[minority_mask].mean()

    # Handle a zero denominator explicitly instead of relying on the numeric
    # type's division-by-zero behaviour (warning vs. exception).
    if p_majority == 0:
        return float("inf") if p_minority > 0 else float("nan")

    di_val = p_minority / p_majority
    return di_val

In [12]:
# spd for current dataset
"""
Range: The range of SPD is [-1, 1]. 
A value of -1 indicates that all favorable outcomes are allocated to the majority group, 
whereas a value of 1 indicates that all favorable outcomes are allocated to the minority group. 
Perfect Fairness: A value of 0 indicates perfect fairness, 
meaning the probability of receiving a favorable outcome is equal for both the majority and minority groups.
"""

# Gender SPD with 'M' passed as the majority class and 'F' as the minority class.
# spd returns p_minority - p_majority, so a negative result means the 'F' group
# has the lower mean 'Interview prediction'.
spd("Gender", df, df["Interview prediction"], "M", "F")

-0.1477029558793497

In [13]:
# di for current dataset
"""
Range: DI is a ratio, so its range is [0, ∞). 
A value of 0 indicates extreme bias against the minority group, 
and a very high value indicates extreme bias against the majority group. 
Perfect Fairness: A DI of 1 (or close to 1) represents perfect fairness, 
suggesting that the probability of receiving a favorable outcome are equal for both groups.
"""

# Gender DI with 'M' passed as the majority class and 'F' as the minority class.
# di returns p_minority / p_majority, so a result below 1 means the 'F' group
# has the lower mean 'Interview prediction'.
di("Gender", df, df["Interview prediction"], "M", "F")

0.6053650754111584