In [115]:
import requests
import json
import pandas as pd
import numpy as np
import lime
import lime.lime_tabular
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

## Generating data points

In [172]:
# potential schools for datapoints 
schools = ['Brown University', 'Columbia University', 'SUNY Binghamton University', 'SUNY New Paltz',
           'Providence College','Rhode Island School of Design', 'Bentley University', 'Colgate University']

# potential gpas for datapoints 
gpa = np.arange(1.2,4.1,0.1,dtype=float)

# degree options for datapoints 
degrees = ['Bachelors','Masters','Phd']

# potential locations for datapoints 
locations = ['Providence','Boston','New York City','Los Angeles','Miami','Chicago', 'Detroit', 'Washington D.C.']

# potential genders 
gender = ['M', 'F', 'N/A']

# potential veteran status 
veteran = ['0','1','N/A']

# potential work authorization
work_ath = ['0','1']

# potential value for disability 
disability = ['0','1','N/A']

# potential ethnicities 
ethnicity = ['0','1','2','3','4']

# potential roles 
roles = ['Junior SWE', 'Senior SWE','Data scientist','Lawyer','ML Engineer','N/A', 'Chef','Bus Driver']

# master list for all attributes not include roles 
candidate_atts = [schools, gpa, degrees, locations, gender, veteran, work_ath, disability, ethnicity]

# columns for dataframe 
COLUMNS = ['School Name','GPA','Degree','Location','Gender',
        'Veteran status','Work authorization','Disability','Ethnicity',
        'Role 1', 'Start 1', 'End 1','Role 2','Start 2','End 2','Role 3','Start 3','End 3']


In [173]:
import numpy as np
import pandas as pd

MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
COLUMNS = ['School Name', 'GPA', 'Degree', 'Location', 'Gender',
           'Veteran status', 'Work authorization', 'Disability', 'Ethnicity',
           'Role 1', 'Start 1', 'End 1', 'Role 2', 'Start 2', 'End 2', 'Role 3', 'Start 3', 'End 3']

def generate_data(n_samples: int, candidate_atts: list, roles: list) -> pd.DataFrame:
    all_candidates = []  # List for all datapoints

    # Generate for each datapoint
    for i in range(n_samples):
        candidate = [i + 1]  # New candidate starting with Applicant ID

        # Loop through each attribute type for given candidate
        for attribute in candidate_atts:
            # Pick random attribute from list
            candidate.append(attribute[np.random.randint(0, len(attribute))])

        # Generate role types
        cand_roles = [roles[np.random.randint(0, len(roles))] for _ in range(3)]
        role_history = []  # Candidates job history

        # Loop through candidate roles to populate dates
        for index, role in enumerate(cand_roles):
            if role == 'N/A':
                while len(role_history) != 9:
                    role_history.append('N/A')
                break
            else:
                start_month = MONTHS[np.random.randint(0, len(MONTHS))]
                end_month = start_month  # Initialize end_month same as start_month

                start_year = np.random.randint(10, 24)
                range_high = 23 - start_year
                if range_high <= 0:  # Safeguard against low >= high
                    end_year = 23  # Set to max year if no range is available
                else:
                    end_year = start_year + np.random.randint(0, range_high)

                # If the start year equals the end year, adjust the end month to be after the start month
                if start_year == end_year:
                    if start_month == 12:  # Special case where start month is December
                        end_month = 1
                        end_year += 1  # Increment the year if end month cannot be later
                    else:
                        end_month = np.random.randint(start_month + 1, 13)

                start = f"{start_month}/{start_year}"
                if index == 0 and np.random.rand() < 0.1:  # 10% chance to end as 'N/A'
                    end = 'N/A'
                else:
                    end = f"{end_month}/{end_year}"

                role_history.append(role)
                role_history.append(start)
                role_history.append(end)

        candidate.extend(role_history)
        all_candidates.append(candidate)

    # Insert columns including Applicant ID at the start
    columns_with_id = ['Applicant ID'] + COLUMNS
    df = pd.DataFrame(all_candidates, columns=columns_with_id)

    return df

# Generate the DataFrame
df = generate_data(4000, candidate_atts, roles)

# Convert DataFrame to JSON, one dictionary per row
dataset_json = df.to_json(orient='records') #str type
dataset_json = json.loads(dataset_json) #list type
# print(dataset_json)

In [174]:
print(df.head())
print(df.dtypes)

   Applicant ID                    School Name  GPA     Degree  \
0             1  Rhode Island School of Design  2.2    Masters   
1             2               Brown University  2.7    Masters   
2             3             Providence College  3.1    Masters   
3             4            Columbia University  3.8        Phd   
4             5               Brown University  4.0  Bachelors   

          Location Gender Veteran status Work authorization Disability  \
0       Providence    N/A              1                  1        N/A   
1  Washington D.C.      M              1                  1          0   
2           Boston      F              0                  1          1   
3           Boston    N/A              0                  0        N/A   
4       Providence      F            N/A                  1          1   

  Ethnicity          Role 1 Start 1 End 1      Role 2 Start 2 End 2  \
0         1      Senior SWE    1/11  1/16  Senior SWE    6/15  7/15   
1         3     

## Sending requests to API

In [95]:
def model_predict(data_original):
    
    data = data_original.copy()
    if isinstance(data, pd.DataFrame):
        data['GPA'] = data['GPA'].apply(lambda x: f"{x:.2f}")
        data = data.to_dict(orient='records')
    # Serialize the input data to JSON
    dataset = json.dumps(data)
    
    # Define the headers for JSON content type
    headers = {'Content-Type': 'application/json'}
    
    # Call the first API - resume scorer
    resume_url = 'https://jennjwang.pythonanywhere.com'
    resume_response = requests.post(resume_url, data=dataset, headers=headers)
    resume_response_data = json.loads(resume_response.text)
    resume_predictions = json.loads(resume_response_data['prediction'])
    resume_score_map = {item['applicant_id']: item['score'] for item in resume_predictions}

    # Update the input data with the resume score
    for applicant in data:
        applicant_id = str(applicant['Applicant ID'])
        applicant['Resume score'] = resume_score_map.get(applicant_id, None)
    
    # Serialize the updated data for the next API call
    updated_dataset = json.dumps(data)
    
    # Call the second API - candidate scorer
    candidate_url = 'https://heonlee.pythonanywhere.com'
    candidate_response = requests.post(candidate_url, data=updated_dataset, headers=headers)
    candidate_response_data = json.loads(candidate_response.text)
    final_predictions = json.loads(candidate_response_data['prediction'])
    final_score_map = {item['applicant_id']: item['prediction'] for item in final_predictions}
    
    for applicant in data:
        applicant_id = str(applicant['Applicant ID'])
        applicant['Interview prediction'] = final_score_map.get(applicant_id, 0)
        
    results = pd.DataFrame(data)
    results['GPA'] = results['GPA'].astype(float)
    results['Resume score'] = results['Resume score'].astype(float)
    results['Interview prediction'] = results['Interview prediction'].astype(int)
    
    return results

df_prediction = model_predict(df)
df_prediction

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score,Interview prediction
0,1,Columbia University,2.9,Bachelors,Washington D.C.,F,0,0,0,3,...,10/19,12/19,Lawyer,10/23,11/23,Data scientist,4/16,4/19,4.34,0
1,2,Colgate University,3.8,Phd,Chicago,,1,1,0,1,...,3/17,3/19,Chef,9/10,9/12,Bus Driver,5/11,5/22,6.52,0
2,3,Bentley University,2.8,Masters,Boston,M,,0,,2,...,11/21,11/22,Lawyer,9/13,9/17,Data scientist,11/17,11/21,4.85,0
3,4,Providence College,2.9,Masters,Providence,F,1,1,0,0,...,,,,,,,,,1.24,0
4,5,SUNY New Paltz,2.1,Masters,Chicago,F,,1,,2,...,9/13,9/14,Data scientist,1/23,12/23,Lawyer,8/10,8/19,5.80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3996,Brown University,1.8,Masters,Detroit,F,,0,1,1,...,6/13,6/18,ML Engineer,7/23,9/23,Senior SWE,10/11,12/11,3.35,0
3996,3997,SUNY Binghamton University,3.0,Phd,Chicago,F,0,0,,3,...,6/15,6/19,,,,,,,3.44,1
3997,3998,Bentley University,2.2,Phd,Los Angeles,M,0,0,,0,...,10/13,10/21,Junior SWE,5/21,5/22,Chef,5/21,5/22,1.79,1
3998,3999,Providence College,3.9,Bachelors,New York City,M,,1,0,4,...,,,,,,,,,1.85,1


## Fairness metrics to evaluate the model

In [96]:
# Statistical Parity Difference
def spd(sensitive_attribute, dataset, predicted_labels, majority_class, minority_class):
    """
    Calculate the Statistical Parity Difference (SPD) between majority and minority classes based on predicted labels.

    Parameters:
    - sensitive_attribute (str): Name of the column representing the sensitive attribute.
    - dataset (pd.DataFrame): The dataset containing the sensitive attribute and true outcome variable.
    - predicted_labels (pd.Series): Predicted labels for the outcome variable.
    - majority_class: Value representing the majority class in the sensitive attribute.
    - minority_class: Value representing the minority class in the sensitive attribute.

    Returns:
    - spd (float): Statistical Parity Difference between majority and minority classes.
    """
    predicted_labels = pd.to_numeric(predicted_labels)
    predicted_labels_series = pd.Series(predicted_labels, index=dataset.index)
    majority = dataset[dataset[sensitive_attribute] == majority_class]
    minority = dataset[dataset[sensitive_attribute] == minority_class]

    p_majority = predicted_labels_series[majority.index].mean()
    p_minority = predicted_labels_series[minority.index].mean()

    spd_val =  p_minority - p_majority
    return spd_val

In [97]:
# Disparate Impact
def di(sensitive_attribute, dataset, predicted_labels, majority_class, minority_class):
    """
    Calculate the Disparate Impact (DI) between majority and minority classes based on predicted labels.

    Parameters:
    - sensitive_attribute (str): Name of the column representing the sensitive attribute.
    - dataset (pd.DataFrame): The dataset containing the sensitive attribute and true outcome variable.
    - predicted_labels (pd.Series): Predicted labels for the outcome variable.
    - majority_class: Value representing the majority class in the sensitive attribute.
    - minority_class: Value representing the minority class in the sensitive attribute.

    Returns:
    - di (float): Disparate Impact between majority and minority classes.
    """
    predicted_labels = pd.to_numeric(predicted_labels)
    predicted_labels_series = pd.Series(predicted_labels, index=dataset.index)
    majority = dataset[dataset[sensitive_attribute] == majority_class]
    minority = dataset[dataset[sensitive_attribute] == minority_class]

    p_majority = predicted_labels_series[majority.index].mean()
    p_minority = predicted_labels_series[minority.index].mean()

    di_val = p_minority / p_majority
    return di_val

In [98]:
# spd for current dataset
"""
Range: The range of SPD is [-1, 1]. 
A value of -1 indicates that all favorable outcomes are allocated to the majority group, 
whereas a value of 1 indicates that all favorable outcomes are allocated to the minority group. 
Perfect Fairness: A value of 0 indicates perfect fairness, 
meaning the probability of receiving a favorable outcome is equal for both the majority and minority groups.
"""

spd("Gender", df_prediction, df_prediction["Interview prediction"], "M", "F")

-0.15372753642516587

In [99]:
# di for current dataset
"""
Range: DI is a ratio, so its range is [0, ∞). 
A value of 0 indicates extreme bias against the minority group, 
and a very high value indicates extreme bias against the majority group. 
Perfect Fairness: A DI of 1 (or close to 1) represents perfect fairness, 
suggesting that the probability of receiving a favorable outcome are equal for both groups.
"""

di("Gender", df_prediction, df_prediction["Interview prediction"], "M", "F")

0.581364203836114

## LIME

In [190]:
# Assuming df is your DataFrame
categorical_features = [i for i, col in enumerate(df.columns) if df[col].dtype == 'object']

# Initialize the explainer for classification, specifying categorical features
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=np.array(df),
    feature_names=df.columns,
    class_names=['No Interview', 'Interview'],  # 0 = No Interview, 1 = Interview
    mode='classification',
    categorical_features=categorical_features,  # Specify which features are categorical
    discretize_continuous=True
)

# Select an instance to explain
instance_index = 1  # Adjust this to your specific instance
instance = df.drop(['Applicant ID'], axis=1).iloc[instance_index].values

# Define your model_predict_api function as before
def model_predict_api(data_as_np):
    input_df = pd.DataFrame(data_as_np, columns=df.columns)
    predictions_df = model_predict(input_df)
    return predictions_df['Interview prediction'].values

# Explain the prediction
exp = explainer.explain_instance(
    instance,
    model_predict_api, 
    num_features=5, 
    top_labels=1
)
exp.show_in_notebook(show_table=True, show_all=False)

ValueError: could not convert string to float: 'Rhode Island School of Design'