In [1]:
import math
import random

import numpy as np
import pandas as pd

In [2]:
# loadPpl - computes how many individuals belong to one race x gender group, by scaling a fixed population of
# 500 people by the group's gender share and then by its race share.
# @params: race is one of the keys of the race-share table below, gender is either "male" or "female"
# @return: the head count of the group as an int (rounded down). An unknown gender or race raises a KeyError.
def loadPpl(race, gender):
    population = 500

    # The gender share is looked up first, then the race share.
    gender_share = {"male": .60, "female": .40}[gender]
    race_share = {
        "Caucasian": .55,
        "African-American": .14,
        "Asian": .15,
        "Hispanic": .10,
        "Other": .06
    }[race]

    # Keep the multiplication order (population, gender, race) so the float result is reproducible.
    head_count = population * gender_share * race_share
    return int(math.floor(head_count))


# Run Cell Below to Form Up Dataset

In [3]:
# insertMetric1Skews - a function which skews the features correlated to the first metric, the Manager's
# Assessment Score, and does some last minute data cleaning. In order, it:
#   1. normalizes "Educational Prestige" by its maximum and rounds it to 2 decimals,
#   2. discretizes the continuous "Education" values into 5 rank-based groups labelled 5 (highest) to 1 (lowest),
#   3. shuffles the rows and resets the index to 0..n-1,
#   4. adds 10 to every "Manager's Assessment Score",
#   5. subtracts .3 from the "Educational Prestige" of African-American and Hispanic individuals,
#   6. subtracts 2.4 from the "Soft Skills" of female Asian, Hispanic and African-American individuals.
# The score itself is never penalized directly; the bias is injected through the two features above.
# The number of rows hit by the soft-skills penalty is printed as a diagnostic.
# @params: The dataset is the df of individuals to manipulate. It needs the columns "Educational Prestige",
# "Education", "Manager's Assessment Score", "Soft Skills", "Race" and "Gender". It is not modified.
# @return: A shuffled copy of the df with the changes above applied.
def insertMetric1Skews(dataset):
    df = dataset.copy()

    # Prestige Normalization
    prestigeMax = df["Educational Prestige"].max()
    df["Educational Prestige"] = (df["Educational Prestige"] / prestigeMax).round(2)

    # Education Discretization into quintiles. Each row's label is derived from its rank in the
    # descending sort, so datasets whose length is not a multiple of 5 also get exactly one label
    # per row. For lengths divisible by 5 this yields five equal groups.
    df = df.sort_values(["Education"], ascending=False)
    numRows = len(df)
    ranks = np.arange(numRows)
    # max(..., 1) only guards the empty dataset against a division by zero.
    df["Education"] = 5 - (ranks * 5) // max(numRows, 1)
    df = df.sample(frac=1).reset_index(drop=True)  # Need to shuffle after ordering.

    # Increase in the manager's assessment score to try and resolve the weights.
    df["Manager's Assessment Score"] = df["Manager's Assessment Score"] + 10

    # Actual skew. Boolean masks are aligned with df row by row, so the penalties cannot land on
    # the wrong individuals regardless of what the index looks like.
    prestigePenalized = df["Race"].isin(["African-American", "Hispanic"])
    df["Educational Prestige"] = np.where(
        prestigePenalized, df["Educational Prestige"] - .3, df["Educational Prestige"]
    )

    softSkillsPenalized = (
        df["Race"].isin(["Asian", "Hispanic", "African-American"]) & (df["Gender"] == "female")
    )
    print(int(softSkillsPenalized.sum()))  # diagnostic: how many rows get the soft-skills penalty
    df["Soft Skills"] = np.where(softSkillsPenalized, df["Soft Skills"] - 2.4, df["Soft Skills"])
    return df

# printStats - prints the mean and the median of every requested feature, broken down by label value.
# The distinct label values are visited in sorted order. For example, if the label column holds a discretized
# job tenure from 0 to 5, the function prints the mean and median of each feature for the employees labelled 0,
# then for those labelled 1, and so on. All label values of one feature are printed before the next feature.
# @params: employees is the df of individuals, label is the name of the column holding each employee's discrete
# label, features is an array of column names whose mean and median should be reported per label value.
# @returns: Nothing, the values are only printed (each mean/median pair is followed by a blank line).
def printStats(employees, label, features):
    frame = employees.copy()

    # Distinct label values, sorted in place so the report is ordered.
    ordered_labels = frame[label].unique()
    ordered_labels.sort()

    for feature in features:
        for value in ordered_labels:
            subset = frame.loc[frame[label] == value, feature]
            # Both report lines share everything after the statistic's name.
            tail = " of " + feature + " for employees with " + str(value) + " is"
            print("The mean" + tail, subset.mean())
            print("The median" + tail, subset.median())
            print("")
            
# insertMetric2Skews - is a function which purposefully manipulates dataset features that correlate with the
# 2nd metric which is Job Tenure. In this iteration of the skews, .5 is subtracted directly from the
# "Cultural Fit" of African-American and Hispanic individuals.
# @params - dataset is the df of individuals to manipulate; it needs the columns "Race" and "Cultural Fit"
# and is not modified.
# @returns - a manipulated copy of the df, with the same rows, order and index as the input.
def insertMetric2Skews(dataset):
    df = dataset.copy()

    # Select the affected rows with a boolean mask rather than by index label. Index labels are
    # only valid list positions when the index is exactly 0..n-1; a mask is aligned row by row
    # and therefore works for any index (including duplicated labels left over from a concat).
    penalized = df["Race"].isin(["African-American", "Hispanic"])
    df["Cultural Fit"] = np.where(penalized, df["Cultural Fit"] - .5, df["Cultural Fit"])

    return df

# insertMetric3Skews is a function which purposefully manipulates dataset features that correlate with the 3rd metric
# which is the Average Number of Deals closed. In this iteration of the skews, there are no biases placed on the
# average number of deals closed, so the data is returned unchanged.
# NOTE: an age bias on "Technical Aptitude" (insertAgeBias with a threshold of 35) was previously sketched here
# and is currently disabled.
# @param: the dataset of individuals to skew (not modified)
# @return: an unmodified copy of the dataset
def insertMetric3Skews(dataset):
    return dataset.copy()

# extractMinorities - is a function which takes in a dataset of individuals, an array of selected races,
# and an optional gender flag.
#   - If the raceArray elem at indx 0 is "all", every individual whose race is not "Caucasian" is selected.
#   - Otherwise the individuals of each race in raceArray are selected, grouped in the order the races are
#     listed (a race listed twice contributes its rows twice).
#   - An empty raceArray selects nobody and yields an empty df with the dataset's columns.
# If the gender flag is given, the selection is further restricted to that gender.
# @param: dataset is the df of individuals (needs the columns "Race" and "Gender"; it is not modified),
# raceArray is an array containing races the caller would like to find people of,
# gender is an optional flag that can be specified to return either just males or females.
# @return: df of the selected individuals, keeping their original index labels. If raceArray contains an
# unknown race, a message is printed and None is returned.
def extractMinorities(dataset, raceArray, gender=None):
    races = [
    "Caucasian",
    "African-American",
    "Asian",
    "Hispanic",
    "Other"
    ]

    df = dataset.copy()

    # Nothing requested: return an empty selection instead of failing on raceArray[0].
    if len(raceArray) == 0:
        return df.iloc[0:0]

    if raceArray[0] == "all":
        selected = df.loc[df["Race"] != "Caucasian"]
    else:
        # Validate every requested race before doing any filtering.
        for race in raceArray:
            if race not in races:
                print("Please select one of these races for ", races)
                return None
        selected = pd.concat([df.loc[df["Race"] == race] for race in raceArray])

    # The gender restriction is the same for both branches, so it is applied once at the end.
    if gender is not None:
        selected = selected.loc[selected["Gender"] == gender]
    return selected
    
# roundToHalves - takes an array of floats and rounds each one to the nearest half (e.g. .0, .5).
# e.g. [1.2344, 2.45, 3.9] --> [1.0, 2.5, 4.0]
# Values exactly halfway between two halves (x.25, x.75) are rounded up. Negative values are handled too.
# The input array is left untouched; a new list is returned.
# @params - array of floats to round.
# @return - a new list of floats, one rounded value per input value, in the same order.
def roundToHalves(old_array):
    # Doubling maps the half grid onto the integers; floor(x + 0.5) is round-half-up, which
    # (unlike the builtin round) does not use banker's rounding. Halving maps back.
    return [math.floor(num * 2 + 0.5) / 2 for num in old_array]



# insertAgeBias takes in a dataset and an age threshold, and for each data sample within the df whose age
# exceeds the age threshold, replaces that data sample's featureToManipulate with a value drawn at random from
# the values at or below the 10th percentile of featureToManipulate (computed before any replacement is made).
# @params: dataset is the df of individuals (needs an "Age" column; it is not modified), ageThreshold is the age
# above which (strictly) a sample is penalized, featureToManipulate is the name of the column to overwrite.
# @return: a copy of the df with the featureToManipulate column adjusted. An empty df is returned as is.
def insertAgeBias(dataset, ageThreshold, featureToManipulate):
    df = dataset.copy()
    if len(df) == 0:
        return df  # a percentile of no values is undefined, and there is nobody to penalize

    # Work on an explicit copy: the array backing a column may be shared with df or read-only.
    featureArray = df[featureToManipulate].to_numpy(copy=True)
    ageArray = df["Age"].to_numpy()

    # "<=" keeps the pool non-empty: the minimum never exceeds the 10th percentile, whereas a
    # strict "<" leaves nothing to draw from when the lowest values are tied with the percentile.
    bottom10percentile = np.percentile(featureArray, 10)
    bottom10thvalues = featureArray[featureArray <= bottom10percentile].tolist()

    for x in np.flatnonzero(ageArray > ageThreshold):
        featureArray[x] = random.choice(bottom10thvalues)
    df[featureToManipulate] = featureArray
    return df

In [5]:
# loadDataset - Builds the full dataset from the per-group .csvs. One file per gender x race combination
# ("employeeData_<gender>_<race>.csv", first column used as index) is read and all of them are concatenated into
# one big df. The last 7 columns are then moved to the front, the year-valued columns are rounded with
# roundToHalves, and the skews related to the 3 metrics are applied. A summary of the final df is printed.
# @param: None, the .csv names and key consts are already inside our functions (something to prob change)
# @return: the full dataset which is the df of individuals to use in Milestone1
def loadDataset():
    genders = ("male", "female")
    races = (
        "Caucasian",
        "African-American",
        "Asian",
        "Hispanic",
        "Other",
    )

    # Genders form the outer loop, so all male files are read before the female ones.
    frames = [
        pd.read_csv(f"employeeData_{gender}_{race}.csv", index_col=0)
        for gender in genders
        for race in races
    ]
    dataset = pd.concat(frames)

    # Rotate the column order: the trailing 7 columns become the leading ones.
    columnOrder = dataset.columns.tolist()
    dataset = dataset[columnOrder[-7:] + columnOrder[:-7]]

    # Adjust Metric 1 vars
    dataset["Years of Experience"] = roundToHalves(list(dataset["Years of Experience"]))
    dataset = insertMetric1Skews(dataset)

    # Adjust Metric 2 vars (both columns are rounded before either is written back)
    roundedJobTenure = roundToHalves(list(dataset["Job Tenure"]))
    roundedMilitaryTenure = roundToHalves(list(dataset["Military Tenure"]))
    dataset["Job Tenure"] = roundedJobTenure
    dataset["Military Tenure"] = roundedMilitaryTenure
    dataset = insertMetric2Skews(dataset)

    # Adjust Metric 3 vars
    dataset = insertMetric3Skews(dataset)

    dataset.info()
    return dataset

# Build the full dataset (reads the per-group .csvs and prints a summary of the resulting df).
dataset = loadDataset()
# Persist the dataset for Milestone 1; index=False keeps the row index out of the file.
dataset.to_csv("candidates_milestone1.csv", index=False)

78
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Names                         500 non-null    object 
 1   Race                          500 non-null    object 
 2   Gender                        500 non-null    object 
 3   Sports                        500 non-null    float64
 4   Birth Origin                  500 non-null    object 
 5   Age                           500 non-null    float64
 6   Zip                           500 non-null    int64  
 7   Education                     500 non-null    int64  
 8   GPA                           500 non-null    float64
 9   Educational Prestige          500 non-null    float64
 10  Years of Experience           500 non-null    float64
 11  Soft Skills                   500 non-null    float64
 12  Manager's Assessment Score    500 non-null    float64
 13  Mi