In [70]:
import math
import numpy as np
import pandas as pd
import random

In [71]:
def loadPpl(race, gender):
    """Return how many people fall into one race/gender segment.

    The population is fixed at 2500 people, split 60/40 between "male" and
    "female" and across five race categories. The segment size is the
    product of the population and both shares, rounded down.

    Args:
        race: One of "Caucasian", "African-American", "Asian", "Hispanic",
            "Other".
        gender: Either "male" or "female".

    Returns:
        The segment head count as an int.

    Raises:
        KeyError: If ``gender`` or ``race`` is not one of the known keys.
    """
    populationSize = 2500
    genderShare = {"male": .60, "female": .40}
    raceShare = {
        "Caucasian": .55,
        "African-American": .14,
        "Asian": .15,
        "Hispanic": .10,
        "Other": .06
    }

    # Gender is looked up (and applied) first, then race, so both the
    # KeyError precedence and the floating-point result stay the same.
    genderFraction = genderShare[gender]
    raceFraction = raceShare[race]
    segmentSize = populationSize * genderFraction * raceFraction
    return int(math.floor(segmentSize))


In [72]:
# Sanity check: 2500 people * 60% male * 55% Caucasian, rounded down.
loadPpl("Caucasian", "male")

825

# Run the Cells Below to Build the Dataset

In [73]:
def roundToHalves(old_array):
    """Round every number to a whole or half step using one-third cut-offs.

    Each value is split into its floor and a fractional part in [0, 1):

    * fraction < .33            -> the floor
    * .33 <= fraction < .66     -> the floor + .5
    * fraction >= .66           -> the floor + 1

    The input sequence is left untouched; a new list is returned. Splitting
    with ``math.floor`` (rather than truncation) keeps the fractional part
    non-negative, so negative values follow the same cut-offs.

    Args:
        old_array: Iterable of real numbers.

    Returns:
        A new list with one rounded value per input element. Whole results
        are ints; half steps are floats.
    """
    rounded = []
    for num in old_array:
        whole = math.floor(num)
        fraction = num - whole
        if fraction < .33:
            rounded.append(whole)
        elif fraction < .66:
            # Includes the exact .33 boundary, which must not round up a
            # whole unit.
            rounded.append(whole + .5)
        else:
            rounded.append(whole + 1)
    return rounded



def insertAgeBias(dataset, ageThreshold, featureToManipulate):
    """Return a copy of ``dataset`` with an age bias injected into one feature.

    For every row whose "Age" exceeds ``ageThreshold``, the value of
    ``featureToManipulate`` is replaced by a value drawn at random (with
    ``random.choice``) from the feature values lying below the feature's
    10th percentile.

    Args:
        dataset: DataFrame with an "Age" column and the feature column.
        ageThreshold: Rows with Age strictly greater than this are altered.
        featureToManipulate: Name of the numeric column to overwrite.

    Returns:
        A new DataFrame; ``dataset`` itself is not modified.
    """
    df = dataset.copy()
    # Take an explicit copy: the array backing a column can be a read-only
    # view, and it is written to below.
    featureArray = df[featureToManipulate].to_numpy(copy=True)
    overThreshold = np.flatnonzero(df["Age"].to_numpy() > ageThreshold)

    # Nothing to change (this also covers an empty frame, for which the
    # percentile is undefined).
    if overThreshold.size == 0:
        return df

    bottom10percentile = np.percentile(featureArray, 10)
    bottom10thvalues = list(featureArray[featureArray < bottom10percentile])
    if not bottom10thvalues:
        # With ties at the minimum the percentile equals the smallest value
        # and nothing is strictly below it; fall back to the tied values so
        # there is something to sample from.
        bottom10thvalues = list(featureArray[featureArray <= bottom10percentile])

    # One random.choice call per affected row, in row order, so a seeded
    # `random` module yields a reproducible result.
    for pos in overThreshold:
        featureArray[pos] = random.choice(bottom10thvalues)
    df[featureToManipulate] = featureArray
    return df

In [74]:
def insertMetric2Skews(dataset):
    """Return a copy of ``dataset`` with the metric-2 skews applied.

    Two adjustments are made:

    * "Cultural Fit" is lowered by .5 for every African-American or
      Hispanic row.
    * For male African-American or Hispanic rows, any "Avg Commute Time"
      below that group's mean has the group mean added to it.

    Rows are selected with boolean masks, so the result does not depend on
    the frame having a 0..n-1 index.

    Args:
        dataset: DataFrame with "Race", "Gender", "Cultural Fit" and
            "Avg Commute Time" columns.

    Returns:
        A new DataFrame; ``dataset`` itself is not modified.
    """
    df = dataset.copy()
    isTargetRace = df["Race"].isin(["African-American", "Hispanic"])

    culturalFit = df["Cultural Fit"]
    df["Cultural Fit"] = np.where(isTargetRace, culturalFit - .5, culturalFit)

    isTargetMale = isTargetRace & (df["Gender"] == "male")
    commuteTimes = df["Avg Commute Time"]
    # The mean is taken over the group before any commute time is changed.
    # With an empty group it is NaN, every comparison below is False, and
    # the column is left as it was.
    meanCommuteTime = commuteTimes[isTargetMale].mean()
    needsShift = isTargetMale & (commuteTimes < meanCommuteTime)
    df["Avg Commute Time"] = np.where(
        needsShift, commuteTimes + meanCommuteTime, commuteTimes
    )

    return df

def insertMetric3Skews(dataset):
    """Return an unmodified copy of ``dataset``.

    No metric-3 skew is applied at present. An age-bias step,
    ``insertAgeBias(df, 35, "Technical Aptitude")``, used to sit here but is
    switched off.

    Args:
        dataset: DataFrame to copy.

    Returns:
        A new DataFrame equal to ``dataset``.
    """
    return dataset.copy()

def extractMinorities(dataset, raceArray, gender=None):
    """Select the rows of ``dataset`` belonging to the requested races.

    Args:
        dataset: DataFrame with a "Race" column (and a "Gender" column when
            ``gender`` is given).
        raceArray: Sequence of race names to keep. If its first element is
            "all", every row whose race is not "Caucasian" is kept instead.
            An empty sequence selects nothing.
        gender: Optional gender value; when given, only rows with that
            "Gender" are kept.

    Returns:
        A new DataFrame with the matching rows, keeping their original index
        labels. For an explicit race list the rows are grouped in the order
        the races appear in ``raceArray``. If ``raceArray`` names an unknown
        race, a message is printed and None is returned.
    """
    races = [
        "Caucasian",
        "African-American",
        "Asian",
        "Hispanic",
        "Other"
    ]

    df = dataset.copy()

    # Guard the raceArray[0] lookup below: no races requested, no rows.
    if len(raceArray) == 0:
        return df.iloc[0:0]

    if raceArray[0] == "all":
        df = df.loc[df["Race"] != "Caucasian"]
        if gender is not None:
            df = df.loc[df["Gender"] == gender]
        return df

    if any(race not in races for race in raceArray):
        print("Please select one of these races for ", races)
        return None

    dfs = []
    for race in raceArray:
        mini_df = df.loc[df["Race"] == race]
        if gender is not None:
            mini_df = mini_df.loc[mini_df["Gender"] == gender]
        dfs.append(mini_df)
    return pd.concat(dfs)
  

In [75]:
def insertMetric1Skews(dataset, random_state=None):
    """Return a shuffled copy of ``dataset`` with the metric-1 skews applied.

    Steps, in order:

    1. "Educational Prestige" is divided by its maximum and rounded to two
       decimals.
    2. "Education" is replaced by a 5..1 rank: rows are sorted by Education
       in descending order and cut into five groups of (near-)equal size,
       the top group getting 5. Any row count works; it does not have to be
       a multiple of five.
    3. Rows are shuffled and the index is reset to 0..n-1.
    4. "Manager's Assessment Score" is raised by 10.
    5. "Educational Prestige" is lowered by .3 for African-American and
       Hispanic rows.
    6. "Soft Skills" is lowered by 2.4 for female Asian, Hispanic and
       African-American rows; the number of such rows is printed.

    Args:
        dataset: DataFrame with "Race", "Gender", "Education",
            "Educational Prestige", "Soft Skills" and
            "Manager's Assessment Score" columns.
        random_state: Optional seed passed to ``DataFrame.sample`` so the
            shuffle is reproducible. The default (None) shuffles differently
            on every call.

    Returns:
        A new DataFrame; ``dataset`` itself is not modified.
    """
    df = dataset.copy()

    # Prestige normalization
    prestigeMax = df["Educational Prestige"].max()
    df["Educational Prestige"] = (df["Educational Prestige"] / prestigeMax).round(2)

    # Education discretization into quintiles. Position i of n sorted rows
    # lands in group (i * 5) // n, i.e. 0..4, which maps to labels 5..1.
    df = df.sort_values(["Education"], ascending=False)
    rowCount = len(df)
    df["Education"] = 5 - (np.arange(rowCount) * 5) // rowCount
    # Need to shuffle after ordering.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Increase in the manager's assessment score to try and resolve the
    # weights.
    df["Manager's Assessment Score"] = df["Manager's Assessment Score"] + 10

    # Skew Educational Prestige.
    isTargetRace = df["Race"].isin(["African-American", "Hispanic"])
    prestige = df["Educational Prestige"]
    df["Educational Prestige"] = np.where(isTargetRace, prestige - .3, prestige)

    # Skew Soft Skills.
    isTargetFemale = (
        df["Race"].isin(["Asian", "Hispanic", "African-American"])
        & (df["Gender"] == "female")
    )
    print(int(isTargetFemale.sum()))
    softSkills = df["Soft Skills"]
    df["Soft Skills"] = np.where(isTargetFemale, softSkills - 2.4, softSkills)

    return df


def printStats(employees, label, features):
    """Print the mean and median of each feature for every value of ``label``.

    For each feature (in the order given) and each distinct value of the
    ``label`` column (in ascending order), two lines with the mean and the
    median of that feature within the group are printed, followed by a blank
    line.

    Args:
        employees: DataFrame holding the ``label`` and feature columns.
        label: Name of the column whose values define the groups.
        features: Iterable of column names to summarise.
    """
    frame = employees.copy()

    # Distinct group values, sorted in place.
    groupValues = frame[label].unique()
    groupValues.sort()

    for feature in features:
        for groupValue in groupValues:
            inGroup = frame[label] == groupValue
            groupFeature = frame.loc[inGroup, feature]
            groupName = str(groupValue)
            print("The mean of " + feature + " for employees with " + groupName + " is", groupFeature.mean())
            print("The median of " + feature + " for employees with " + groupName + " is", groupFeature.median())
            print("")

In [76]:
def loadDataset():
    """Assemble the candidate dataset from the per-segment CSV files.

    One file named ``employeeData_<gender>_<race>.csv`` is read for every
    gender/race combination (first column used as the index) and the frames
    are concatenated. The last seven columns are then moved to the front,
    the experience and tenure columns are rounded with ``roundToHalves``,
    and the metric 1, 2 and 3 skews are applied in that order. A summary of
    the result is printed via ``DataFrame.info``.

    Returns:
        The assembled, skewed DataFrame.
    """
    raceNames = (
        "Caucasian",
        "African-American",
        "Asian",
        "Hispanic",
        "Other",
    )

    # Files are read gender-major: all "male" files first, then "female".
    frames = [
        pd.read_csv("employeeData_" + gender + "_" + race + ".csv", index_col=0)
        for gender in ("male", "female")
        for race in raceNames
    ]
    combined = pd.concat(frames)

    # Rotate the last seven columns to the front.
    columnOrder = combined.columns.tolist()
    combined = combined[columnOrder[-7:] + columnOrder[:-7]]

    # Adjust Metric 1 vars
    combined["Years of Experience"] = roundToHalves(list(combined["Years of Experience"]))
    combined = insertMetric1Skews(combined)

    # Adjust Metric 2 vars
    for tenureColumn in ("Job Tenure", "Military Tenure"):
        combined[tenureColumn] = roundToHalves(list(combined[tenureColumn]))
    combined = insertMetric2Skews(combined)

    # Adjust Metric 3 vars
    combined = insertMetric3Skews(combined)

    combined.info()
    return combined

# Build the skewed dataset and write it to candidates_milestone2.csv
# (index=False leaves the row index out of the file).
dataset = loadDataset()
dataset.to_csv("candidates_milestone2.csv", index=False)

390
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Names                         2500 non-null   object 
 1   Race                          2500 non-null   object 
 2   Gender                        2500 non-null   object 
 3   Sports                        2500 non-null   float64
 4   Birth Origin                  2500 non-null   object 
 5   Age                           2500 non-null   float64
 6   Zip                           2500 non-null   int64  
 7   Education                     2500 non-null   int64  
 8   GPA                           2500 non-null   float64
 9   Educational Prestige          2500 non-null   float64
 10  Years of Experience           2500 non-null   float64
 11  Soft Skills                   2500 non-null   float64
 12  Manager's Assessment Score    2500 non-null   float64
 13 

In [77]:
# Show the column names, non-null counts and dtypes of the generated dataset.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Names                         2500 non-null   object 
 1   Race                          2500 non-null   object 
 2   Gender                        2500 non-null   object 
 3   Sports                        2500 non-null   float64
 4   Birth Origin                  2500 non-null   object 
 5   Age                           2500 non-null   float64
 6   Zip                           2500 non-null   int64  
 7   Education                     2500 non-null   int64  
 8   GPA                           2500 non-null   float64
 9   Educational Prestige          2500 non-null   float64
 10  Years of Experience           2500 non-null   float64
 11  Soft Skills                   2500 non-null   float64
 12  Manager's Assessment Score    2500 non-null   float64
 13  Mil