In [160]:
import math
import numpy as np
import pandas as pd
import random

In [161]:
def loadPpl(race, gender):
    """Return the head count of one race/gender group in a 10,000-person population.

    The count is the population size multiplied by the gender share and then
    by the race share, rounded down to a whole number.

    Args:
        race: One of "Caucasian", "African-American", "Asian", "Hispanic",
            "Other".
        gender: Either "male" or "female".

    Returns:
        int: Number of people in the requested group.

    Raises:
        KeyError: If `gender` or `race` is not one of the keys listed above.
    """
    population = 10000
    gender_share = {"male": .60, "female": .40}
    race_share = {
        "Caucasian": .55,
        "African-American": .14,
        "Asian": .15,
        "Hispanic": .10,
        "Other": .06
    }

    # Keep the multiplication order (population * gender * race): the shares
    # are floats, so reordering the product could change the floored result.
    group_size = population * gender_share[gender] * race_share[race]
    return int(math.floor(group_size))


# Run Cell Below to Form Up Dataset

In [162]:
def roundToHalves(old_array):
    """Snap each number to a whole or half value and return the results as a new list.

    With ``whole = int(num)`` and ``frac = num - whole`` the rule is:

    * ``frac < .33``          -> ``whole``
    * ``.33 <= frac < .66``   -> ``whole + .5``
    * ``frac >= .66``         -> ``whole + 1``

    The input sequence is left untouched; a fresh list is returned.

    Note:
        ``int()`` truncates toward zero, so for negative inputs ``frac`` is
        never positive and the value is simply truncated (e.g. -1.7 -> -1).

    Args:
        old_array: Iterable of numbers (e.g. a list built from a DataFrame column).

    Returns:
        list: One rounded value per input element, in the same order.
    """
    rounded = []
    for num in old_array:
        whole = int(num)
        frac = num - whole
        if frac < .33:
            rounded.append(whole)
        elif frac < .66:
            # Lower bound is inclusive so a fraction of exactly .33 lands on
            # the half step instead of falling through to "round up".
            rounded.append(whole + .5)
        else:
            rounded.append(whole + 1)
    return rounded



def insertAgeBias(dataset, ageThreshold, featureToManipulate):
    """Penalise rows above an age threshold by giving them a low feature value.

    For every row whose "Age" is strictly greater than `ageThreshold`, the
    value in `featureToManipulate` is replaced by a value drawn at random
    (via the `random` module) from the column's original values that lie
    below its 10th percentile.

    Args:
        dataset: DataFrame containing an "Age" column and `featureToManipulate`.
        ageThreshold: Rows with Age > ageThreshold are modified.
        featureToManipulate: Name of the column to overwrite.

    Returns:
        pandas.DataFrame: A modified copy; `dataset` itself is not changed.
    """
    df = dataset.copy()
    if len(df) == 0:
        # Nothing to bias, and a percentile of an empty column is undefined.
        return df

    # Take an explicit writable copy: an array view of a DataFrame column can
    # be read-only (e.g. with pandas copy-on-write enabled).
    featureArray = df[featureToManipulate].to_numpy(copy=True)
    ageArray = df["Age"].to_numpy()

    # The replacement pool is computed once, from the unmodified values.
    bottom10percentile = np.percentile(featureArray, 10)
    bottom10thvalues = featureArray[featureArray < bottom10percentile].tolist()
    if not bottom10thvalues:
        # No value is strictly below the percentile (constant column or many
        # ties at the minimum); fall back to values equal to it so that
        # random.choice() is never handed an empty pool.
        bottom10thvalues = featureArray[featureArray <= bottom10percentile].tolist()

    # Visit rows in ascending position so the sequence of random draws for a
    # given `random.seed` stays the same as a plain row-by-row loop.
    for pos in np.flatnonzero(ageArray > ageThreshold):
        featureArray[pos] = random.choice(bottom10thvalues)

    df[featureToManipulate] = featureArray
    return df

In [163]:
def insertMetric2Skews(dataset):
    """Apply the metric-2 skew: lower "Cultural Fit" by 0.5 for selected races.

    Rows whose "Race" is "African-American" or "Hispanic" have 0.5 subtracted
    from "Cultural Fit". Rows are selected with a boolean mask, so the result
    does not depend on the DataFrame's index being 0..n-1.

    Args:
        dataset: DataFrame with "Race" and "Cultural Fit" columns.

    Returns:
        pandas.DataFrame: A modified copy; `dataset` itself is not changed.
    """
    df = dataset.copy()

    is_minority = df["Race"].isin(["African-American", "Hispanic"])
    # Series.mask replaces values where the condition is True.
    df["Cultural Fit"] = df["Cultural Fit"].mask(is_minority, df["Cultural Fit"] - .5)

    # NOTE: a second skew was prototyped here and is currently disabled: for
    # African-American/Hispanic males whose "Avg Commute Time" was below that
    # group's mean, the group's mean commute time was added to their value.

    return df

def insertMetric3Skews(dataset):
    """Return an unmodified copy of `dataset` (no metric-3 skew is applied).

    The age-bias step, insertAgeBias(df, 35, "Technical Aptitude"), is
    currently disabled, so this function only copies its input.

    Args:
        dataset: DataFrame to copy.

    Returns:
        pandas.DataFrame: A copy of `dataset`.
    """
    return dataset.copy()

def extractMinorities(dataset, raceArray, gender=None):
    """Select the rows of `dataset` that belong to the given races (and gender).

    Args:
        dataset: DataFrame with a "Race" column (and a "Gender" column when
            `gender` is given).
        raceArray: Sequence of race names. If its first element is "all",
            every row whose race is not "Caucasian" is selected. Otherwise
            each entry must be one of "Caucasian", "African-American",
            "Asian", "Hispanic", "Other".
        gender: Optional value to match against the "Gender" column.

    Returns:
        pandas.DataFrame or None: The matching rows with their original index
        labels. For an explicit race list the rows are grouped per race, in
        the order the races appear in `raceArray`. An empty `raceArray`
        yields an empty frame. If an unknown race is requested, a message is
        printed and None is returned.
    """
    races = [
        "Caucasian",
        "African-American",
        "Asian",
        "Hispanic",
        "Other"
    ]

    df = dataset.copy()

    # Guard: reading raceArray[0] below would fail on an empty selection.
    if len(raceArray) == 0:
        return df.iloc[0:0]

    if raceArray[0] == "all":
        df = df.loc[df["Race"] != "Caucasian"]
        if gender is not None:
            df = df.loc[df["Gender"] == gender]
        return df

    dfs = []
    for race in raceArray:
        if race not in races:
            print("Please select one of these races for ", races)
            return None
        mini_df = df.loc[df["Race"] == race]
        if gender is not None:
            mini_df = mini_df.loc[mini_df["Gender"] == gender]
        dfs.append(mini_df)
    return pd.concat(dfs)
  

In [164]:
# NOTE: `dataset` is only assigned by the `dataset = loadDataset()` cell further
# down, so on a fresh kernel that cell has to be run before this one.
extractMinorities(dataset, ["Hispanic", "African-American", "Other"])

Unnamed: 0,Names,Race,Gender,Sports,Birth Origin,Age,Zip,Education,GPA,Educational Prestige,...,Leadership Capability,HireVue Score,Technical Aptitude,Avg Deals Closed,Undergraduate Degree,LinkedIn Score,Responsible Social Media Use,Employee Referral,Arrest Record,Criminal Record
31,Josefina Williams,Hispanic,female,0.0,USA,29.0,94556,2,2.62,-0.12,...,2.91,2.81,2.52,9.89,Engineering,,Bad,0.0,1.0,0.0
34,Alice Ballintyn,Hispanic,female,0.0,USA,30.0,94105,2,2.35,-0.09,...,3.64,3.55,2.57,9.80,Sciences,,Bad,0.0,0.0,0.0
46,William Wheelis,Hispanic,male,0.0,USA,28.0,94502,2,3.06,0.12,...,5.06,5.31,4.65,12.77,Sciences,,Bad,0.0,0.0,0.0
55,Melissa Thompson,Hispanic,female,0.0,USA,33.0,94603,2,2.73,-0.02,...,1.89,-0.68,-0.50,6.52,Quantitative,,Bad,0.0,0.0,0.0
66,Mary Delarosa,Hispanic,female,0.0,USA,22.0,94403,1,2.53,0.02,...,2.45,-0.08,0.25,7.60,Engineering,Ok,Bad,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9941,Merle Dileo,Other,male,1.0,USA,32.0,95134,2,2.76,0.16,...,3.23,1.77,1.16,8.38,Quantitative,Ok,Good,0.0,1.0,0.0
9953,Charles Abrams,Other,male,0.0,USA,28.0,94040,1,2.73,0.42,...,3.93,2.81,2.84,10.72,Quantitative,,Good,0.0,1.0,1.0
9957,Theresa Reese,Other,female,0.0,USA,39.0,94941,4,3.50,0.00,...,0.07,-1.91,-1.32,5.15,Sciences,,Bad,0.0,0.0,0.0
9988,Jennifer Honig,Other,female,1.0,China,30.0,94010,5,1.70,0.42,...,3.51,2.95,2.82,10.45,Engineering,Very Good,Good,0.0,0.0,0.0


In [165]:
def insertMetric1Skews(dataset, random_state=None):
    """Normalise/discretise the metric-1 features and apply the metric-1 skews.

    Steps, in order:
      1. "Educational Prestige" is divided by its maximum and rounded to 2 dp.
      2. "Education" is replaced by quintile labels 5 (highest) .. 1 (lowest)
         based on a descending sort of the original values.
      3. Rows are shuffled and the index is reset to 0..n-1.
      4. "Manager's Assessment Score" is increased by 10.
      5. Skews: "Educational Prestige" is lowered by .3 for African-American
         and Hispanic rows (intended to raise the false negative rate among
         minorities); "Soft Skills" is lowered by 2.40 for female rows that
         are Asian, Hispanic or African-American.

    Args:
        dataset: DataFrame with "Race", "Gender", "Education",
            "Educational Prestige", "Soft Skills" and
            "Manager's Assessment Score" columns.
        random_state: Optional seed forwarded to DataFrame.sample for a
            reproducible shuffle. The default (None) shuffles unseeded.

    Returns:
        pandas.DataFrame: A modified, shuffled copy; `dataset` is not changed.
    """
    df = dataset.copy()

    # Prestige normalization
    prestigeMax = df["Educational Prestige"].max()
    df["Educational Prestige"] = (df["Educational Prestige"] / prestigeMax).round(2)

    # Education discretization into quintiles. Position i of the descending
    # sort gets label 5 - floor(5 * i / n); when n is a multiple of 5 this is
    # five equal runs (5,5,..,4,4,..,1), and otherwise the remainder is spread
    # across the groups instead of producing a label list of the wrong length.
    df = df.sort_values(["Education"], ascending=False)
    n_rows = len(df)
    df["Education"] = 5 - (np.arange(n_rows) * 5 // max(n_rows, 1))

    # Need to shuffle after ordering.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Increase in the manager's assessment score to try and resolve the weights.
    df["Manager's Assessment Score"] = df["Manager's Assessment Score"] + 10

    # Actual skews, selected with boolean masks so no index/position
    # bookkeeping is needed.
    is_minority = df["Race"].isin(["African-American", "Hispanic"])
    df["Educational Prestige"] = df["Educational Prestige"].mask(
        is_minority, df["Educational Prestige"] - .3
    )

    is_skewed_female = (
        df["Race"].isin(["Asian", "Hispanic", "African-American"])
        & (df["Gender"] == "female")
    )
    df["Soft Skills"] = df["Soft Skills"].mask(
        is_skewed_female, df["Soft Skills"] - 2.40
    )
    return df


def printStats(employees, label, features):
    """Print the mean and median of each feature for every value of `label`.

    For each feature (in the order given) and each distinct value of the
    `label` column (in sorted order), two lines are printed - mean, then
    median - followed by a blank line.

    Args:
        employees: DataFrame holding the `label` column and all `features`.
        label: Name of the column to group by (e.g. "Race").
        features: Iterable of numeric column names to summarise.

    Returns:
        None. Output goes to stdout only.
    """
    groups = np.sort(employees[label].unique())

    for feature in features:
        for group in groups:
            values = employees.loc[employees[label] == group, feature]
            print("The mean of " + feature + " for employees with " + str(group) + " is", values.mean())
            print("The median of " + feature + " for employees with " + str(group) + " is", values.median())
            print("")

In [166]:
def loadDataset():
    """Read the per-group CSV files, combine them and apply all metric skews.

    One file named "employeeData_<gender>_<race>.csv" is read from the current
    working directory for every gender/race combination (first column used as
    the index) and the frames are concatenated. The last seven columns are
    then moved to the front, the experience/tenure columns are rounded with
    roundToHalves, and the metric 1, 2 and 3 skew functions are applied in
    that order. `DataFrame.info()` is printed for the final frame.

    Returns:
        pandas.DataFrame: The combined, skewed dataset.
    """
    genders = ["male", "female"]
    races = [
        "Caucasian",
        "African-American",
        "Asian",
        "Hispanic",
        "Other"
        ]

    # Genders form the outer loop, races the inner one.
    frames = [
        pd.read_csv("employeeData_" + gender + "_" + race + ".csv", index_col=0)
        for gender in genders
        for race in races
    ]
    dataset = pd.concat(frames)

    # Rotate the column order: the last seven columns move to the front.
    columns = dataset.columns.tolist()
    dataset = dataset[columns[-7:] + columns[:-7]]

    # Adjust Metric 1 vars
    dataset["Years of Experience"] = roundToHalves(list(dataset["Years of Experience"]))
    dataset = insertMetric1Skews(dataset)

    # Adjust Metric 2 vars
    for tenure_col in ("Job Tenure", "Military Tenure"):
        dataset[tenure_col] = roundToHalves(list(dataset[tenure_col]))
    dataset = insertMetric2Skews(dataset)

    # Adjust Metric 3 vars
    dataset = insertMetric3Skews(dataset)

    dataset.info()
    return dataset

# Build the skewed employee dataset and write it to CSV without the index column.
dataset = loadDataset()
dataset.to_csv("employees_milestone2.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Names                         10000 non-null  object 
 1   Race                          10000 non-null  object 
 2   Gender                        10000 non-null  object 
 3   Sports                        10000 non-null  float64
 4   Birth Origin                  10000 non-null  object 
 5   Age                           10000 non-null  float64
 6   Zip                           10000 non-null  int64  
 7   Education                     10000 non-null  int64  
 8   GPA                           10000 non-null  float64
 9   Educational Prestige          10000 non-null  float64
 10  Years of Experience           10000 non-null  float64
 11  Soft Skills                   10000 non-null  float64
 12  Manager's Assessment Score    10000 non-null  float64
 13  Mi

In [167]:
# Per-race mean and median of the two features that insertMetric1Skews adjusts.
printStats(dataset, "Race", ["Soft Skills", "Educational Prestige"])

The mean of Soft Skills for employees with African-American is 2.0534000000000012
The median of Soft Skills for employees with African-American is 2.2500000000000004

The mean of Soft Skills for employees with Asian is 2.063459999999999
The median of Soft Skills for employees with Asian is 2.34

The mean of Soft Skills for employees with Caucasian is 2.996414545454544
The median of Soft Skills for employees with Caucasian is 3.0

The mean of Soft Skills for employees with Hispanic is 2.0191199999999974
The median of Soft Skills for employees with Hispanic is 2.275

The mean of Soft Skills for employees with Other is 3.036816666666669
The median of Soft Skills for employees with Other is 3.03

The mean of Educational Prestige for employees with African-American is 0.04902142857142867
The median of Educational Prestige for employees with African-American is 0.04999999999999999

The mean of Educational Prestige for employees with Asian is 0.35193999999999875
The median of Educational Pres