In [37]:
import pandas as pd
import numpy as np
from sklearn import datasets
from collections import defaultdict
from scipy.stats import truncnorm


In [38]:
# Helper Methods

def get_truncated_normal(mean=0, sd=1, low=0, upp=10):
    return truncnorm(
        (low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

def binaryArrayGenerator(n, p): #where n is num values and p is the probability of success (1)
    x = np.zeros(n)
    num = int(p*100)
    x[:num] = 1
    np.random.shuffle(x)
    return x

def genSubGaussianDist(dist1, dist2, n, dist1_percent, shuffle):
    x = np.random.normal(dist1["mean"], dist1["var"], int(dist1_percent * n))
    y = np.random.normal(dist2["mean"], dist2["var"], n - int(dist1_percent * n))
    z = np.append(x, y)
    if shuffle:  return np.random.shuffle(z)
    return z
    
    
    

In [54]:
# Build the list of features we desire to include in our rough hiring platform data
# Put yourself in pymetrics' shoes where a Fortune 500 comes to them and asks them to evaluate the following candidates for 
# a entry-level product management / business analyst / sales associate position.
# given their employee selection tests.

# What data parameters would enable you to make the best decision?
# Level of education, GPA, technical aptitude, soft skills, cultural fit, average commute time, address, languagues, years of work experience, number of 
# companies worked for, average stay at each company, past recommendations, Employee Referral, awards from previous work, race, gender, leadership experience, criminal history, 
# LinkedIn profile score, Instagram/Facebook/Twitter content is reasonable. 
# Communication, Problem Solving, Willing To Learn, Emotional Skills, Cognitive Ability.

traditionalFeatures = ["GPA", "Avg Commute Time", "Cultural Fit", "Technical Aptitude", "Soft Skills", "Employee Referral", "Years of Experience", "Leadership Capability", "Race", "Sex", "Age"]
dataset = pd.DataFrame(columns = traditionalFeatures)



In [55]:
ppl = 100
GPA = np.random.normal(3.2, 0.5, 100) #mean, stdev, numSamples out of 4.0
commuteTimes = get_truncated_normal(mean=25, sd=40, low=0, upp=120) #mean, stdev, numSamples in minutes
techAptitude = np.random.choice(5, 100) # represent num of technical platforms proficient in. 
culturalFit = np.random.choice(3, 100) # Where higher means their beliefs align with org more. 
softSkills = binaryArrayGenerator(100, .55)
employeeReferral = binaryArrayGenerator(100, .33)
yearsExperience = get_truncated_normal(mean=3, sd = 2, low = 1, upp=15)
leadershipCapability = np.random.normal(.5, .25, 100)

# Sensitive Attributes
raceValues = {"Caucasian": 0, "African-America": 1, "Asian": 2, "Hispanic": 3, "Other": 4}
race = np.zeros(100)
race[70:75] = 1
race[75: 85] = 2
race[85: 92] = 3
race[92: 100] = 4

sexValues = {"Male": 0, "Female": 1}
sex = np.zeros(100)
sex[60:] = 1


ageDist1 = defaultdict(float)
ageDist1["mean"] = 28 #years old
ageDist1["var"] = 6
ageDist2 = defaultdict(float)
ageDist2["mean"] = 50 #years old
ageDist2["var"] = 3
age = np.rint(genSubGaussianDist(ageDist1, ageDist2, 100, .75, False))

In [62]:
dataset["GPA"] = GPA
dataset["Avg Commute Time"] = commuteTimes.rvs(100)
dataset["Technical Aptitude"] = techAptitude
dataset["Cultural Fit"] = culturalFit
dataset["Soft Skills"] = softSkills
dataset["Employee Referral"] = employeeReferral
dataset["Years of Experience"] = np.rint(yearsExperience.rvs(100))
dataset["Leadership Capability"] = leadershipCapability
dataset["Race"] = race
dataset["Sex"] = sex
dataset["Age"] = age

dataset


Unnamed: 0,GPA,Avg Commute Time,Cultural Fit,Technical Aptitude,Soft Skills,Employee Referral,Years of Experience,Leadership Capability,Race,Sex,Age
0,3.221776,16.122743,2,0,0.0,0.0,4.0,0.448048,0.0,0.0,18.0
1,2.927046,33.175609,1,1,1.0,1.0,3.0,0.868596,0.0,0.0,27.0
2,3.385636,23.431797,0,3,1.0,0.0,4.0,0.506704,0.0,0.0,20.0
3,3.701652,15.750707,1,0,1.0,0.0,6.0,0.456204,0.0,0.0,36.0
4,2.808924,5.390227,0,3,0.0,0.0,6.0,0.636884,0.0,0.0,36.0
...,...,...,...,...,...,...,...,...,...,...,...
95,3.349097,20.870220,1,1,0.0,1.0,2.0,0.321000,4.0,1.0,48.0
96,3.283027,49.132373,2,4,1.0,0.0,2.0,0.335702,4.0,1.0,48.0
97,3.012722,7.994247,2,4,0.0,0.0,3.0,0.463625,4.0,1.0,52.0
98,4.022982,62.070981,0,4,0.0,0.0,2.0,0.309380,4.0,1.0,53.0


In [82]:
###---- Now attempt to violate disparate impact clauses ---###
### --- Devise Tests to select Candidates ---- ###

def selectTopQuarterAvgNaively(metrics, numFeatures):
    if numFeatures == 3:
        dataset["Avg Score"] = (dataset[metrics[0]] + dataset[metrics[1]] + dataset[metrics[2]]) / numFeatures
    values = dataset["Avg Score"]
    idx = (-values).argsort()[:25]
    return idx
        
        

In [84]:
metrics = ["GPA", "Leadership Capability", "Technical Aptitude"]
topPerformers = selectTopQuarterAvgNaively(metrics, 3)
print(topPerformers)

0     75
1     56
2     61
3     98
4     92
5     79
6     44
7     27
8     23
9     82
10    77
11    20
12    38
13    32
14    96
15    47
16    81
17    97
18    71
19    87
20    63
21    48
22    19
23     8
24    57
Name: Avg Score, dtype: int64
