In [1]:
import numpy as np
import pandas as pd

# Train data set

In [2]:
df = pd.read_csv("./data/train.csv", na_values=[" "])

## Criterion

In [3]:
criterion_df = df.loc[:, "Overall_Rating":"Retained"]

In [4]:
criterion_df.head()

Unnamed: 0,Overall_Rating,Technical_Skills,Teamwork,Customer_Service,Hire_Again,High_Performer,Protected_Group,Retained
0,3.0,3.0,4.0,4.0,4.0,0.0,0.0,1
1,5.0,5.0,5.0,5.0,5.0,1.0,1.0,0
2,3.0,3.0,3.0,3.0,4.0,0.0,1.0,0
3,4.0,4.0,4.0,4.0,5.0,1.0,0.0,1
4,2.0,2.0,3.0,2.0,3.0,0.0,0.0,1


## Predictors

In [5]:
predictor_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"]

In [6]:
predictor_df.head()

Unnamed: 0,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,SJ_Time_3,SJ_Most_4,...,PScale11_Q4,PScale12_Q1,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5
0,3.0,4.0,95.0,3.0,2.0,101.0,1.0,4.0,39.0,1.0,...,1.0,1.0,1.0,3.0,4.0,1.0,2.0,3.0,2.0,1.0
1,3.0,4.0,53.0,3.0,2.0,57.0,1.0,4.0,43.0,1.0,...,1.0,1.0,1.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0
2,2.0,4.0,89.0,1.0,4.0,73.0,1.0,3.0,90.0,1.0,...,4.0,1.0,1.0,4.0,4.0,1.0,1.0,4.0,4.0,4.0
3,2.0,1.0,110.0,1.0,2.0,86.0,1.0,4.0,52.0,1.0,...,2.0,1.0,1.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0
4,3.0,2.0,35.0,3.0,2.0,21.0,1.0,4.0,32.0,2.0,...,4.0,2.0,1.0,4.0,4.0,2.0,1.0,4.0,3.0,2.0


In [7]:
len(predictor_df.columns)

120

# TPOT

In [8]:
from tpot import TPOTClassifier

## Test Code, Set up

### High Performer

In [9]:
high_performer_df = df.loc[:, "High_Performer"].dropna().astype(float)
high_peformer_notna = df['High_Performer'].notna()

In [10]:
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][high_peformer_notna]

In [40]:
pipeline_optimizer = TPOTClassifier(generations=2, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(allpred_df, high_performer_df)
pipeline_optimizer.export('tpot_exported_pipeline_HP.py')

Imputing missing values in feature set


HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=60.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.6272496831432192

Generation 2 - Current best internal CV score: 0.6283903675538657

Best pipeline: RandomForestClassifier(MinMaxScaler(input_matrix), bootstrap=True, criterion=gini, max_features=0.2, min_samples_leaf=6, min_samples_split=4, n_estimators=100)


In [41]:
print(pipeline_optimizer.score(allpred_df, high_performer_df))

Imputing missing values in feature set
0.9533586818757921


### From TPOT Export

In [11]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from tpot.export_utils import set_param_recursive

high_performer_df = df.loc[:, "High_Performer"].dropna().astype(float)
high_peformer_notna = df['High_Performer'].notna()
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][high_peformer_notna]

# NOTE: Make sure that the outcome column is labeled 'target' in the data file


imputer = SimpleImputer(strategy="median")
imputer.fit(allpred_df)
allpred_df = imputer.transform(allpred_df)

# Average CV score on the training set was: 0.6283903675538657
exported_pipeline_HP = make_pipeline(
    MinMaxScaler(),
    RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.2, min_samples_leaf=6, 
                           min_samples_split=4, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline_HP.steps, 'random_state', 42)

exported_pipeline_HP.fit(allpred_df, high_performer_df)
score = exported_pipeline_HP.score(allpred_df, high_performer_df)
results = exported_pipeline_HP.predict(allpred_df)


In [12]:
print(score)
print(results)

0.9533586818757921
[0. 1. 0. ... 1. 0. 0.]


### Retained

In [13]:
retained_df = df.loc[:, "Retained"].dropna()
retained_notna = df['Retained'].notna()

In [14]:
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][retained_notna]

In [16]:
pipeline_optimizer = TPOTClassifier(generations=2, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(allpred_df, retained_df)
pipeline_optimizer.export('tpot_exported_pipeline_R.py')

NameError: name 'TPOTClassifier' is not defined

In [15]:
print(pipeline_optimizer.score(allpred_df, retained_df))

NameError: name 'pipeline_optimizer' is not defined

### From TPOT Export

In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer

retained_df = df.loc[:, "Retained"].dropna()
retained_notna = df['Retained'].notna()
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][retained_notna]

# NOTE: Make sure that the outcome column is labeled 'target' in the data file

imputer = SimpleImputer(strategy="median")
imputer.fit(allpred_df)
allpred_df = imputer.transform(allpred_df)


# Average CV score on the training set was: 0.5682511868065822
exported_pipeline_R = XGBClassifier(learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=100, n_jobs=-1, 
                                    subsample=0.55, verbosity=0)
# Fix random state in exported estimator
if hasattr(exported_pipeline_R, 'random_state'):
    setattr(exported_pipeline_R, 'random_state', 42)

exported_pipeline_R.fit(allpred_df, retained_df)
results = exported_pipeline_R.predict_proba(allpred_df)




In [34]:
print(results)

[[0.58291405 0.41708595]
 [0.60405684 0.3959432 ]
 [0.44367045 0.55632955]
 ...
 [0.6540057  0.3459943 ]
 [0.4143631  0.5856369 ]
 [0.58776313 0.41223687]]


### Protected Group

In [35]:
protected_group_df = df.loc[:, "Protected_Group"].dropna()
protected_group_notna = df['Protected_Group'].notna()

In [36]:
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][protected_group_notna]

In [23]:
pipeline_optimizer = TPOTClassifier(generations=3, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(allpred_df, protected_group_df)
pipeline_optimizer.export('tpot_exported_pipeline_PG.py')

Imputing missing values in feature set


HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=80.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.718380768794648

Generation 2 - Current best internal CV score: 0.7207166345390634


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(MinMaxScaler(input_matrix), learning_rate=0.1, max_depth=9, min_child_weight=17, n_estimators=100, n_jobs=1, subsample=0.7000000000000001, verbosity=0)


In [24]:
print(pipeline_optimizer.score(allpred_df, protected_group_df))

Imputing missing values in feature set
0.8376686699172242


### From TPOT Export

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from tpot.export_utils import set_param_recursive

protected_group_df = df.loc[:, "Protected_Group"].dropna()
protected_group_notna = df['Protected_Group'].notna()
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][protected_group_notna]

# NOTE: Make sure that the outcome column is labeled 'target' in the data file

imputer = SimpleImputer(strategy="median")
imputer.fit(allpred_df)
allpred_df = imputer.transform(allpred_df)


# Average CV score on the training set was: 0.7340968363760063
exported_pipeline_PG = make_pipeline(
    MinMaxScaler(),
    XGBClassifier(learning_rate=0.1, max_depth=9, min_child_weight=17, n_estimators=100, n_jobs=-1, 
                  subsample=0.7000000000000001, verbosity=0)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline_PG.steps, 'random_state', 42)

exported_pipeline_PG.fit(allpred_df, protected_group_df)
results = exported_pipeline_PG.predict_proba(allpred_df)




In [38]:
print(results)

[[0.79233044 0.20766957]
 [0.5448779  0.45512208]
 [0.57547736 0.42452264]
 ...
 [0.5306227  0.4693773 ]
 [0.4390447  0.5609553 ]
 [0.3182673  0.6817327 ]]


### High Performer and Retained

In [39]:
HPandR_df = df.loc[:, "High_Performer":"Retained"][high_peformer_notna]
HPandR_df['HPandR'] = (HPandR_df['High_Performer'] + HPandR_df['Retained'])/2.
HPandR_df.loc[HPandR_df['HPandR'] <= .5, 'HPandR'] = 0.
HPandR_df = HPandR_df.loc[:, "HPandR"].dropna()
HPandR_notna = HPandR_df

In [40]:
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][high_peformer_notna]

In [36]:
pipeline_optimizer = TPOTClassifier(generations=3, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(allpred_df, HPandR_df)
pipeline_optimizer.export('tpot_exported_pipeline_HPandR.py')

Imputing missing values in feature set


HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=80.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.7470215462610901

Generation 2 - Current best internal CV score: 0.747148288973384

Generation 3 - Current best internal CV score: 0.747148288973384

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=1.0, min_samples_leaf=19, min_samples_split=5, n_estimators=100)


In [37]:
print(pipeline_optimizer.score(allpred_df, HPandR_df))

Imputing missing values in feature set
0.7495564005069708


### From TPOT Export

In [41]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

HPandR_df = df.loc[:, "High_Performer":"Retained"][high_peformer_notna]
HPandR_df['HPandR'] = (HPandR_df['High_Performer'] + HPandR_df['Retained'])/2.
HPandR_df.loc[HPandR_df['HPandR'] <= .5, 'HPandR'] = 0.
HPandR_df = HPandR_df.loc[:, "HPandR"].dropna()
HPandR_notna = HPandR_df
allpred_df = df.loc[:, "SJ_Most_1":"PScale13_Q5"][high_peformer_notna]

# NOTE: Make sure that the outcome column is labeled 'target' in the data file

imputer = SimpleImputer(strategy="median")
imputer.fit(allpred_df)
allpred_df = imputer.transform(allpred_df)


# Average CV score on the training set was: 0.747148288973384
exported_pipeline_HPandR = ExtraTreesClassifier(bootstrap=True, criterion="entropy", max_features=1.0, 
                                                min_samples_leaf=19, min_samples_split=5, n_estimators=100)
# Fix random state in exported estimator
if hasattr(exported_pipeline_HPandR, 'random_state'):
    setattr(exported_pipeline_HPandR, 'random_state', 42)
    
exported_pipeline_HPandR.fit(allpred_df, HPandR_df)
results = exported_pipeline_HPandR.predict_proba(allpred_df)


In [42]:
print(results)

[[0.60382349 0.39617651]
 [0.64098799 0.35901201]
 [0.76523633 0.23476367]
 ...
 [0.70109741 0.29890259]
 [0.92589208 0.07410792]
 [0.72206295 0.27793705]]


# Model Evaluation

Create df with all required info

In [43]:
evaluate_df = df.dropna()
evaluate_df['HPandR'] = (evaluate_df['High_Performer'] + evaluate_df['Retained'])/2.
evaluate_df.loc[evaluate_df['HPandR'] <= .5, 'HPandR'] = 0.
#evaluate_df['HPandR']
evaluate_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluate_df['HPandR'] = (evaluate_df['High_Performer'] + evaluate_df['Retained'])/2.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,UNIQUE_ID,Overall_Rating,Technical_Skills,Teamwork,Customer_Service,Hire_Again,High_Performer,Protected_Group,Retained,SJ_Most_1,...,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split,HPandR
0,245021089,3.0,3.0,4.0,4.0,4.0,0.0,0.0,1,3.0,...,1.0,3.0,4.0,1.0,2.0,3.0,2.0,1.0,train,0.0
1,245181465,5.0,5.0,5.0,5.0,5.0,1.0,1.0,0,3.0,...,1.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0,train,0.0
2,229682665,3.0,3.0,3.0,3.0,4.0,0.0,1.0,0,2.0,...,1.0,4.0,4.0,1.0,1.0,4.0,4.0,4.0,train,0.0
3,245174982,4.0,4.0,4.0,4.0,5.0,1.0,0.0,1,2.0,...,1.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,train,1.0
4,244979030,2.0,2.0,3.0,2.0,3.0,0.0,0.0,1,3.0,...,1.0,4.0,4.0,2.0,1.0,4.0,3.0,2.0,train,0.0


In [44]:
evaluate_test = evaluate_df.loc[:, "SJ_Most_1":"PScale13_Q5"]

Extract Probabilities TPOT Pipelines

In [45]:
#Create dataframe for use with TPOT Pipelines

evaluate_test_TPOT = evaluate_test

imputer = SimpleImputer(strategy="median")
imputer.fit(evaluate_test_TPOT)
evaluate_test_TPOT = imputer.transform(evaluate_test_TPOT)


In [46]:
evaluate_df["HP_Pred"] = exported_pipeline_HP.predict_proba(evaluate_test_TPOT)[:,1]
evaluate_df["R_Pred"] = exported_pipeline_R.predict_proba(evaluate_test_TPOT)[:,1]
evaluate_df["PG_Pred"] = exported_pipeline_PG.predict_proba(evaluate_test_TPOT)[:,1]
evaluate_df["HPR_Pred"] = exported_pipeline_HPandR.predict_proba(evaluate_test_TPOT)[:,1]

evaluate_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluate_df["HP_Pred"] = exported_pipeline_HP.predict_proba(evaluate_test_TPOT)[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluate_df["R_Pred"] = exported_pipeline_R.predict_proba(evaluate_test_TPOT)[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluate_df["PG_Pred"] = exported_pip

Unnamed: 0,UNIQUE_ID,Overall_Rating,Technical_Skills,Teamwork,Customer_Service,Hire_Again,High_Performer,Protected_Group,Retained,SJ_Most_1,...,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split,HPandR,HP_Pred,R_Pred,PG_Pred,HPR_Pred
0,245021089,3.0,3.0,4.0,4.0,4.0,0.0,0.0,1,3.0,...,2.0,3.0,2.0,1.0,train,0.0,0.411337,0.417086,0.20767,0.396177
1,245181465,5.0,5.0,5.0,5.0,5.0,1.0,1.0,0,3.0,...,1.0,4.0,4.0,4.0,train,0.0,0.7426,0.395943,0.455122,0.359012
2,229682665,3.0,3.0,3.0,3.0,4.0,0.0,1.0,0,2.0,...,1.0,4.0,4.0,4.0,train,0.0,0.220399,0.55633,0.424523,0.234764
3,245174982,4.0,4.0,4.0,4.0,5.0,1.0,0.0,1,2.0,...,1.0,3.0,3.0,3.0,train,1.0,0.569891,0.624737,0.22384,0.360107
4,244979030,2.0,2.0,3.0,2.0,3.0,0.0,0.0,1,3.0,...,1.0,4.0,3.0,2.0,train,0.0,0.226158,0.554141,0.102584,0.200151


Create function to evaluate model

In [47]:
evaluate_df["HP_Pred_w"] = evaluate_df["HP_Pred"]*.628/2
evaluate_df["R_Pred_w"] = evaluate_df["R_Pred"]*.568/2
evaluate_df["PG_Pred_w"] = evaluate_df["PG_Pred"]*.09
evaluate_df["HPR_Pred_w"] = evaluate_df["HPR_Pred"]*.747

evaluate_df["Overall_Pred"] = (evaluate_df.loc[:,("HP_Pred_w","R_Pred_w","PG_Pred_w","HPR_Pred_w")].mean(axis=1))/4

evaluate_df["Pred_Hire"] = np.where(evaluate_df["Overall_Pred"] > evaluate_df["Overall_Pred"].median(),1,0)

pred_df = evaluate_df.loc[evaluate_df['Pred_Hire'] == 1]

pred_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluate_df["HP_Pred_w"] = evaluate_df["HP_Pred"]*.628/2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluate_df["R_Pred_w"] = evaluate_df["R_Pred"]*.568/2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluate_df["PG_Pred_w"] = evaluate_df["PG_Pred"]*.09
A value is trying to be set on a copy o

Unnamed: 0,UNIQUE_ID,Overall_Rating,Technical_Skills,Teamwork,Customer_Service,Hire_Again,High_Performer,Protected_Group,Retained,SJ_Most_1,...,HP_Pred,R_Pred,PG_Pred,HPR_Pred,HP_Pred_w,R_Pred_w,PG_Pred_w,HPR_Pred_w,Overall_Pred,Pred_Hire
0,245021089,3.0,3.0,4.0,4.0,4.0,0.0,0.0,1,3.0,...,0.411337,0.417086,0.20767,0.396177,0.12916,0.118452,0.01869,0.295944,0.03514,1
1,245181465,5.0,5.0,5.0,5.0,5.0,1.0,1.0,0,3.0,...,0.7426,0.395943,0.455122,0.359012,0.233176,0.112448,0.040961,0.268182,0.040923,1
3,245174982,4.0,4.0,4.0,4.0,5.0,1.0,0.0,1,2.0,...,0.569891,0.624737,0.22384,0.360107,0.178946,0.177425,0.020146,0.269,0.040345,1
5,244953017,4.0,4.0,5.0,4.0,4.0,1.0,1.0,1,3.0,...,0.696172,0.377083,0.428292,0.397412,0.218598,0.107092,0.038546,0.296867,0.041319,1
8,245082123,5.0,4.0,5.0,4.0,5.0,1.0,0.0,1,3.0,...,0.591218,0.515041,0.078894,0.356775,0.185643,0.146272,0.0071,0.266511,0.037845,1


In [48]:
Perc_THP = pred_df["High_Performer"].sum()/evaluate_df["High_Performer"].sum() #Percentage_of_true_top_performers_hired
Perc_TR = pred_df["Retained"].sum()/evaluate_df["Retained"].sum() #Percentage_of_true_retained_hired
Perc_THPR = pred_df["HPandR"].sum()/evaluate_df["HPandR"].sum() #Percentage_of_true_retained_top_performers_hired

Overall_Accuracy = Perc_THP*25 + Perc_TR*25 + Perc_THPR*50

PG_HR = pred_df.loc[pred_df['Protected_Group'] == 1, 'Pred_Hire'].sum()/evaluate_df["Protected_Group"].sum() #Protected Group Hiring Rate
MG_HR = pred_df.loc[pred_df['Protected_Group'] == 0, 'Pred_Hire'].sum()/(evaluate_df['Protected_Group'] == 0).astype(int).sum() #Majority Group Hiring Rate
AI_R = PG_HR/MG_HR #Adverse Impact Ratio

Unfairness = abs(1-AI_R)*100

Final_Score = Overall_Accuracy - Unfairness

print("Results")
print("Percentage_of_true_top_performers_hired", Perc_THP)
print("Percentage_of_true_retained_hired", Perc_TR)
print("Percentage_of_true_retained_top_performers_hired", Perc_THPR)
print("Overall Accuracy", Overall_Accuracy)
print("Protected Group Hire Rate", PG_HR)
print("Majority Group Hire Rate", MG_HR)
print("Adverse Impact Ratio", AI_R)
print("Unfairness Score", Unfairness)
print("Final Score", Final_Score)

Results
Percentage_of_true_top_performers_hired 0.9360916143332102
Percentage_of_true_retained_hired 0.5512394582162023
Percentage_of_true_retained_top_performers_hired 0.9865575686732905
Overall Accuracy 86.51115524739984
Protected Group Hire Rate 0.500618556701031
Majority Group Hire Rate 0.4995448338643605
Adverse Impact Ratio 1.0021494023436583
Unfairness Score 0.21494023436583465
Final Score 86.296215013034


# Dev Submission Code

### Read in Dev Set

In [64]:
dev_df = pd.read_csv("./data/participant_dev.csv", na_values=[" "])
dev_df.head()

Unnamed: 0,UNIQUE_ID,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,SJ_Time_3,...,PScale12_Q1,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split
0,245203329,2.0,4.0,42.0,3.0,2.0,61.0,1.0,3.0,40.0,...,1.0,1.0,4.0,4.0,2.0,1.0,4.0,2.0,3.0,dev
1,245255130,3.0,1.0,117.0,3.0,4.0,89.0,1.0,4.0,79.0,...,3.0,2.0,2.0,3.0,1.0,1.0,4.0,4.0,2.0,dev
2,245127268,3.0,4.0,40.0,1.0,4.0,114.0,1.0,3.0,69.0,...,3.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0,2.0,dev
3,245117275,2.0,4.0,92.0,3.0,2.0,65.0,1.0,3.0,126.0,...,1.0,1.0,3.0,4.0,1.0,1.0,4.0,3.0,1.0,dev
4,231615474,3.0,1.0,97.0,3.0,4.0,46.0,1.0,4.0,43.0,...,1.0,1.0,4.0,4.0,1.0,1.0,4.0,4.0,1.0,dev


In [65]:
dev_df_test = dev_df.fillna(dev_df.median())
dev_df_test = dev_df_test.loc[:, "SJ_Most_1":"PScale13_Q5"]


Extract Probabilities TPOT Pipeline

In [66]:
#Create RFE dataframes for each outcome

dev_df_test_TPOT = dev_df_test

imputer = SimpleImputer(strategy="median")
imputer.fit(dev_df_test_TPOT)
dev_df_test_TPOT = imputer.transform(dev_df_test_TPOT)


In [67]:
dev_df["HP_Pred"] = exported_pipeline_HP.predict_proba(dev_df_test_TPOT)[:,1]
dev_df["R_Pred"] = exported_pipeline_R.predict_proba(dev_df_test_TPOT)[:,1]
dev_df["PG_Pred"] = exported_pipeline_PG.predict_proba(dev_df_test_TPOT)[:,1]
dev_df["HPR_Pred"] = exported_pipeline_HPandR.predict_proba(dev_df_test_TPOT)[:,1]

dev_df.head()


Unnamed: 0,UNIQUE_ID,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,SJ_Time_3,...,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split,HP_Pred,R_Pred,PG_Pred,HPR_Pred
0,245203329,2.0,4.0,42.0,3.0,2.0,61.0,1.0,3.0,40.0,...,2.0,1.0,4.0,2.0,3.0,dev,0.339723,0.479701,0.269281,0.250754
1,245255130,3.0,1.0,117.0,3.0,4.0,89.0,1.0,4.0,79.0,...,1.0,1.0,4.0,4.0,2.0,dev,0.396693,0.583819,0.15666,0.366399
2,245127268,3.0,4.0,40.0,1.0,4.0,114.0,1.0,3.0,69.0,...,2.0,2.0,3.0,3.0,2.0,dev,0.410277,0.505876,0.100963,0.247554
3,245117275,2.0,4.0,92.0,3.0,2.0,65.0,1.0,3.0,126.0,...,1.0,1.0,4.0,3.0,1.0,dev,0.303018,0.565283,0.643197,0.236278
4,231615474,3.0,1.0,97.0,3.0,4.0,46.0,1.0,4.0,43.0,...,1.0,1.0,4.0,4.0,1.0,dev,0.523692,0.550764,0.455269,0.370944


In [68]:
dev_df["HP_Pred_w"] = dev_df["HP_Pred"]*.628/2
dev_df["R_Pred_w"] = dev_df["R_Pred"]*.568/2
dev_df["PG_Pred_w"] = dev_df["PG_Pred"]*.055
dev_df["HPR_Pred_w"] = dev_df["HPR_Pred"]*.747

dev_df["Overall_Pred"] = (dev_df.loc[:,("HP_Pred_w","R_Pred_w","PG_Pred_w","HPR_Pred_w")].mean(axis=1))/4

dev_df["Hire"] = np.where(dev_df["Overall_Pred"] > dev_df["Overall_Pred"].median(),1,0)

dev_df.head()


Unnamed: 0,UNIQUE_ID,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,SJ_Time_3,...,HP_Pred,R_Pred,PG_Pred,HPR_Pred,HP_Pred_w,R_Pred_w,PG_Pred_w,HPR_Pred_w,Overall_Pred,Hire
0,245203329,2.0,4.0,42.0,3.0,2.0,61.0,1.0,3.0,40.0,...,0.339723,0.479701,0.269281,0.250754,0.106673,0.136235,0.01481,0.187313,0.027814,0
1,245255130,3.0,1.0,117.0,3.0,4.0,89.0,1.0,4.0,79.0,...,0.396693,0.583819,0.15666,0.366399,0.124561,0.165805,0.008616,0.2737,0.035793,1
2,245127268,3.0,4.0,40.0,1.0,4.0,114.0,1.0,3.0,69.0,...,0.410277,0.505876,0.100963,0.247554,0.128827,0.143669,0.005553,0.184923,0.028936,0
3,245117275,2.0,4.0,92.0,3.0,2.0,65.0,1.0,3.0,126.0,...,0.303018,0.565283,0.643197,0.236278,0.095148,0.16054,0.035376,0.1765,0.029223,1
4,231615474,3.0,1.0,97.0,3.0,4.0,46.0,1.0,4.0,43.0,...,0.523692,0.550764,0.455269,0.370944,0.164439,0.156417,0.02504,0.277095,0.038937,1


In [69]:
dev_df_submission = dev_df[["UNIQUE_ID", "Hire"]]
dev_df_submission.head()
dev_df_submission.to_csv(r'.TeamSubmissionTPOT_AltWeights3DevCheck_031321.csv', index = False)

# Final Submission Code

### Read in Test Set

In [56]:
test_df = pd.read_csv("./data/participant_test.csv", na_values=[" "])
test_df.head()

Unnamed: 0,UNIQUE_ID,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,SJ_Time_3,...,PScale12_Q1,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split
0,245091493,3.0,1.0,89.0,1.0,2.0,124.0,1.0,3.0,139.0,...,1.0,1.0,4.0,4.0,1.0,1.0,4.0,3.0,1.0,test
1,245032659,2.0,4.0,53.0,4.0,3.0,53.0,4.0,1.0,56.0,...,3.0,2.0,1.0,3.0,2.0,1.0,4.0,2.0,2.0,test
2,245060445,3.0,4.0,56.0,4.0,1.0,34.0,1.0,3.0,42.0,...,1.0,1.0,4.0,4.0,1.0,1.0,4.0,4.0,1.0,test
3,244944546,3.0,2.0,153.0,3.0,2.0,97.0,1.0,4.0,76.0,...,1.0,1.0,4.0,4.0,1.0,1.0,4.0,3.0,1.0,test
4,245136672,3.0,1.0,80.0,3.0,4.0,33.0,1.0,2.0,65.0,...,1.0,1.0,2.0,4.0,4.0,1.0,4.0,3.0,2.0,test


In [57]:
test_df_test = test_df.fillna(test_df.median())
test_df_test = test_df.loc[:, "SJ_Most_1":"PScale13_Q5"]


Extract Probabilities TPOT Pipeline

In [58]:
#Create RFE dataframes for each outcome

test_df_test_TPOT = test_df_test

imputer = SimpleImputer(strategy="median")
imputer.fit(test_df_test_TPOT)
test_df_test_TPOT = imputer.transform(test_df_test_TPOT)


In [59]:
test_df["HP_Pred"] = exported_pipeline_HP.predict_proba(test_df_test_TPOT)[:,1]
test_df["R_Pred"] = exported_pipeline_R.predict_proba(test_df_test_TPOT)[:,1]
test_df["PG_Pred"] = exported_pipeline_PG.predict_proba(test_df_test_TPOT)[:,1]
test_df["HPR_Pred"] = exported_pipeline_HPandR.predict_proba(test_df_test_TPOT)[:,1]

test_df.head()


Unnamed: 0,UNIQUE_ID,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,SJ_Time_3,...,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split,HP_Pred,R_Pred,PG_Pred,HPR_Pred
0,245091493,3.0,1.0,89.0,1.0,2.0,124.0,1.0,3.0,139.0,...,1.0,1.0,4.0,3.0,1.0,test,0.233504,0.53255,0.135846,0.144419
1,245032659,2.0,4.0,53.0,4.0,3.0,53.0,4.0,1.0,56.0,...,2.0,1.0,4.0,2.0,2.0,test,0.293979,0.580103,0.170358,0.182028
2,245060445,3.0,4.0,56.0,4.0,1.0,34.0,1.0,3.0,42.0,...,1.0,1.0,4.0,4.0,1.0,test,0.530996,0.35426,0.683365,0.394011
3,244944546,3.0,2.0,153.0,3.0,2.0,97.0,1.0,4.0,76.0,...,1.0,1.0,4.0,3.0,1.0,test,0.322425,0.496677,0.698919,0.234838
4,245136672,3.0,1.0,80.0,3.0,4.0,33.0,1.0,2.0,65.0,...,4.0,1.0,4.0,3.0,2.0,test,0.254778,0.561838,0.510375,0.13042


Test Submission 1 - PG Weight = .045, Score =  58.7976; 
Test Submission 2 - PG Weight = .09, Score =  54.0574; 
Test Submission 3 - PG Weight = .00, Score =  46.3817; 

Submissions 1 through 3 were plotted in excel, a quadratic trendline was fit, and wolframalpha.com was used to solve for the maximum point on the trendline (.055). This was input as the final weight for the Protected Group model for the Final Test Submission. 

Test Submission 4 - PG weight = .055, Score =  ; 

In [62]:
test_df["HP_Pred_w"] = test_df["HP_Pred"]*.628/2
test_df["R_Pred_w"] = test_df["R_Pred"]*.568/2
test_df["PG_Pred_w"] = test_df["PG_Pred"]*.055
test_df["HPR_Pred_w"] = test_df["HPR_Pred"]*.747

test_df["Overall_Pred"] = (test_df.loc[:,("HP_Pred_w","R_Pred_w","PG_Pred_w","HPR_Pred_w")].mean(axis=1))/4

test_df["Hire"] = np.where(test_df["Overall_Pred"] > test_df["Overall_Pred"].median(),1,0)

test_df.head()


Unnamed: 0,UNIQUE_ID,SJ_Most_1,SJ_Least_1,SJ_Time_1,SJ_Most_2,SJ_Least_2,SJ_Time_2,SJ_Most_3,SJ_Least_3,SJ_Time_3,...,HP_Pred,R_Pred,PG_Pred,HPR_Pred,HP_Pred_w,R_Pred_w,PG_Pred_w,HPR_Pred_w,Overall_Pred,Hire
0,245091493,3.0,1.0,89.0,1.0,2.0,124.0,1.0,3.0,139.0,...,0.233504,0.53255,0.135846,0.144419,0.07332,0.151244,0.007472,0.107881,0.021245,0
1,245032659,2.0,4.0,53.0,4.0,3.0,53.0,4.0,1.0,56.0,...,0.293979,0.580103,0.170358,0.182028,0.092309,0.164749,0.00937,0.135975,0.02515,0
2,245060445,3.0,4.0,56.0,4.0,1.0,34.0,1.0,3.0,42.0,...,0.530996,0.35426,0.683365,0.394011,0.166733,0.10061,0.037585,0.294326,0.037453,1
3,244944546,3.0,2.0,153.0,3.0,2.0,97.0,1.0,4.0,76.0,...,0.322425,0.496677,0.698919,0.234838,0.101242,0.141056,0.038441,0.175424,0.02851,0
4,245136672,3.0,1.0,80.0,3.0,4.0,33.0,1.0,2.0,65.0,...,0.254778,0.561838,0.510375,0.13042,0.08,0.159562,0.028071,0.097424,0.022816,0


In [63]:
test_df_submission = test_df[["UNIQUE_ID", "Hire"]]
test_df_submission.head()
test_df_submission.to_csv(r'.TestSubmissionTPOT_AltWeights3v3_031321.csv', index = False)