# Predicting protected feature
Robin Burke, March 16, 2020

The idea here is to predict the protected feature so that, if we have segregated predictors, we'll know which instances should use which one.

In [35]:
import matplotlib.pyplot as plt
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale

In [2]:
# Read the training set; `cols` keeps the full column index so later cells
# can select feature groups by name.
data = pd.read_csv('Data/TrainingData.csv')
cols = data.columns

# Report (rows, columns), then preview the first rows as the cell output.
print(data.shape)
data.head()

(44102, 130)


Unnamed: 0,UNIQUE_ID,Overall_Rating,Technical_Skills,Teamwork,Customer_Service,Hire_Again,High_Performer,Protected_Group,Retained,SJ_Most_1,...,PScale12_Q1,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split
0,245021089,3.0,3.0,4.0,4.0,4.0,0.0,0.0,1,3.0,...,1.0,1.0,3.0,4.0,1.0,2.0,3.0,2.0,1.0,train
1,245181465,5.0,5.0,5.0,5.0,5.0,1.0,1.0,0,3.0,...,1.0,1.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0,train
2,229682665,3.0,3.0,3.0,3.0,4.0,0.0,1.0,0,2.0,...,1.0,1.0,4.0,4.0,1.0,1.0,4.0,4.0,4.0,train
3,245174982,4.0,4.0,4.0,4.0,5.0,1.0,0.0,1,2.0,...,1.0,1.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,train
4,244979030,2.0,2.0,3.0,2.0,3.0,0.0,0.0,1,3.0,...,2.0,1.0,4.0,4.0,2.0,1.0,4.0,3.0,2.0,train


In [11]:
# Keep only rows that have a Protected_Group value; it is the target
# the models below try to predict.
data = data.dropna(subset=['Protected_Group'])
data.shape

(44095, 130)

In [16]:
# Predictors are every column whose name contains 'Biodata_'; the target
# column is appended last so the predictors occupy the leading positions.
bio_cols = [name for name in cols if 'Biodata_' in name] + ['Protected_Group']

# Drop any row with a missing value in these columns, then cast everything
# to int (the raw CSV stores the values as floats).
bio_df = data.loc[:, bio_cols].dropna().astype('int')

bio_df.head()

Unnamed: 0,Biodata_01,Biodata_02,Biodata_03,Biodata_04,Biodata_05,Biodata_06,Biodata_07,Biodata_08,Biodata_09,Biodata_10,...,Biodata_12,Biodata_13,Biodata_14,Biodata_15,Biodata_16,Biodata_17,Biodata_18,Biodata_19,Biodata_20,Protected_Group
0,2,3,2,2,2,1,3,7,2,5,...,2,4,1,2,5,2,1,4,1,0
1,2,5,5,2,3,1,1,5,2,6,...,1,4,3,2,5,2,4,6,1,1
2,2,3,1,1,2,2,2,8,5,7,...,2,1,2,2,5,2,5,1,1,1
3,2,1,2,7,1,1,2,1,7,1,...,2,4,3,2,5,7,7,2,1,0
4,3,1,6,6,1,3,2,1,1,1,...,2,4,4,3,1,3,1,3,1,0


In [17]:
# Standardise the 20 Biodata predictors (the leading columns of bio_df;
# Protected_Group is last and is excluded). The result is a NumPy array.
bio_sc = scale(bio_df.iloc[:, :20])
bio_sc

array([[-0.05119484, -0.14159876, -1.15584344, ..., -1.76582747,
         0.18721026, -0.29641447],
       [-0.05119484,  1.02221612,  0.76082483, ..., -0.19518686,
         1.34301528, -0.29641447],
       [-0.05119484, -0.14159876, -1.79473286, ...,  0.32836001,
        -1.54649728, -0.29641447],
       ...,
       [-1.09932421,  1.02221612,  0.76082483, ..., -0.71873373,
         1.9209178 , -0.29641447],
       [-0.05119484,  2.186031  , -0.51695402, ...,  0.32836001,
         1.34301528, -0.29641447],
       [-0.05119484,  0.44030868,  0.76082483, ..., -0.71873373,
         0.76511277, -0.29641447]])

In [19]:
# Logistic regression on the standardised Biodata features; the 5-fold CV
# inside LogisticRegressionCV is used to choose the regularisation strength.
clf = LogisticRegressionCV(cv=5, random_state=20200314)
clf.fit(bio_sc, bio_df['Protected_Group'])

# NOTE: scored on the same rows the model was fit on, so this is training
# accuracy rather than a held-out estimate.
clf.score(bio_sc, bio_df['Protected_Group'])

0.6926365445566448

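Not that good. Note that this is accuracy measured on the same data the model was fit on, so it is an optimistic estimate.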

In [31]:
# Shallow random forest on the standardised Biodata features, evaluated with
# 5-fold cross-validation.
clf = RandomForestClassifier(max_depth=2, random_state=20200314)
scores = cross_val_score(
    clf,
    bio_sc,
    bio_df['Protected_Group'],
    cv=5,
)
# Mean fold accuracy, with twice the standard deviation across folds as spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.69 (+/- 0.00)


In [41]:
# Show the unscaled integer Biodata predictors (first 20 columns, target excluded).
bio_df.iloc[:, :20]

Unnamed: 0,Biodata_01,Biodata_02,Biodata_03,Biodata_04,Biodata_05,Biodata_06,Biodata_07,Biodata_08,Biodata_09,Biodata_10,Biodata_11,Biodata_12,Biodata_13,Biodata_14,Biodata_15,Biodata_16,Biodata_17,Biodata_18,Biodata_19,Biodata_20
0,2,3,2,2,2,1,3,7,2,5,2,2,4,1,2,5,2,1,4,1
1,2,5,5,2,3,1,1,5,2,6,1,1,4,3,2,5,2,4,6,1
2,2,3,1,1,2,2,2,8,5,7,1,2,1,2,2,5,2,5,1,1
3,2,1,2,7,1,1,2,1,7,1,1,2,4,3,2,5,7,7,2,1
4,3,1,6,6,1,3,2,1,1,1,1,2,4,4,3,1,3,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44097,2,1,3,7,1,1,2,1,7,1,1,1,4,2,2,5,7,7,4,1
44098,1,7,5,1,2,1,1,7,4,5,1,1,4,1,1,5,1,6,4,1
44099,1,5,5,5,2,1,1,6,3,3,1,1,4,1,1,5,1,3,7,1
44100,2,7,3,1,1,2,1,7,5,6,1,1,4,2,2,5,1,5,6,1


In [43]:
# CatBoost on the raw integer Biodata columns, with all 20 predictor positions
# declared categorical instead of being treated as numeric.
# NOTE(review): only 3 boosting iterations are used; confirm this is intended
# before reading much into the score.
clf = CatBoostClassifier(
    iterations=3,
    depth=3,
    learning_rate=0.1,
    loss_function='Logloss',
    cat_features=range(20),
    verbose=False,
)
scores = cross_val_score(clf, bio_df.iloc[:, :20], bio_df['Protected_Group'], cv=5)
# Mean fold accuracy, with twice the standard deviation across folds as spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.69 (+/- 0.00)


Not any better treating the features as categorical.

## Try with PCA features

In [32]:
# Project the standardised Biodata features onto their first 10 principal
# components. PCA comes from sklearn.decomposition (see the import cell).
# random_state matches the seed used by the other estimators in this notebook:
# depending on the scikit-learn version, the default svd_solver='auto' can pick
# the randomized solver for data of this shape, which is not reproducible
# without a fixed seed.
pca = PCA(n_components=10, random_state=20200314)
pca.fit(bio_sc)

# Fraction of total variance retained by the 10 components.
pca.explained_variance_ratio_.sum()

0.8266856884500942

In [33]:
# Map the standardised Biodata features into the fitted PCA space
# (one column per retained component).
bio_xf = pca.transform(bio_sc)

In [34]:
# Same logistic-regression setup as before, but fit on the PCA-transformed
# features instead of the full standardised set.
clf = LogisticRegressionCV(cv=5, random_state=20200314)
clf.fit(bio_xf, bio_df['Protected_Group'])

# NOTE: scored on the same rows the model was fit on, so this is training
# accuracy rather than a held-out estimate.
clf.score(bio_xf, bio_df['Protected_Group'])

0.6916046288788973

Not any better, but not any worse either.

## Try with additional features
Need to wait until the personality scale clusters are determined.