In [1]:
import numpy as np
import pandas as pd
from dirty_cat import datasets

# Fetch the employee salaries dataset through dirty_cat's dataset helpers.
# The returned object is indexed with the 'data' key in the next cell.
employee_salaries = datasets.fetch_employee_salaries()

Here are the first five rows of the full dataset.

In [2]:
# Pull the table out of the fetched object, then preview it
# (`head()` shows five rows by default).
data = employee_salaries['data']
data.head()

Unnamed: 0,full_name,gender,2016_gross_pay_received,2016_overtime_pay,department,department_name,division,assignment_category,employee_position_title,underfilled_job_title,date_first_hired,year_first_hired,Current Annual Salary
0,"Aarhus, Pam J.",F,71225.98,416.1,POL,Department of Police,MSB Information Mgmt and Tech Division Records...,Fulltime-Regular,Office Services Coordinator,,09/22/1986,1986.0,69222.18
1,"Aaron, David J.",M,103088.48,3326.19,POL,Department of Police,ISB Major Crimes Division Fugitive Section,Fulltime-Regular,Master Police Officer,,09/12/1988,1988.0,97392.47
2,"Aaron, Marsha M.",F,107000.24,1353.32,HHS,Department of Health and Human Services,Adult Protective and Case Management Services,Fulltime-Regular,Social Worker IV,,11/19/1989,1989.0,104717.28
3,"Ababio, Godfred A.",M,57819.04,3423.07,COR,Correction and Rehabilitation,PRRS Facility and Security,Fulltime-Regular,Resident Supervisor II,,05/05/2014,2014.0,52734.57
4,"Ababu, Essayas",M,95815.17,,HCA,Department of Housing and Community Affairs,Affordable Housing Programs,Fulltime-Regular,Planning Specialist III,,03/05/2007,2007.0,93396.0


The `assignment_category` column has only two distinct values.

In [3]:
# Count how often each assignment category occurs, listed alphabetically by category.
assignment_counts = data['assignment_category'].value_counts()
print(assignment_counts.sort_index())

Fulltime-Regular    8394
Parttime-Regular     834
Name: assignment_category, dtype: int64


The `employee_position_title` column, by contrast, has *many*.

In [14]:
# Number of distinct (non-missing) job titles.
data['employee_position_title'].nunique()

385

Let's convert this to a machine learning dataset. Note that we'll definitely drop gender: the column stays in the frame for now, but none of the pipelines below select it as a feature.

In [40]:
target_column = 'Current Annual Salary'

# Columns kept for modelling (order determines the column order of `ml_df`).
ml_columns = [target_column, 'gender', 'year_first_hired', 'assignment_category', 'employee_position_title']
# Columns handed to the estimators as the feature frame.
feature_columns = ['employee_position_title', 'gender', 'year_first_hired', 'assignment_category']

# Restrict to the modelling columns and discard every row with a missing value in any of them.
ml_df = data[ml_columns].dropna()

# The target is used on its raw salary scale; it is not standardised.
y = ml_df[target_column].values.ravel()
X = ml_df[feature_columns]

In [6]:
# Preview the modelling frame without the gender column.
ml_df.drop(columns=['gender']).head(12)

Unnamed: 0,Current Annual Salary,year_first_hired,assignment_category,employee_position_title
0,69222.18,1986.0,Fulltime-Regular,Office Services Coordinator
1,97392.47,1988.0,Fulltime-Regular,Master Police Officer
2,104717.28,1989.0,Fulltime-Regular,Social Worker IV
3,52734.57,2014.0,Fulltime-Regular,Resident Supervisor II
4,93396.0,2007.0,Fulltime-Regular,Planning Specialist III
5,70435.0,2007.0,Fulltime-Regular,Police Officer III
6,60300.0,2016.0,Fulltime-Regular,Accountant/Auditor II
7,64788.65,2014.0,Fulltime-Regular,Administrative Specialist II
8,45261.0,2016.0,Fulltime-Regular,Firefighter/Rescuer III
9,47670.09,2007.0,Fulltime-Regular,Police Aide


Dropping gender is not enough to suggest fairness. Keep that in mind as we explore categorical featurizers.

In [41]:
import dirty_cat 

# Encode job titles with a SimilarityEncoder limited to the 200 most frequent
# categories as prototypes, then inspect the shape of the encoded matrix.
mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)
title_similarities = mod.fit_transform(data[['employee_position_title']])
title_similarities.shape

(9228, 200)

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Character n-gram (2 to 4 characters) bag-of-words over the job titles.
# `fit_transform` already learns the vocabulary, so a separate `.fit(...)` on the
# same data beforehand would only repeat the work.
cv = CountVectorizer(analyzer='char', ngram_range=(2, 4))
cv.fit_transform(ml_df['employee_position_title']).shape

(9211, 3702)

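Note the difference in the shape of the output here: the `SimilarityEncoder` produces 200 columns (one per prototype), while the character n-gram `CountVectorizer` produces 3702. The row counts differ as well (9228 vs. 9211) because the first example was fit on the full `data`, whereas the second used `ml_df`, from which rows with missing values were dropped.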

Also note that the `SimilarityEncoder` can deal with dataframes as input. The `CountVectorizer` expects 1D data containing text. Unfortunately this means that we need to construct two pipelines.

In [34]:
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams when they are displayed in the notebook.
set_config(display="diagram") 

from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

# Candidate encoders for the high-cardinality `employee_position_title` column.
method = {
    'sim_enc100': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=100),
    'sim_enc300': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=300),
    'sim_enc_all': dirty_cat.SimilarityEncoder(),
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
}


def _build_frame_pipeline(title_encoder):
    """Build the Ridge pipeline for an encoder that accepts a DataFrame column.

    The feature union has three branches: the job title passed through
    `title_encoder`, the assignment category one-hot encoded, and the hiring
    year standardised. A default `Ridge` regressor sits on top.
    """
    title_branch = Pipeline([
        ('grab', ColumnSelector(['employee_position_title'])),
        ('handle', title_encoder),
    ])
    category_branch = Pipeline([
        ('grab', ColumnSelector('assignment_category')),
        ('handle', OneHotEncoder(handle_unknown='ignore')),
    ])
    year_branch = Pipeline([
        ('grab', ColumnSelector('year_first_hired')),
        ('scale', StandardScaler()),
    ])
    features = FeatureUnion([
        ('cat', title_branch),
        ('one-hot', category_branch),
        ('floats', year_branch),
    ])
    return Pipeline([('split', features), ('mod', Ridge())])


# One cross-validation summary frame per encoder is collected here.
results = []

for key, encoder in method.items():
    pipe = _build_frame_pipeline(encoder)
    # An empty parameter grid means a single candidate: the grid search is used
    # purely as a 10-fold cross-validation harness that reports both metrics.
    grid = GridSearchCV(
        pipe,
        cv=10,
        param_grid={},
        scoring=['r2', 'neg_mean_absolute_error'],
        refit='r2',
        n_jobs=-1,
    )
    print(key)
    grid.fit(X, y)
    results.append(pd.DataFrame(grid.cv_results_).assign(key=key))

sim_enc100
sim_enc300
sim_enc_all
one-hot


In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Text vectorizers to compare: word counts and character n-gram (2 to 4) counts.
method = {
    'cv': CountVectorizer(),
    'cv_ngram': CountVectorizer(analyzer='char', ngram_range=(2, 4)),
}


def _build_text_pipeline(title_vectorizer):
    """Build the Ridge pipeline for a vectorizer that expects 1D text input.

    Same three-branch layout as the DataFrame-based pipelines, except that the
    job-title branch first turns the column into a plain list of strings
    before handing it to `title_vectorizer`.
    """
    title_branch = Pipeline([
        ('listify', FunctionTransformer(lambda d: list(d['employee_position_title']))),
        ('handle', title_vectorizer),
    ])
    category_branch = Pipeline([
        ('grab', ColumnSelector('assignment_category')),
        ('handle', OneHotEncoder(handle_unknown='ignore')),
    ])
    year_branch = Pipeline([
        ('grab', ColumnSelector('year_first_hired')),
        ('scale', StandardScaler()),
    ])
    features = FeatureUnion([
        ('cat', title_branch),
        ('one-hot', category_branch),
        ('floats', year_branch),
    ])
    return Pipeline([('split', features), ('mod', Ridge())])


for key, encoder in method.items():
    pipe = _build_text_pipeline(encoder)
    # Empty parameter grid: a single candidate evaluated with 10-fold cross-validation.
    grid = GridSearchCV(
        pipe,
        cv=10,
        param_grid={},
        scoring=['r2', 'neg_mean_absolute_error'],
        refit='r2',
        n_jobs=-1,
    )
    print(key)
    grid.fit(X, y)
    # Appends to the `results` list created in the previous comparison cell.
    results.append(pd.DataFrame(grid.cv_results_).assign(key=key))

cv
cv_ngram


In [44]:
# Display the most recently fitted grid search object
# (presumably the last one from the loop above — depends on cell execution order).
grid

Here are the results.

In [37]:
import matplotlib.pylab as plt 

# Stack the cross-validation summaries of all encoders, keeping the two mean
# test scores and the encoder label.
plt_df = pd.concat(results)[['mean_test_neg_mean_absolute_error', 'mean_test_r2', 'key']]

# Rank encoders by mean R². Every per-encoder summary has a single row labelled 0,
# so the old index carries no information: drop it rather than turning it into
# an all-zero `index` column.
plt_df.sort_values('mean_test_r2', ascending=False).reset_index(drop=True)

Unnamed: 0,index,mean_test_neg_mean_absolute_error,mean_test_r2,key
0,0,-6282.144808,0.901977,cv_ngram
1,0,-6323.948614,0.901758,sim_enc_all
2,0,-6826.029665,0.885764,cv
3,0,-6580.573745,0.874806,sim_enc300
4,0,-6396.464374,0.861522,one-hot
5,0,-7790.967702,0.788693,sim_enc100
