In [66]:
import numpy as np
import pandas as pd
from dirty_cat import datasets
import itertools
import matplotlib.pyplot as plt

# Load the employee salaries dataset through dirty_cat's dataset helpers.
# The returned object is accessed below via its `.X` (features) and `.y` attributes.
employee_salaries = datasets.fetch_employee_salaries()

In [67]:
# `y` is taken as-is from the dataset object; `X` is restricted to the three
# columns studied in this notebook (job title, hiring year, contract type).
y = employee_salaries.y
X = employee_salaries.X.loc[
    :,
    ['employee_position_title', 'year_first_hired', 'assignment_category'],
]

In [68]:
# Preview the first five rows of the selected features.
X.head(n=5)

Unnamed: 0,employee_position_title,year_first_hired,assignment_category
0,Office Services Coordinator,1986,Fulltime-Regular
1,Master Police Officer,1988,Fulltime-Regular
2,Social Worker IV,1989,Fulltime-Regular
3,Resident Supervisor II,2014,Fulltime-Regular
4,Planning Specialist III,2007,Fulltime-Regular


In [69]:
# Frequency of each contract type (bracket access avoids relying on attribute lookup).
X['assignment_category'].value_counts()

Fulltime-Regular    8394
Parttime-Regular     834
Name: assignment_category, dtype: int64

In [70]:
# Frequency of each job title, most common first.
X['employee_position_title'].value_counts()

Police Officer III                                883
Firefighter/Rescuer III                           694
Bus Operator                                      638
Manager III                                       243
Correctional Officer III (Corporal)               228
                                                 ... 
Director Office of Intergovernmental Relations      1
Food Service Manager                                1
Executive Administrative Aide to CAO                1
Director Department of Technology Services          1
Director Department of Public Libraries             1
Name: employee_position_title, Length: 385, dtype: int64

Before building machine learning models, we need to encode the categorical variables. Some are simple: for instance, "assignment_category" has only 2 modalities (full-time or part-time). On the other hand, "employee_position_title" contains a large number of modalities (385 distinct titles), so a one-hot encoding would produce many sparse, rarely populated columns and is not really suitable. In this notebook, we will explore the **dirty_cat** library, which contains methods suited to this kind of "dirty" categorical variable.

Some titles appear far more often than others, and some appear only once. This can be a problem because a title may then occur in the training set but not in the test set, or vice versa. We also notice that several titles contain the word "director": it could be useful to group them so that they share information, rather than treating each director title as a separate, unrelated category. We are therefore going to split each title into words and build one feature per word using Scikit-Learn's CountVectorizer.

In [71]:
# Import here so this cell works under Restart & Run All: without it,
# CountVectorizer is only brought into scope by a cell further down.
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag of words over the job titles, using CountVectorizer's defaults.
# The displayed shape is (number of rows, size of the learned vocabulary).
cv = CountVectorizer().fit(X['employee_position_title'])
cv.transform(X['employee_position_title']).shape

(9228, 321)

In [72]:
# Peek at the first 10 (token -> column index) entries of the fitted vocabulary.
{token: column for token, column in itertools.islice(cv.vocabulary_.items(), 10)}

{'office': 194,
 'services': 267,
 'coordinator': 67,
 'master': 182,
 'police': 211,
 'officer': 195,
 'social': 273,
 'worker': 318,
 'iv': 159,
 'resident': 250}

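But what if there are some typos in the titles? What if someone wrote "afficer" instead of "officer", for instance? This can be handled by passing some parameters to the CountVectorizer so that it counts character n-grams instead of whole words: a misspelled word still shares most of its n-grams with the correct spelling.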

In [73]:
from sklearn.feature_extraction.text import CountVectorizer

# Character-level vectorizer: count every 2-, 3- and 4-character n-gram in the titles.
cv = CountVectorizer(ngram_range=(2, 4), analyzer='char')
cv.fit(X['employee_position_title'])

# Shape is (number of rows, number of distinct n-grams learned).
cv.transform(X['employee_position_title']).shape

(9228, 3702)

In [74]:
# Peek at the first 10 (n-gram -> column index) entries of the fitted vocabulary.
{ngram: column for ngram, column in itertools.islice(cv.vocabulary_.items(), 10)}

{'of': 2404,
 'ff': 1357,
 'fi': 1369,
 'ic': 1614,
 'ce': 673,
 'e ': 938,
 ' s': 230,
 'se': 3102,
 'er': 1168,
 'rv': 3026}

We get a sparse matrix in which two titles that share many n-grams have overlapping non-zero columns, so overlap is a sign of similarity. The **dirty_cat** library builds on this idea with a feature generation trick called the **SimilarityEncoder**.

In [75]:
# Number of distinct job titles (missing values are not counted).
X['employee_position_title'].nunique()

385

In [76]:
import dirty_cat

# Similarity encoding restricted to 200 prototype categories chosen with the
# 'most_frequent' strategy; the output has one column per prototype.
mod = dirty_cat.SimilarityEncoder(
    n_prototypes=200,
    categories='most_frequent',
)
# The encoder is given a 2-D, single-column frame.
mod.fit_transform(X.loc[:, ['employee_position_title']]).shape

(9228, 200)

##### Grid search

In [77]:
from sklearn import set_config

# Ask scikit-learn to use its "diagram" display mode when estimators are shown.
set_config(display="diagram")

from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

# Candidate encoders for the high-cardinality job-title column, keyed by the
# label that identifies each run in the results table.
method = {
    'sim_enc100': dirty_cat.SimilarityEncoder(n_prototypes=100, categories='most_frequent'),
    'sim_enc300': dirty_cat.SimilarityEncoder(n_prototypes=300, categories='most_frequent'),
    'sim_enc_all': dirty_cat.SimilarityEncoder(),
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
}

# One cross-validation summary frame per encoder is collected here.
results = []

for k, encoder in method.items():
    # Only the job-title branch changes between runs; the contract type is
    # always one-hot encoded and the hiring year is always standardised.
    pipe = Pipeline(steps=[
        ('split', FeatureUnion(transformer_list=[
            ('cat', Pipeline(steps=[
                ('grab', ColumnSelector(['employee_position_title'])),
                ('handle', encoder),
            ])),
            ('one-hot', Pipeline(steps=[
                ('grab', ColumnSelector('assignment_category')),
                ('handle', OneHotEncoder(handle_unknown='ignore')),
            ])),
            ('floats', Pipeline(steps=[
                ('grab', ColumnSelector('year_first_hired')),
                ('scale', StandardScaler()),
            ])),
        ])),
        ('mod', Ridge()),
    ])

    # The empty parameter grid means a single candidate is evaluated: the
    # search is used purely as a 10-fold cross-validation harness that
    # reports both R² and (negated) mean absolute error.
    grid = GridSearchCV(
        estimator=pipe,
        param_grid={},
        scoring=['r2', 'neg_mean_absolute_error'],
        refit='r2',
        cv=10,
        n_jobs=-1,
    )
    grid.fit(X, y)

    # Tag the summary with the encoder label so runs can be told apart later.
    res_df = pd.DataFrame(grid.cv_results_).assign(key=k)
    results.append(res_df)

In [79]:
# Stack the per-encoder cross-validation summaries and keep only the two mean
# test scores plus the encoder label.
df_results = pd.concat(results)[['mean_test_neg_mean_absolute_error','mean_test_r2','key']]

# Rank best-first by R². drop=True discards the old index (each run contributes
# a single row labelled 0) instead of adding it to the table as a meaningless
# `index` column.
df_results.sort_values('mean_test_r2',ascending=False).reset_index(drop=True)

Unnamed: 0,index,mean_test_neg_mean_absolute_error,mean_test_r2,key
0,0,-6319.457698,0.901871,sim_enc_all
1,0,-6576.122396,0.875093,sim_enc300
2,0,-6393.584175,0.86162,one-hot
3,0,-7780.017666,0.789158,sim_enc100
