In [28]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [29]:
data = pd.read_csv("../competition/Employee_Satisfaction/train.csv")
test = pd.read_csv("../competition/Employee_Satisfaction/test.csv")
data.columns

Index(['id', 'last_evaluation', 'number_project', 'average_monthly_hours',
       'time_spend_company', 'Work_accident', 'package',
       'promotion_last_5years', 'division', 'salary', 'satisfaction_level'],
      dtype='object')

In [30]:
y = data['satisfaction_level']
X = data.drop(['satisfaction_level'], axis=1)

In [31]:
def num_cat_splitor(X):
    s = (X.dtypes == 'object')
    object_cols = list(s[s].index)
    object_cols # ['package', 'division', 'salary']
    num_cols = list(set(X.columns) - set(object_cols))
    num_cols
    # ['Work_accident',
    #  'time_spend_company',
    #  'promotion_last_5years',
    #  'id',
    #  'average_monthly_hours',
    #  'last_evaluation',
    #  'number_project']
    return num_cols, object_cols
num_cols, object_cols = num_cat_splitor(X)
print(num_cols)
print(object_cols)
X[object_cols].values

['Work_accident', 'promotion_last_5years', 'last_evaluation', 'average_monthly_hours', 'number_project', 'id', 'time_spend_company']
['package', 'division', 'salary']


array([['a', 'accounting', 'medium'],
       ['b', 'marketing', 'low'],
       ['e', 'sales', 'medium'],
       ...,
       ['c', 'sales', 'medium'],
       ['b', 'IT', 'low'],
       ['c', 'support', 'low']], dtype=object)

In [32]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_cols)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

In [33]:
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(object_cols)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
X_prepared = full_pipeline.fit_transform(X)
X_prepared.shape

(11999, 25)

In [34]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg,X_prepared,y,
                               scoring='neg_mean_squared_error',cv=3)
forest_rmse_scores = np.sqrt(-forest_scores)
print(forest_rmse_scores)
print(forest_rmse_scores.mean())
print(forest_rmse_scores.std())

[0.17779438 0.1829623  0.18112756]
0.18062808168627564
0.0021391555263106897


In [35]:
param_grid = [
    {'n_estimators' : [3,10,30,50,80],'max_features':[2,4,6,8]},
    {'bootstrap':[False], 'n_estimators' : [3,10],'max_features':[2,3,4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')
grid_search.fit(X_prepared,y)


GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30, 50, 80]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             scoring='neg_mean_squared_error')

In [36]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 80}

In [37]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=80)

In [38]:
cv_result = grid_search.cv_results_
for mean_score, params in zip(cv_result['mean_test_score'], cv_result['params']):
    print(np.sqrt(-mean_score), params)

0.2129252723367584 {'max_features': 2, 'n_estimators': 3}
0.19276874697889504 {'max_features': 2, 'n_estimators': 10}
0.1865548358477794 {'max_features': 2, 'n_estimators': 30}
0.18556292560099835 {'max_features': 2, 'n_estimators': 50}
0.1843301132697697 {'max_features': 2, 'n_estimators': 80}
0.2083194772288037 {'max_features': 4, 'n_estimators': 3}
0.18816455255485012 {'max_features': 4, 'n_estimators': 10}
0.1829623147935501 {'max_features': 4, 'n_estimators': 30}
0.18205489877494252 {'max_features': 4, 'n_estimators': 50}
0.18089699699013326 {'max_features': 4, 'n_estimators': 80}
0.2071443061432644 {'max_features': 6, 'n_estimators': 3}
0.1869972051226723 {'max_features': 6, 'n_estimators': 10}
0.18085709016526758 {'max_features': 6, 'n_estimators': 30}
0.18019182644771176 {'max_features': 6, 'n_estimators': 50}
0.17924296098682077 {'max_features': 6, 'n_estimators': 80}
0.2059155832777527 {'max_features': 8, 'n_estimators': 3}
0.18624609411168255 {'max_features': 8, 'n_estimator

In [39]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.01281747, 0.00326629, 0.13011967, 0.20860081, 0.21515772,
       0.14794409, 0.09535877, 0.01160134, 0.02337838, 0.0114124 ,
       0.02205272, 0.01089199, 0.00758653, 0.00638578, 0.00570083,
       0.00533725, 0.00481233, 0.00626799, 0.00699377, 0.01256712,
       0.01023773, 0.01105252, 0.00736504, 0.01143477, 0.01165668])

In [40]:
k = 3
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]
    
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('forst_reg', RandomForestRegressor())
])

In [42]:
param_grid = [{
    'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
#     'feature_selection__k': list(range(5, len(feature_importances) + 1)),
    'forst_reg__n_estimators' : [200,250,300,310,330],
    'forst_reg__max_features':[2,4,6,8]
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=10,
                                scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search_prep.fit(X,y)

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.5min finished


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preparation',
                                        FeatureUnion(transformer_list=[('num_pipeline',
                                                                        Pipeline(steps=[('selector',
                                                                                         DataFrameSelector(attribute_names=['Work_accident',
                                                                                                                            'promotion_last_5years',
                                                                                                                            'last_evaluation',
                                                                                                                            'average_monthly_hours',
                                                                                                                            'number_project',
            

In [43]:
grid_search_prep.best_params_

{'forst_reg__max_features': 2,
 'forst_reg__n_estimators': 250,
 'preparation__num_pipeline__imputer__strategy': 'mean'}

In [44]:
final_model = grid_search_prep.best_estimator_
final_model

Pipeline(steps=[('preparation',
                 FeatureUnion(transformer_list=[('num_pipeline',
                                                 Pipeline(steps=[('selector',
                                                                  DataFrameSelector(attribute_names=['Work_accident',
                                                                                                     'promotion_last_5years',
                                                                                                     'last_evaluation',
                                                                                                     'average_monthly_hours',
                                                                                                     'number_project',
                                                                                                     'id',
                                                                                                     'time_sp

In [45]:
y_pred_test = final_model.predict(test)
result = pd.DataFrame()
result['id'] = test['id']
result['satisfaction_level'] = y_pred_test
result.to_csv('rf_ML_pipeline.csv',index=False)
test.head()

Unnamed: 0,id,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,package,promotion_last_5years,division,salary
0,4615,0.78,4,260.71,2,0,e,0,technical,medium
1,1680,0.96,7,275.38,5,0,b,0,marketing,low
2,8866,0.57,4,207.29,5,0,e,0,sales,low
3,7525,0.89,5,150.7,4,0,c,0,technical,medium
4,5389,0.93,3,141.23,2,0,e,0,support,medium
