In [93]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [122]:
data = pd.read_csv("../competition/Employee_Satisfaction/train.csv")
test = pd.read_csv("../competition/Employee_Satisfaction/test.csv")
data.columns

Index(['id', 'last_evaluation', 'number_project', 'average_monthly_hours',
       'time_spend_company', 'Work_accident', 'package',
       'promotion_last_5years', 'division', 'salary', 'satisfaction_level'],
      dtype='object')

In [95]:
y = data['satisfaction_level']
X = data.drop(['satisfaction_level'], axis=1)

In [96]:
def num_cat_splitor(X):
    s = (X.dtypes == 'object')
    object_cols = list(s[s].index)
    object_cols # ['package', 'division', 'salary']
    num_cols = list(set(X.columns) - set(object_cols))
    num_cols
    # ['Work_accident',
    #  'time_spend_company',
    #  'promotion_last_5years',
    #  'id',
    #  'average_monthly_hours',
    #  'last_evaluation',
    #  'number_project']
    return num_cols, object_cols
num_cols, object_cols = num_cat_splitor(X)
print(num_cols)
print(object_cols)
X[object_cols].values

['Work_accident', 'time_spend_company', 'promotion_last_5years', 'id', 'average_monthly_hours', 'last_evaluation', 'number_project']
['package', 'division', 'salary']


array([['a', 'accounting', 'medium'],
       ['b', 'marketing', 'low'],
       ['e', 'sales', 'medium'],
       ...,
       ['c', 'sales', 'medium'],
       ['b', 'IT', 'low'],
       ['c', 'support', 'low']], dtype=object)

In [97]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_cols)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

In [98]:
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(object_cols)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
X_prepared = full_pipeline.fit_transform(X)
X_prepared.shape

(11999, 25)

In [99]:
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg,X_prepared,y,
                               scoring='neg_mean_squared_error',cv=3)
forest_rmse_scores = np.sqrt(-forest_scores)
print(forest_rmse_scores)
print(forest_rmse_scores.mean())
print(forest_rmse_scores.std())

[0.17767301 0.18211663 0.18128649]
0.18035871005080292
0.0019290815349805967


In [100]:
param_grid = [
    {'n_estimators' : [3,10,30],'max_features':[2,4,6,8]},
    {'bootstrap':[False], 'n_estimators' : [3,10],'max_features':[2,3,4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')
grid_search.fit(X_prepared,y)


GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             scoring='neg_mean_squared_error')

In [101]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [102]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

In [103]:
cv_result = grid_search.cv_results_
for mean_score, params in zip(cv_result['mean_test_score'], cv_result['params']):
    print(np.sqrt(-mean_score), params)

0.21672947843603435 {'max_features': 2, 'n_estimators': 3}
0.19326941681206122 {'max_features': 2, 'n_estimators': 10}
0.18670411934127787 {'max_features': 2, 'n_estimators': 30}
0.20955467058788665 {'max_features': 4, 'n_estimators': 3}
0.1889899343929204 {'max_features': 4, 'n_estimators': 10}
0.1821711514173845 {'max_features': 4, 'n_estimators': 30}
0.20683335337374673 {'max_features': 6, 'n_estimators': 3}
0.187790093767181 {'max_features': 6, 'n_estimators': 10}
0.18100972971295928 {'max_features': 6, 'n_estimators': 30}
0.20719771329021583 {'max_features': 8, 'n_estimators': 3}
0.1859510805341047 {'max_features': 8, 'n_estimators': 10}
0.18035407573239673 {'max_features': 8, 'n_estimators': 30}
0.21094992134337529 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.1951233950354826 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.21048765152406534 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.19171859010850603 {'bootstrap': False, 'max_featur

In [104]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.01339825, 0.08916229, 0.003258  , 0.14694404, 0.20214578,
       0.13606608, 0.23051826, 0.01185551, 0.0175757 , 0.01242556,
       0.01581036, 0.01255439, 0.00814935, 0.00654919, 0.0054828 ,
       0.00515477, 0.00488507, 0.00628945, 0.00676332, 0.01268749,
       0.01037455, 0.01081768, 0.00786773, 0.01133198, 0.0119324 ])

In [105]:
k = 3
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]
    
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('forst_reg', RandomForestRegressor())
])

In [107]:
param_grid = [{
    'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(5, len(feature_importances) + 1)),
    'forst_reg__n_estimators' : [3,10,30]
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=3,
                                scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search_prep.fit(X,y)

Fitting 3 folds for each of 189 candidates, totalling 567 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   49.7s
[Parallel(n_jobs=-1)]: Done 567 out of 567 | elapsed:  1.6min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preparation',
                                        FeatureUnion(transformer_list=[('num_pipeline',
                                                                        Pipeline(steps=[('selector',
                                                                                         DataFrameSelector(attribute_names=['Work_accident',
                                                                                                                            'time_spend_company',
                                                                                                                            'promotion_last_5years',
                                                                                                                            'id',
                                                                                                                            'average_monthly_hours',
                      

In [108]:
grid_search_prep.best_params_

{'feature_selection__k': 25,
 'forst_reg__n_estimators': 30,
 'preparation__num_pipeline__imputer__strategy': 'mean'}

In [110]:
final_model = grid_search_prep.best_estimator_
final_model

Pipeline(steps=[('preparation',
                 FeatureUnion(transformer_list=[('num_pipeline',
                                                 Pipeline(steps=[('selector',
                                                                  DataFrameSelector(attribute_names=['Work_accident',
                                                                                                     'time_spend_company',
                                                                                                     'promotion_last_5years',
                                                                                                     'id',
                                                                                                     'average_monthly_hours',
                                                                                                     'last_evaluation',
                                                                                                     'num

In [125]:
y_pred_test = final_model.predict(test)
result = pd.DataFrame()
result['id'] = test['id']
result['satisfaction_level'] = y_pred_test
result.to_csv('rf_ML_pipeline.csv',index=False)
test.head()

Unnamed: 0,id,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,package,promotion_last_5years,division,salary
0,4615,0.78,4,260.71,2,0,e,0,technical,medium
1,1680,0.96,7,275.38,5,0,b,0,marketing,low
2,8866,0.57,4,207.29,5,0,e,0,sales,low
3,7525,0.89,5,150.7,4,0,c,0,technical,medium
4,5389,0.93,3,141.23,2,0,e,0,support,medium
