In [3]:
import pandas as pd
import numpy as np
import pandas_profiling
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('client_training_cleaned_1.csv')
df = df.drop(['Unnamed: 0'], axis=1)

In [5]:
df2 = pd.get_dummies(df, prefix_sep='_', drop_first=True)

X = df2.drop(['job_performance'], axis=1)
y = df2['job_performance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
xg = xgb.XGBRegressor(n_estimators=100, max_depth=3)
xg.fit(X_train, y_train)

y_pred_xg = xg.predict(X_test)

# Evaluation metrics
mse_xg = mean_squared_error(y_test, y_pred_xg)
r2_xg = r2_score(y_test, y_pred_xg)
print('MSE:\n{}\n\nr2_score:\n{}'.format(mse_xg, r2_xg))

In [6]:
rf = RandomForestRegressor(random_state = 0)
rf.fit(X_train, y_train)
# df_imp_3 = pd.DataFrame(rf_3.feature_importances_, 
#             index = X_train.columns, columns = ['importance']).sort_values('importance', ascending=False)

# Evaluation
y_pred_rf = rf.predict(X_test)

# Evaluation metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print('MSE:\n{}\n\nr2_score:\n{}'.format(mse_rf, r2_rf))

MSE:
39278.79394084307

r2_score:
0.7863446144852169


In [10]:
categorical_clms = df.select_dtypes(include=['O']).columns
print("Number of categorical columns: {}".format(len(categorical_clms)))

numerical_clms = df.select_dtypes(include=['int64', 'float64']).columns
print("Number of numerical columns: {}".format(len(numerical_clms)))

df_cat = pd.DataFrame(df, columns=categorical_clms)
df_num = pd.DataFrame(df, columns=numerical_clms)

Number of categorical columns: 186
Number of numerical columns: 41


In [14]:
from fancyimpute import KNN

Using TensorFlow backend.


In [18]:
df_num.columns.values

array(['yrsqual', 'earnhr', 'v272', 'ictwork', 'planning', 'readhome',
       'v207', 'v252', 'v242', 'influence', 'yrsget', 'row', 'v210',
       'writwork', 'learnatwork', 'v136', 'v33', 'earnmth', 'nfehrsnjr',
       'icthome', 'readwork', 'nfehrsjr', 'taskdisc', 'v239', 'isco1c',
       'v105', 'writhome', 'readytolearn', 'v202', 'v104', 'v133',
       'nfehrs', 'v206', 'v224', 'earnhrppp', 'v187', 'leavedu', 'v22',
       'v154', 'v135', 'job_performance'], dtype=object)

In [19]:
df_org = pd.read_csv('client-trainingset-1561457457-252.csv')

In [22]:
df_num_knn = df_org[numerical_clms]

In [26]:
df_num_knn_filled = KNN(k=3).fit_transform(df_num_knn)

Imputing row 1/20000 with 16 missing, elapsed time: 278.923
Imputing row 101/20000 with 5 missing, elapsed time: 281.186
Imputing row 201/20000 with 15 missing, elapsed time: 281.670
Imputing row 301/20000 with 8 missing, elapsed time: 282.016
Imputing row 401/20000 with 10 missing, elapsed time: 282.303
Imputing row 501/20000 with 8 missing, elapsed time: 282.638
Imputing row 601/20000 with 22 missing, elapsed time: 283.039
Imputing row 701/20000 with 17 missing, elapsed time: 283.399
Imputing row 801/20000 with 18 missing, elapsed time: 284.114
Imputing row 901/20000 with 7 missing, elapsed time: 284.605
Imputing row 1001/20000 with 15 missing, elapsed time: 284.984
Imputing row 1101/20000 with 7 missing, elapsed time: 285.328
Imputing row 1201/20000 with 22 missing, elapsed time: 285.795
Imputing row 1301/20000 with 6 missing, elapsed time: 286.269
Imputing row 1401/20000 with 5 missing, elapsed time: 286.795
Imputing row 1501/20000 with 12 missing, elapsed time: 287.221
Imputing ro

Imputing row 13101/20000 with 16 missing, elapsed time: 344.059
Imputing row 13201/20000 with 14 missing, elapsed time: 344.398
Imputing row 13301/20000 with 9 missing, elapsed time: 344.791
Imputing row 13401/20000 with 11 missing, elapsed time: 345.167
Imputing row 13501/20000 with 21 missing, elapsed time: 345.483
Imputing row 13601/20000 with 10 missing, elapsed time: 346.010
Imputing row 13701/20000 with 4 missing, elapsed time: 346.369
Imputing row 13801/20000 with 21 missing, elapsed time: 346.822
Imputing row 13901/20000 with 11 missing, elapsed time: 347.314
Imputing row 14001/20000 with 16 missing, elapsed time: 347.740
Imputing row 14101/20000 with 14 missing, elapsed time: 348.763
Imputing row 14201/20000 with 17 missing, elapsed time: 349.526
Imputing row 14301/20000 with 25 missing, elapsed time: 349.901
Imputing row 14401/20000 with 21 missing, elapsed time: 350.309
Imputing row 14501/20000 with 15 missing, elapsed time: 350.724
Imputing row 14601/20000 with 20 missing, 

In [28]:
df_numeric = pd.DataFrame(df_num_knn_filled, columns=['yrsqual', 'earnhr', 'v272', 'ictwork', 'planning', 'readhome',
       'v207', 'v252', 'v242', 'influence', 'yrsget', 'row', 'v210',
       'writwork', 'learnatwork', 'v136', 'v33', 'earnmth', 'nfehrsnjr',
       'icthome', 'readwork', 'nfehrsjr', 'taskdisc', 'v239', 'isco1c',
       'v105', 'writhome', 'readytolearn', 'v202', 'v104', 'v133',
       'nfehrs', 'v206', 'v224', 'earnhrppp', 'v187', 'leavedu', 'v22',
       'v154', 'v135', 'job_performance'])

In [31]:
df_numeric.isnull().sum().sum()

0

In [32]:
df3 = pd.concat([df_cat, df_numeric], axis=1, sort=False)

In [33]:
df3.shape

(20000, 227)

In [34]:
df4 = pd.get_dummies(df3, prefix_sep='_', drop_first=True)

X_2 = df4.drop(['job_performance'], axis=1)
y_2 = df4['job_performance']

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.3, random_state=0)

rf_2 = RandomForestRegressor(random_state = 0)
rf_2.fit(X_train_2, y_train_2)
# df_imp_3 = pd.DataFrame(rf_3.feature_importances_, 
#             index = X_train.columns, columns = ['importance']).sort_values('importance', ascending=False)

# Evaluation
y_pred_rf_2 = rf_2.predict(X_test_2)

# Evaluation metrics
mse_rf_2 = mean_squared_error(y_test_2, y_pred_rf_2)
r2_rf_2 = r2_score(y_test, y_pred_rf)
print('MSE:\n{}\n\nr2_score:\n{}'.format(mse_rf_2, r2_rf_2))

MSE:
40589.36465662736

r2_score:
0.7863446144852169


In [1]:
from auto_ml import Predictor



In [2]:
from tpot import TPOTRegressor

In [None]:
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)

y_pred_tp = tpot.predict(X_test)

mse_tp = mean_squared_error(y_test, y_pred_tp)
r2_tp = r2_score(y_test, y_pred_tp)
print('MSE:\n{}\n\nr2_score:\n{}'.format(mse_tp, r2_tp))

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=300, style=ProgressStyle(descript…