In [17]:
from HelperFunctions import Classifier, ExtraTreesFeatureImportanceTransformator, SVMTransformator, NNTransformator
from Pipeline import ClassificationPipeline

import pandas as pd
import numpy as np

import random
from xgboost import XGBClassifier
from collections import Counter
from statistics import mean

from sklearn import pipeline, svm, neural_network
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV,train_test_split,StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.utils import shuffle
from sklearn.decomposition import PCA,KernelPCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import TransformerMixin

from imblearn.over_sampling import SMOTE,ADASYN 
from imblearn.metrics import classification_report_imbalanced

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
from matplotlib import colors
#%matplotlib inline
%matplotlib notebook
# Use the ggplot look for every matplotlib figure in this notebook.
plt.style.use('ggplot')
# Let pandas display up to 250 columns / 250 rows before truncating output.
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', 250)

In [18]:
# Load the semicolon-separated export; the first row holds the column names.
features = pd.read_csv('subjectsAll31316-havovwoOnly.txt', sep=";", header=0)

# Where 'Answer.6' is missing, substitute 6 - 'Answer'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `features['Answer.6']`: that selection can be a
# temporary copy (pandas 2.x emits a FutureWarning for the pattern, and with
# Copy-on-Write the frame is never updated), which would leave the NaNs in place.
# NOTE: an earlier, disabled variant of this cell dropped the rows with a
# missing 'Answer.6' instead of imputing them.
features['Answer.6'] = features['Answer.6'].fillna(value=6 - features['Answer'])

features.shape

(144, 50)

In [19]:
# Binary target per row: 1 when the row's 'Answer.6' value is at or below
# floor(0.67 * highest 'Answer.6' among rows sharing the same 'Supervisor'),
# otherwise 0.
answer6_per_supervisor = features['Answer.6'].groupby(features['Supervisor'])
at_or_below_cutoff = answer6_per_supervisor.transform(
    lambda scores: scores <= np.floor(scores.max() * 0.67)
)
labels = at_or_below_cutoff.astype(int)
labels.shape

(144,)

In [20]:
# Turn every ',' found in string cells into '.', in place, so that numbers
# written with a decimal comma can be parsed as floats below. The regex
# replacement is applied to the whole frame, not only to the numeric columns.
features.replace(to_replace=',', value='.', regex=True, inplace=True)

# Columns that must end up as float64 after the comma fix.
obj_float_col = [
    'wtf',
    'Actiegerichtheid', 'Behoefte aan regels', 'Behoefte aan spanning',
    'Beinvloedingsvermogen', 'Detailgerichtheid', 'Doorzettingsvermogen',
    'Eigen verantwoordelijkheid', 'Eigenbelang', 'Focus', 'Impulsbeheersing',
    'Meevoelendheid', 'Oorzaak-analyse', 'Optimisme', 'Profileringsdrang',
    'Rationaliteit', 'Sociabiliteit', 'Stressgevoeligheid',
    'Verbeeldingskracht', 'Vertrouwen in anderen', 'Werkorganisatie',
    'Werktempo', 'Zelfbeheersing', 'Zelfwaardering', 'Zelfwerkzaamheid',
]
for column_name in obj_float_col:
    features[column_name] = features[column_name].astype('float64')

In [21]:
# Display the dtype of every column; the columns listed in obj_float_col
# should now show up as float64.
features.dtypes

ID                              int64
employeeID                      int64
Department                     object
wtf                           float64
DateOfBirth                     int64
EmployedSince                   int64
educationType                   int64
Supervisor                     object
Team                           object
Gender                          int64
Qualified                       int64
teacherScale                    int64
Actiegerichtheid              float64
Behoefte aan regels           float64
Behoefte aan spanning         float64
Beinvloedingsvermogen         float64
Detailgerichtheid             float64
Doorzettingsvermogen          float64
Eigen verantwoordelijkheid    float64
Eigenbelang                   float64
Focus                         float64
Impulsbeheersing              float64
Meevoelendheid                float64
Oorzaak-analyse               float64
Optimisme                     float64
Profileringsdrang             float64
Rationalitei

In [22]:
# Remove the identifier columns, the 'Question#.N' columns and 'Answer.6'
# (the column `labels` was derived from) so they are not used as predictors.
features = features.drop(
    ['ID', 'employeeID', 'Answer.6',
     'Question#.1', 'Question#.2', 'Question#.3',
     'Question#.4', 'Question#.5', 'Question#.6'],
    axis=1,
)

# Shuffle the rows. `labels` was computed on the unshuffled frame and the
# cross-validation loop pairs features and labels by position (.iloc), so the
# labels must follow the same permutation BEFORE either index is reset;
# otherwise every row ends up with another row's label.
features = features.sample(frac=1)
labels = labels.loc[features.index].reset_index(drop=True)
features = features.reset_index(drop=True)

# One-hot encode the remaining object (string) columns.
features = pd.get_dummies(features)
features.shape

(144, 68)

In [23]:
# --- Feature-construction settings -----------------------------------------
percentage = 33                 # not referenced by the cross-validation cell
featureSectionAmount = 23       # k for SelectKBest(f_classif)
featureImportanceAmount = 15    # passed to ExtraTreesFeatureImportanceTransformator
SVMAmount = 15                  # passed to SVMTransformator
NNAmount = 10                   # passed to NNTransformator

# --- Kernel PCA (polynomial kernel) ----------------------------------------
pcaAmount = 10                  # first positional argument of KernelPCA
pcaDegree = 3
pcaGamma = 0.33

# --- Ensemble settings -------------------------------------------------------
treeAmount = 512                # n_estimators of the ExtraTreesClassifier
adaboostAmount = 400            # n_estimators of the AdaBoostClassifier
adaboostRate = 0.001            # learning_rate of the AdaBoostClassifier

# --- Result accumulators (appended to once per evaluated fold) ---------------
output, outputProba = [], []
precision_1, precision_0 = [], []
recall_1, recall_0 = [], []
auc_roc, accuracy = [], []

# Running totals for the three probability buckets, in the order 80 / 65 / 50.
Proba = [0] * 3                 # number of test rows that fell in each bucket
Confidence = [0] * 3            # number of those rows that were predicted correctly
confidence_proba = [0] * 3      # Confidence / Proba per bucket

In [24]:
# Repeated stratified 2-fold cross-validation: 5 repetitions x 2 folds.
# The metric lists and the Proba / Confidence counters are defined in an earlier
# cell and are never reset here, so the summary appended to the results file
# after each repetition is cumulative over every fold evaluated so far.
for i in range(0,5):
    folds = StratifiedKFold(n_splits=2, shuffle=True)

    for train_index, test_index in folds.split(features,labels):

        train_features, test_features = features.iloc[train_index], features.iloc[test_index]
        train_labels, test_labels = labels.iloc[train_index], labels.iloc[test_index]

        # Resample with SMOTE on the training fold only (the test fold is left
        # untouched), then shuffle the resampled training rows.
        smote = SMOTE(ratio='auto',kind='svm')
        train_features, train_labels = smote.fit_sample(train_features, train_labels)
        train_features, train_labels = shuffle(train_features, train_labels, random_state=5)

        # Base learner: bagged extra-trees with out-of-bag scoring enabled.
        forest = ExtraTreesClassifier(n_estimators=treeAmount, criterion="gini", max_depth=None,
                                      min_samples_split=2, min_samples_leaf=1,
                                      min_weight_fraction_leaf=0, max_features="sqrt",
                                      max_leaf_nodes=None, bootstrap=True, oob_score=True,
                                      n_jobs=1, random_state=None, verbose=0, warm_start=False,
                                      class_weight="balanced_subsample")

        # AdaBoost over the forest.
        ada = AdaBoostClassifier(base_estimator=forest, n_estimators=adaboostAmount, learning_rate=adaboostRate)

        # Concatenate five feature views. The three project transformers receive
        # the (resampled, unscaled) training data in their constructors; what they
        # do with it is defined in HelperFunctions, not in this notebook.
        fu = FeatureUnion(n_jobs=-1, transformer_list=[
            ('pca', KernelPCA(pcaAmount, 'poly', degree=pcaDegree, gamma=pcaGamma)),
            ('fs', SelectKBest(f_classif, k = featureSectionAmount)),
            ('importance', ExtraTreesFeatureImportanceTransformator(train_features, train_labels, featureImportanceAmount, treeAmount)),
            ('SVM', SVMTransformator(train_features, train_labels, SVMAmount)),
            ('NN', NNTransformator(train_features, NNAmount)),
        ])

        pipe = Pipeline([('scaler', StandardScaler()),('fu',fu),('ada',ada)])
        pipe.fit(train_features, train_labels)

        outputfold = pipe.predict(test_features).tolist()
        outputProbafold = pipe.predict_proba(test_features)

        # Bucket every test row by how far its class-1 probability is from 0.5.
        # The buckets are mutually exclusive: the 65 bucket keeps the inclusive
        # [0.65, 0.80] / [0.20, 0.35] edges, so the 50 bucket uses strict bounds.
        # (With inclusive bounds on both, a probability of exactly 0.35 or 0.65
        # was counted in two buckets.)
        output_df = pd.DataFrame(outputProbafold,columns=['zero','one'])
        proba_one = output_df['one']
        output_df['Proba_80'] = ((proba_one>0.80) | (proba_one<0.20)).astype(int)
        output_df['Proba_65'] = ((proba_one.between(0.65,0.80)) | (proba_one.between(0.20,0.35))).astype(int)
        output_df['Proba_50'] = ((proba_one>0.35) & (proba_one<0.65)).astype(int)
        output_df['Test_pred'] = pd.Series(outputfold)
        output_df['Test_label'] = pd.Series(test_labels.values)

        # Within each bucket, flag the rows whose prediction matches the label.
        is_correct = output_df['Test_pred']==output_df['Test_label']
        output_df['Confidence_80'] = ((output_df['Proba_80']==1) & is_correct).astype(int)
        output_df['Confidence_65'] = ((output_df['Proba_65']==1) & is_correct).astype(int)
        output_df['Confidence_50'] = ((output_df['Proba_50']==1) & is_correct).astype(int)

        # Add this fold's bucket sizes and hit counts to the running totals.
        fold_bucket_sizes = [output_df['Proba_80'].sum(), output_df['Proba_65'].sum(), output_df['Proba_50'].sum()]
        fold_bucket_hits = [output_df['Confidence_80'].sum(), output_df['Confidence_65'].sum(), output_df['Confidence_50'].sum()]
        Proba = [total + new for total, new in zip(Proba, fold_bucket_sizes)]
        Confidence = [total + new for total, new in zip(Confidence, fold_bucket_hits)]
        # Per-bucket accuracy so far; a bucket that is still empty yields NaN
        # explicitly instead of dividing by zero.
        confidence_proba = [hits / size if size else float('nan') for hits, size in zip(Confidence, Proba)]

        precision_1.append(precision_score(test_labels, outputfold, pos_label = 1, average = 'binary'))
        precision_0.append(precision_score(test_labels, outputfold, pos_label = 0, average = 'binary'))

        # ROC AUC is a ranking metric: score it with the class-1 probabilities
        # (second predict_proba column), not with the thresholded 0/1 predictions.
        auc_roc.append(roc_auc_score(test_labels, outputProbafold[:, 1]))
        accuracy.append(accuracy_score(test_labels, outputfold))

        recall_1.append(recall_score(test_labels, outputfold, pos_label = 1, average = 'binary'))
        recall_0.append(recall_score(test_labels, outputfold, pos_label = 0, average = 'binary'))

    # Cumulative averages over every fold evaluated so far.
    ans = {'precision':[mean(precision_0),mean(precision_1)], 'auc':mean(auc_roc), 'recall':[mean(recall_0),mean(recall_1)],
           'acc':mean(accuracy), 'accuracy top 80-65-50': confidence_proba}

    # Append (never overwrite) one summary block per repetition.
    with open('solution_Jayesh2.txt', 'a') as results_file:
        for metric_name, metric_value in ans.items():
            results_file.write(metric_name + " " + str(metric_value) + "\n")
        results_file.write("\n\n")