In [48]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from sklearn.pipeline import Pipeline

from sklearn.decomposition import KernelPCA
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

### Read the data

In [2]:
train = pd.read_excel("../Data/Matrix_Apr10.xlsx", sheetname='train')
test = pd.read_excel("../Data/Matrix_Apr10.xlsx", sheetname='test')

### Define Output

In [4]:
#3 categories
train['growth'] = np.where(train['Y'] <= np.mean(train.Y) - np.std(train.Y), "NEGATIVE", np.where(train['Y'] >= np.mean(train.Y) + np.std(train.Y), "HIGH_GROWTH", "NORMAL_GROWTH"))
test['growth'] = np.where(test['Y'] <= np.mean(train.Y) - np.std(train.Y), "NEGATIVE", np.where(test['Y'] >= np.mean(train.Y) + np.std(train.Y), "HIGH_GROWTH", "NORMAL_GROWTH"))

#2 categories evenly distributed
# train['growth'] = np.where(train['Y'] <= np.mean(train.Y), "BELOW AVERAGE", "ABOVE AVERAGE")
# test['growth'] = np.where(test['Y'] <= np.mean(train.Y), "BELOW AVERAGE", "ABOVE AVERAGE")

#2 categories high/not high growth
# train['growth'] = np.where(train['Y'] <= np.mean(train.Y) + np.std(train.Y), "NOT HIGH GROWTH", "HIGH GROWTH")
# test['growth'] = np.where(test['Y'] <= np.mean(train.Y) + np.std(train.Y), "NOT HIGH GROWTH", "HIGH GROWTH")


### Split train and test

In [105]:
x_train = train.drop(['growth', 'Y', 'postal_code', 'Years'], axis =1)
y_train = train.growth
x_test = test.drop(['growth', 'Y', 'postal_code', 'Years'], axis =1)
y_test = test.growth

### Define Classifiers

In [15]:
classifiers = {}
classifier_parameters = {}

##### Random Forest classifier
classifiers['Random Forest'] = Pipeline([('clf', RandomForestClassifier())])
classifier_parameters['Random Forest'] = {'clf__max_depth':(1, 3, 9, 12, 15), 'clf__class_weight': (None, "balanced", "balanced_subsample")}

##### AdaBoost classifier
classifiers['AdaBoost'] = Pipeline([('clf', AdaBoostClassifier())])
classifier_parameters['AdaBoost'] = {'clf__n_estimators':(30, 40, 50, 60, 70)}

##### SVM
# classifiers['SVM'] = Pipeline([('clf', SVC())])
# classifier_parameters['SVM'] = {'clf__C':(0.01, 0.1, 1, 10), 'clf__kernel': ('poly', 'rbf', 'sigmoid'), 'clf__degree': (2,3), 'clf__gamma': (0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1)}

#### Neural Networks
classifiers['NN'] = Pipeline([('clf', MLPClassifier(activation='logistic'))])
classifier_parameters['NN'] = {'clf__hidden_layer_sizes':((10,15,10), (15))}

#### kNN
classifiers['kNN'] = Pipeline([('clf', neighbors.KNeighborsClassifier())])
classifier_parameters['kNN'] = {'clf__n_neighbors':(3,5,7), 'clf__weights': ('uniform', 'distance')}

### Train Algorithm - Cross Validation

In [16]:
# Create a label encoder to transform output labels.
le = LabelEncoder() 

# Split features and class into two dataframes.
X_training = x_train.values
y_training = le.fit_transform(y_train.values)

# Initialize scores dictionary
scores = pd.DataFrame(columns=['fold', 'algorithm', 'parameters', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score'])

# 10 fold CV
kf = KFold(n_splits=10, shuffle=True)

# Outer Cross Validation
fold = 0
for train_index, test_index in kf.split(X_training):
    X_train, X_test = X_training[train_index], X_training[test_index]
    Y_train, Y_test = y_training[train_index], y_training[test_index]
    
    fold = fold + 1

    # Inner CV
    for name, clf in classifiers.items():
        print('Fold ' + str(fold) + ': ' + name)
        if name in classifier_parameters:
            gs = GridSearchCV(estimator=clf, param_grid=classifier_parameters[name])
            gs.fit(X_train, Y_train)
            y_pred = gs.predict(X_test)
            best_params = str(gs.best_params_)
        else:
            clf.fit(X_train, Y_train)
            y_pred = clf.predict(Y_test)
            best_params = 'default'
        
        # collect the scores for printing out later
        scores = scores.append(pd.DataFrame(data={'fold':[fold],
                                                  'algorithm':[name], 
                                                  'parameters':[best_params], 
                                                  'accuracy':[accuracy_score(Y_test, y_pred)], 
                                                  'precision':[precision_score(Y_test, y_pred, average='weighted')],
                                                  'recall':[recall_score(Y_test, y_pred, average='weighted')],
                                                  'kappa':[cohen_kappa_score(Y_test, y_pred)],
                                                  'f1_score':[f1_score(Y_test, y_pred, average='weighted')]}), 
                               ignore_index=True)
        

Fold 1: kNN
Fold 1: Random Forest
Fold 1: NN
Fold 1: AdaBoost
Fold 2: kNN
Fold 2: Random Forest
Fold 2: NN
Fold 2: AdaBoost
Fold 3: kNN
Fold 3: Random Forest
Fold 3: NN
Fold 3: AdaBoost
Fold 4: kNN
Fold 4: Random Forest
Fold 4: NN
Fold 4: AdaBoost
Fold 5: kNN
Fold 5: Random Forest
Fold 5: NN
Fold 5: AdaBoost
Fold 6: kNN
Fold 6: Random Forest
Fold 6: NN
Fold 6: AdaBoost
Fold 7: kNN
Fold 7: Random Forest
Fold 7: NN
Fold 7: AdaBoost
Fold 8: kNN
Fold 8: Random Forest
Fold 8: NN
Fold 8: AdaBoost
Fold 9: kNN
Fold 9: Random Forest
Fold 9: NN
Fold 9: AdaBoost
Fold 10: kNN
Fold 10: Random Forest
Fold 10: NN
Fold 10: AdaBoost


#### 3 labels

In [7]:
scores[['algorithm', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score']].groupby(['algorithm']).median()

Unnamed: 0_level_0,accuracy,precision,recall,kappa,f1_score
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.673913,0.621187,0.673913,0.124495,0.64419
NN,0.728261,0.560962,0.728261,0.0,0.623161
Random Forest,0.73913,0.582807,0.73913,0.0,0.629175
kNN,0.73913,0.67344,0.73913,0.184442,0.683984


##### 2 labels evenly distributed

In [12]:
scores[['algorithm', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score']].groupby(['algorithm']).median()

Unnamed: 0_level_0,accuracy,precision,recall,kappa,f1_score
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.630435,0.633913,0.630435,0.259467,0.630785
NN,0.663043,0.682876,0.663043,0.299433,0.650687
Random Forest,0.645236,0.663826,0.645236,0.289776,0.647545
kNN,0.641304,0.643163,0.641304,0.277887,0.639561


##### 2 categories high/not high growth

In [17]:
scores[['algorithm', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score']].groupby(['algorithm']).median()

Unnamed: 0_level_0,accuracy,precision,recall,kappa,f1_score
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.847826,0.847597,0.847826,0.27024,0.851709
NN,0.869565,0.756144,0.869565,0.0,0.808898
Random Forest,0.880435,0.855717,0.880435,0.253338,0.847872
kNN,0.836957,0.858928,0.836957,0.256016,0.841023


In [19]:
scores[['algorithm', 'f1_score', 'parameters']][scores['algorithm']=='AdaBoost']

Unnamed: 0,algorithm,f1_score,parameters
3,AdaBoost,0.772313,{'clf__n_estimators': 30}
7,AdaBoost,0.767589,{'clf__n_estimators': 30}
11,AdaBoost,0.925092,{'clf__n_estimators': 40}
15,AdaBoost,0.898429,{'clf__n_estimators': 30}
19,AdaBoost,0.810817,{'clf__n_estimators': 30}
23,AdaBoost,0.868248,{'clf__n_estimators': 70}
27,AdaBoost,0.870438,{'clf__n_estimators': 30}
31,AdaBoost,0.840289,{'clf__n_estimators': 30}
35,AdaBoost,0.863129,{'clf__n_estimators': 50}
39,AdaBoost,0.795546,{'clf__n_estimators': 30}


In [106]:
le = LabelEncoder() 

# Split features and class into two dataframes.
X_training = x_train.values
y_training = le.fit_transform(y_train.values)

clf = AdaBoostClassifier(n_estimators=30)
clf.fit(X_training, y_training)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=30, random_state=None)

In [107]:
y_pred = clf.predict(x_test)
y_test_le = le.fit_transform(y_test.values)

In [108]:
print(confusion_matrix(y_test_le, y_pred))
print(classification_report(y_test_le, y_pred))

[[42  0 23]
 [53  0 54]
 [ 0  0  0]]
             precision    recall  f1-score   support

          0       0.44      0.65      0.53        65
          1       0.00      0.00      0.00       107
          2       0.00      0.00      0.00         0

avg / total       0.17      0.24      0.20       172



### Using Kernel PCA

In [6]:
classifiers = {}
classifier_parameters = {}

kernels = ('poly','rbf','cosine')
kernel_gamma = (0.01, 0.25, 0.5, 0.75)
kernel_ncomponents = (2,5,10)
kernel_degree = (2,3)

##### Random Forest classifier
classifiers['Random Forest'] = Pipeline([('kpca',KernelPCA()),('clf', RandomForestClassifier())])
classifier_parameters['Random Forest'] = {'clf__max_depth':(1, 3, 9, 12, 15), 'clf__class_weight': (None, "balanced", "balanced_subsample"), 'kpca__kernel' : kernels, 'kpca__n_components' :kernel_ncomponents, 'kpca__degree' : kernel_degree, 'kpca__gamma': kernel_gamma}

##### AdaBoost classifier
classifiers['AdaBoost'] = Pipeline([('kpca',KernelPCA()), ('clf', AdaBoostClassifier())])
classifier_parameters['AdaBoost'] = {'clf__n_estimators':(30, 40, 50, 60, 70), 'kpca__kernel' : kernels, 'kpca__n_components' : kernel_ncomponents, 'kpca__degree' : kernel_degree, 'kpca__gamma': kernel_gamma}

##### SVM
# classifiers['SVM'] = Pipeline([('clf', SVC())])
# classifier_parameters['SVM'] = {'clf__C':(0.01, 0.1, 1, 10), 'clf__kernel': ('poly', 'rbf', 'sigmoid'), 'clf__degree': (2,3), 'clf__gamma': (0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1)}

#### Neural Networks
classifiers['NN'] = Pipeline([('kpca',KernelPCA()), ('clf', MLPClassifier(activation='logistic'))])
classifier_parameters['NN'] = {'clf__hidden_layer_sizes':((10,15,10), (15)),'kpca__kernel':kernels, 'kpca__n_components':kernel_ncomponents, 'kpca__degree':kernel_degree, 'kpca__gamma': kernel_gamma}

#### kNN
classifiers['kNN'] = Pipeline([('kpca',KernelPCA()), ('clf', neighbors.KNeighborsClassifier())])
classifier_parameters['kNN'] = {'clf__n_neighbors':(3,5,7), 'clf__weights': ('uniform', 'distance'), 'kpca__kernel':kernels, 'kpca__n_components': kernel_ncomponents, 'kpca__degree': kernel_degree, 'kpca__gamma': kernel_gamma}

In [7]:
# Create a label encoder to transform output labels.
le = LabelEncoder() 

# Split features and class into two dataframes.
X_training = x_train.values
y_training = le.fit_transform(y_train.values)

# Initialize scores dictionary
scores = pd.DataFrame(columns=['fold', 'algorithm', 'parameters', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score'])

# 10 fold CV
kf = KFold(n_splits=10, shuffle=True)

# Outer Cross Validation
fold = 0
for train_index, test_index in kf.split(X_training):
    X_train, X_test = X_training[train_index], X_training[test_index]
    Y_train, Y_test = y_training[train_index], y_training[test_index]
    
    fold = fold + 1

    # Inner CV
    for name, clf in classifiers.items():
        print('Fold ' + str(fold) + ': ' + name)
        if name in classifier_parameters:
            gs = GridSearchCV(estimator=clf, param_grid=classifier_parameters[name])
            gs.fit(X_train, Y_train)
            y_pred = gs.predict(X_test)
            best_params = str(gs.best_params_)
        else:
            clf.fit(X_train, Y_train)
            y_pred = clf.predict(Y_test)
            best_params = 'default'
        
        # collect the scores for printing out later
        scores = scores.append(pd.DataFrame(data={'fold':[fold],
                                                  'algorithm':[name], 
                                                  'parameters':[best_params], 
                                                  'accuracy':[accuracy_score(Y_test, y_pred)], 
                                                  'precision':[precision_score(Y_test, y_pred, average='weighted')],
                                                  'recall':[recall_score(Y_test, y_pred, average='weighted')],
                                                  'kappa':[cohen_kappa_score(Y_test, y_pred)],
                                                  'f1_score':[f1_score(Y_test, y_pred, average='weighted')]}), 
                               ignore_index=True)
        

Fold 1: kNN


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 1: Random Forest
Fold 1: NN




Fold 1: AdaBoost
Fold 2: kNN
Fold 2: Random Forest
Fold 2: NN
Fold 2: AdaBoost
Fold 3: kNN
Fold 3: Random Forest
Fold 3: NN
Fold 3: AdaBoost
Fold 4: kNN
Fold 4: Random Forest
Fold 4: NN
Fold 4: AdaBoost
Fold 5: kNN
Fold 5: Random Forest
Fold 5: NN
Fold 5: AdaBoost
Fold 6: kNN
Fold 6: Random Forest
Fold 6: NN
Fold 6: AdaBoost
Fold 7: kNN
Fold 7: Random Forest
Fold 7: NN
Fold 7: AdaBoost
Fold 8: kNN
Fold 8: Random Forest
Fold 8: NN
Fold 8: AdaBoost
Fold 9: kNN
Fold 9: Random Forest
Fold 9: NN
Fold 9: AdaBoost
Fold 10: kNN
Fold 10: Random Forest
Fold 10: NN
Fold 10: AdaBoost


In [8]:
scores[['algorithm', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score']].groupby(['algorithm']).median()

Unnamed: 0_level_0,accuracy,precision,recall,kappa,f1_score
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.728261,0.560597,0.728261,0.0,0.628261
NN,0.717391,0.553825,0.717391,0.002882,0.60524
Random Forest,0.73913,0.62182,0.73913,0.098446,0.646205
kNN,0.695652,0.565927,0.695652,0.0,0.620902


In [13]:
scores[['algorithm', 'f1_score', 'parameters']][scores['algorithm']=='Random Forest'].parameters.values

array([ "{'kpca__kernel': 'poly', 'kpca__gamma': 0.25, 'clf__max_depth': 1, 'kpca__degree': 3, 'kpca__n_components': 2, 'clf__class_weight': None}",
       "{'kpca__kernel': 'poly', 'kpca__gamma': 0.25, 'clf__max_depth': 3, 'kpca__degree': 3, 'kpca__n_components': 5, 'clf__class_weight': None}",
       "{'kpca__kernel': 'poly', 'kpca__gamma': 0.75, 'clf__max_depth': 1, 'kpca__degree': 2, 'kpca__n_components': 10, 'clf__class_weight': None}",
       "{'kpca__kernel': 'poly', 'kpca__gamma': 0.01, 'clf__max_depth': 3, 'kpca__degree': 3, 'kpca__n_components': 5, 'clf__class_weight': None}",
       "{'kpca__kernel': 'poly', 'kpca__gamma': 0.01, 'clf__max_depth': 1, 'kpca__degree': 3, 'kpca__n_components': 2, 'clf__class_weight': None}",
       "{'kpca__kernel': 'poly', 'kpca__gamma': 0.01, 'clf__max_depth': 3, 'kpca__degree': 2, 'kpca__n_components': 2, 'clf__class_weight': None}",
       "{'kpca__kernel': 'cosine', 'kpca__gamma': 0.01, 'clf__max_depth': 3, 'kpca__degree': 3, 'kpca__n_compo

In [24]:
X_training_kpca = KernelPCA(gamma = 0.01, degree=3, n_components=10).fit_transform(X_training)
clf = RandomForestClassifier(max_depth=3, class_weight=None)
clf.fit(X_training_kpca, y_training)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [25]:
X_test_kpca = KernelPCA(gamma = 0.01, degree=3, n_components=10).fit_transform(x_test)
y_pred = clf.predict(X_test_kpca)
y_test_le = le.fit_transform(y_test.values)
print(confusion_matrix(y_test_le, y_pred))
print(classification_report(y_test_le, y_pred))

[[  9   0  56]
 [  4   0 103]
 [  0   0   0]]
             precision    recall  f1-score   support

          0       0.69      0.14      0.23        65
          1       0.00      0.00      0.00       107
          2       0.00      0.00      0.00         0

avg / total       0.26      0.05      0.09       172



### Using Oversampling

In [27]:
ros = RandomOverSampler()
x_train_os, y_train_os = ros.fit_sample(x_train, y_train)

In [28]:
classifiers = {}
classifier_parameters = {}

##### Random Forest classifier
classifiers['Random Forest'] = Pipeline([('clf', RandomForestClassifier())])
classifier_parameters['Random Forest'] = {'clf__max_depth':(1, 3, 9, 12, 15), 'clf__class_weight': (None, "balanced", "balanced_subsample")}

##### AdaBoost classifier
classifiers['AdaBoost'] = Pipeline([('clf', AdaBoostClassifier())])
classifier_parameters['AdaBoost'] = {'clf__n_estimators':(30, 40, 50, 60, 70)}

##### SVM
# classifiers['SVM'] = Pipeline([('clf', SVC())])
# classifier_parameters['SVM'] = {'clf__C':(0.01, 0.1, 1, 10), 'clf__kernel': ('poly', 'rbf', 'sigmoid'), 'clf__degree': (2,3), 'clf__gamma': (0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1)}

#### Neural Networks
classifiers['NN'] = Pipeline([('clf', MLPClassifier(activation='logistic'))])
classifier_parameters['NN'] = {'clf__hidden_layer_sizes':((10,15,10), (15))}

#### kNN
classifiers['kNN'] = Pipeline([('clf', neighbors.KNeighborsClassifier())])
classifier_parameters['kNN'] = {'clf__n_neighbors':(3,5,7), 'clf__weights': ('uniform', 'distance')}

In [30]:
# Create a label encoder to transform output labels.
le = LabelEncoder() 

# Split features and class into two dataframes.
X_training = x_train_os
y_training = le.fit_transform(y_train_os)

# Initialize scores dictionary
scores = pd.DataFrame(columns=['fold', 'algorithm', 'parameters', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score'])

# 10 fold CV
kf = KFold(n_splits=10, shuffle=True)

# Outer Cross Validation
fold = 0
for train_index, test_index in kf.split(X_training):
    X_train, X_test = X_training[train_index], X_training[test_index]
    Y_train, Y_test = y_training[train_index], y_training[test_index]
    
    fold = fold + 1

    # Inner CV
    for name, clf in classifiers.items():
        print('Fold ' + str(fold) + ': ' + name)
        if name in classifier_parameters:
            gs = GridSearchCV(estimator=clf, param_grid=classifier_parameters[name])
            gs.fit(X_train, Y_train)
            y_pred = gs.predict(X_test)
            best_params = str(gs.best_params_)
        else:
            clf.fit(X_train, Y_train)
            y_pred = clf.predict(Y_test)
            best_params = 'default'
        
        # collect the scores for printing out later
        scores = scores.append(pd.DataFrame(data={'fold':[fold],
                                                  'algorithm':[name], 
                                                  'parameters':[best_params], 
                                                  'accuracy':[accuracy_score(Y_test, y_pred)], 
                                                  'precision':[precision_score(Y_test, y_pred, average='weighted')],
                                                  'recall':[recall_score(Y_test, y_pred, average='weighted')],
                                                  'kappa':[cohen_kappa_score(Y_test, y_pred)],
                                                  'f1_score':[f1_score(Y_test, y_pred, average='weighted')]}), 
                               ignore_index=True)
        

Fold 1: kNN
Fold 1: Random Forest
Fold 1: NN
Fold 1: AdaBoost
Fold 2: kNN
Fold 2: Random Forest
Fold 2: NN
Fold 2: AdaBoost
Fold 3: kNN
Fold 3: Random Forest
Fold 3: NN
Fold 3: AdaBoost
Fold 4: kNN
Fold 4: Random Forest
Fold 4: NN
Fold 4: AdaBoost
Fold 5: kNN
Fold 5: Random Forest
Fold 5: NN
Fold 5: AdaBoost
Fold 6: kNN
Fold 6: Random Forest
Fold 6: NN
Fold 6: AdaBoost
Fold 7: kNN
Fold 7: Random Forest
Fold 7: NN
Fold 7: AdaBoost
Fold 8: kNN
Fold 8: Random Forest
Fold 8: NN
Fold 8: AdaBoost
Fold 9: kNN
Fold 9: Random Forest
Fold 9: NN
Fold 9: AdaBoost
Fold 10: kNN
Fold 10: Random Forest
Fold 10: NN
Fold 10: AdaBoost


In [31]:
scores[['algorithm', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score']].groupby(['algorithm']).median()

Unnamed: 0_level_0,accuracy,precision,recall,kappa,f1_score
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.68,0.739723,0.68,0.521611,0.685917
NN,0.61,0.616856,0.61,0.410964,0.605398
Random Forest,0.930396,0.937475,0.930396,0.895137,0.929023
kNN,0.865,0.888406,0.865,0.789608,0.84907


In [36]:
scores[['algorithm', 'f1_score', 'parameters']][scores['algorithm']=='Random Forest'].parameters.values

array(["{'clf__max_depth': 12, 'clf__class_weight': None}",
       "{'clf__max_depth': 12, 'clf__class_weight': None}",
       "{'clf__max_depth': 15, 'clf__class_weight': None}",
       "{'clf__max_depth': 12, 'clf__class_weight': None}",
       "{'clf__max_depth': 12, 'clf__class_weight': 'balanced_subsample'}",
       "{'clf__max_depth': 15, 'clf__class_weight': None}",
       "{'clf__max_depth': 9, 'clf__class_weight': 'balanced_subsample'}",
       "{'clf__max_depth': 15, 'clf__class_weight': None}",
       "{'clf__max_depth': 15, 'clf__class_weight': None}",
       "{'clf__max_depth': 12, 'clf__class_weight': 'balanced'}"], dtype=object)

In [82]:
X_training = x_train_os
le = LabelEncoder() 
y_training = le.fit_transform(y_train_os)
clf = RandomForestClassifier(max_depth=12, class_weight=None)
clf.fit(X_training, y_training)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [96]:
y_pred = clf.predict(x_test)
y_test_le = le.fit_transform(y_test.values)
print(confusion_matrix(y_test_le, y_pred))
print(classification_report(y_test_le, y_pred))

[[15  0 50]
 [19  0 88]
 [ 0  0  0]]
             precision    recall  f1-score   support

          0       0.44      0.23      0.30        65
          1       0.00      0.00      0.00       107
          2       0.00      0.00      0.00         0

avg / total       0.17      0.09      0.11       172



### Split training and testing randomly

In [100]:
data = pd.read_excel("../Data/Matrix_Apr10.xlsx", sheetname='total')

#3 classes
#data['growth'] = np.where(data['Y'] <= np.mean(data.Y) - np.std(data.Y), "NEGATIVE", np.where(data['Y'] >= np.mean(data.Y) + np.std(data.Y), "HIGH_GROWTH", "NORMAL_GROWTH"))

#2 classes high/nothigh
data['growth'] = np.where(data['Y'] <= np.mean(data.Y) + np.std(data.Y), "NOT HIGH GROWTH", "HIGH GROWTH")

X = data.drop(['growth', 'Y', 'postal_code', 'Years'], axis =1)
Y = data.growth

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3)

In [60]:
classifiers = {}
classifier_parameters = {}

##### Random Forest classifier
classifiers['Random Forest'] = Pipeline([('clf', RandomForestClassifier())])
classifier_parameters['Random Forest'] = {'clf__max_depth':(1, 3, 9, 12, 15), 'clf__class_weight': (None, "balanced", "balanced_subsample")}

##### AdaBoost classifier
classifiers['AdaBoost'] = Pipeline([('clf', AdaBoostClassifier())])
classifier_parameters['AdaBoost'] = {'clf__n_estimators':(30, 40, 50, 60, 70)}

##### SVM
# classifiers['SVM'] = Pipeline([('clf', SVC())])
# classifier_parameters['SVM'] = {'clf__C':(0.01, 0.1, 1, 10), 'clf__kernel': ('poly', 'rbf', 'sigmoid'), 'clf__degree': (2,3), 'clf__gamma': (0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1)}

#### Neural Networks
classifiers['NN'] = Pipeline([('clf', MLPClassifier(activation='logistic'))])
classifier_parameters['NN'] = {'clf__hidden_layer_sizes':((10,15,10), (15))}

#### kNN
classifiers['kNN'] = Pipeline([('clf', neighbors.KNeighborsClassifier())])
classifier_parameters['kNN'] = {'clf__n_neighbors':(3,5,7), 'clf__weights': ('uniform', 'distance')}

In [61]:
# Create a label encoder to transform output labels.
le = LabelEncoder() 

# Split features and class into two dataframes.
X_training = x_train.values
y_training = le.fit_transform(y_train.values)

# Initialize scores dictionary
scores = pd.DataFrame(columns=['fold', 'algorithm', 'parameters', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score'])

# 10 fold CV
kf = KFold(n_splits=10, shuffle=True)

# Outer Cross Validation
fold = 0
for train_index, test_index in kf.split(X_training):
    X_train, X_test = X_training[train_index], X_training[test_index]
    Y_train, Y_test = y_training[train_index], y_training[test_index]
    
    fold = fold + 1

    # Inner CV
    for name, clf in classifiers.items():
        print('Fold ' + str(fold) + ': ' + name)
        if name in classifier_parameters:
            gs = GridSearchCV(estimator=clf, param_grid=classifier_parameters[name])
            gs.fit(X_train, Y_train)
            y_pred = gs.predict(X_test)
            best_params = str(gs.best_params_)
        else:
            clf.fit(X_train, Y_train)
            y_pred = clf.predict(Y_test)
            best_params = 'default'
        
        # collect the scores for printing out later
        scores = scores.append(pd.DataFrame(data={'fold':[fold],
                                                  'algorithm':[name], 
                                                  'parameters':[best_params], 
                                                  'accuracy':[accuracy_score(Y_test, y_pred)], 
                                                  'precision':[precision_score(Y_test, y_pred, average='weighted')],
                                                  'recall':[recall_score(Y_test, y_pred, average='weighted')],
                                                  'kappa':[cohen_kappa_score(Y_test, y_pred)],
                                                  'f1_score':[f1_score(Y_test, y_pred, average='weighted')]}), 
                               ignore_index=True)
        

Fold 1: kNN
Fold 1: Random Forest
Fold 1: NN
Fold 1: AdaBoost
Fold 2: kNN
Fold 2: Random Forest
Fold 2: NN
Fold 2: AdaBoost
Fold 3: kNN
Fold 3: Random Forest
Fold 3: NN
Fold 3: AdaBoost
Fold 4: kNN
Fold 4: Random Forest
Fold 4: NN
Fold 4: AdaBoost
Fold 5: kNN
Fold 5: Random Forest
Fold 5: NN
Fold 5: AdaBoost
Fold 6: kNN
Fold 6: Random Forest
Fold 6: NN
Fold 6: AdaBoost
Fold 7: kNN
Fold 7: Random Forest
Fold 7: NN
Fold 7: AdaBoost
Fold 8: kNN
Fold 8: Random Forest
Fold 8: NN
Fold 8: AdaBoost
Fold 9: kNN
Fold 9: Random Forest
Fold 9: NN
Fold 9: AdaBoost
Fold 10: kNN
Fold 10: Random Forest
Fold 10: NN
Fold 10: AdaBoost


In [62]:
scores[['algorithm', 'accuracy', 'precision', 'recall', 'kappa', 'f1_score']].groupby(['algorithm']).median()

Unnamed: 0_level_0,accuracy,precision,recall,kappa,f1_score
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.876263,0.852288,0.876263,0.194062,0.862071
NN,0.886364,0.78564,0.886364,0.0,0.832968
Random Forest,0.897727,0.878016,0.897727,0.314279,0.881491
kNN,0.886364,0.833946,0.886364,0.0,0.840632


In [64]:
scores[['algorithm', 'f1_score', 'parameters']][scores['algorithm']=='Random Forest'].parameters.values

array(["{'clf__max_depth': 12, 'clf__class_weight': None}",
       "{'clf__max_depth': 15, 'clf__class_weight': 'balanced_subsample'}",
       "{'clf__max_depth': 15, 'clf__class_weight': 'balanced_subsample'}",
       "{'clf__max_depth': 12, 'clf__class_weight': 'balanced'}",
       "{'clf__max_depth': 9, 'clf__class_weight': 'balanced'}",
       "{'clf__max_depth': 12, 'clf__class_weight': 'balanced_subsample'}",
       "{'clf__max_depth': 12, 'clf__class_weight': None}",
       "{'clf__max_depth': 12, 'clf__class_weight': None}",
       "{'clf__max_depth': 9, 'clf__class_weight': 'balanced'}",
       "{'clf__max_depth': 12, 'clf__class_weight': 'balanced_subsample'}"], dtype=object)

In [101]:
X_training = x_train.values
y_training = le.fit_transform(y_train.values)

clf = RandomForestClassifier(max_depth=12, class_weight='balanced_subsample')
clf.fit(X_training, y_training)
y_pred = clf.predict(x_test)
y_test_le = le.fit_transform(y_test.values)
print(confusion_matrix(y_test_le, y_pred))
print(classification_report(y_test_le, y_pred))

[[  5  25]
 [  2 158]]
             precision    recall  f1-score   support

          0       0.71      0.17      0.27        30
          1       0.86      0.99      0.92       160

avg / total       0.84      0.86      0.82       190



In [109]:
le.classes_

array(['HIGH_GROWTH', 'NORMAL_GROWTH'], dtype=object)