# Import libraries
**pandas** - data manipulation and analysis (i.e. DataFrame, integrated indexing).<br>
**numpy** - multi-dimensional array manipulation.<br>
**sklearn** - machine learning library with various classification, regression and clustering algorithms <br>
**plotly** - graphing library that supports interactive graphs <br>
**logger** - custom logger wrapper built on top of Python logger for event logging <br>
**itertools** - fast & memory-efficient looping tool

In [1]:
import pandas as pd
import numpy as np
import hashlib
import os 
from utils import logger
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn import datasets

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from utils import logger
import sys

import plotly.plotly as py
import plotly.graph_objs as go
import itertools as it

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_curve, auc

ModuleNotFoundError: No module named 'utils'

# Feature Selection & Extraction Helpers
**Purpose** - reduces overfitting, improves accuracy (sometimes), reduces training time.
## Feature Selection 
**In a nutshell**: runs analysis on dataset and returns the best set of features which can be used for classification. <br>
* **Filter Method** - calculates the correlation of each feature variable with the target variable. The features are ranked by score and either kept or removed from the dataset (e.g. Chi-square, information gain, correlation coefficient scores).<br>
* **Wrapper Method** - search for combination of variables that performs the best using a certain heuristic (e.g. subset, forward, backward elimination) <br>
* **Embedded method** - learn which features best contribute to accuracy while model is being created using regularization methods (e.g. Lasso, Ridge, Elastic Net)<br>

## Feature Extraction 
**In a nutshell**: use the initial features to build derived features that are more informative and less redundant than the original dataset. This is different from feature selection in that feature extraction combines all feature information into newly-created features without completely eliminating low-contributing features (e.g. PCA, NMF, kernel PCA, Graph-based kernel PCA etc.). <br>

## What we are using:
* **Lasso** - a **feature selection** method that uses cross-validated LASSO regression to filter out unimportant features <br>
* **PCA** - a **feature extraction** method that uses the correlation between some dimensions and tries to provide a minimum number of variables that keep the maximum amount of variation, i.e. information about how the original data is distributed. <br>
* **t-SNE** - a **dimensionality reduction** method that maps multi-dimensional data to a lower-dimensional space (typically 2 or 3) by constructing pairwise probability distributions that model similar objects, then minimizing the KL divergence between the distribution of the original data and that of the lower-dimensional data. This is mainly used for visualizing high-dimensional data, not for feeding directly into models. <br>

In [2]:
def lassoSelection(X_train, y_train, n):
    """Choose at most ``n`` feature columns with a cross-validated Lasso.

    A ``LassoCV`` model is fitted inside ``SelectFromModel`` starting from a
    threshold of 0.  While more than ``n`` columns survive, the threshold is
    raised in steps of 0.01 and the selection is re-applied, printing the
    surviving column count after every step.

    Input:
    X_train - training feature matrix [samples x features]
    y_train - training targets
    n - upper bound on the number of columns to keep
    Output:
    list of selected column indices (may hold fewer than ``n`` entries,
    because one threshold step can drop several columns at once)
    """
    selector = SelectFromModel(LassoCV(max_iter=10000, tol=0.001), threshold=0)
    selector.fit(X_train, y_train)

    # Count surviving columns by actually transforming the training matrix.
    remaining = selector.transform(X_train).shape[1]
    while remaining > n:
        selector.threshold += 0.01
        remaining = selector.transform(X_train).shape[1]
        print("n_features =", remaining)

    # Positions of the columns the selector currently keeps.
    features = [position for position, kept in enumerate(selector.get_support()) if kept]
    logger.info("selected features are {}".format(features))
    logger.info("Features selected from Lasso using SelectFromModel with threshold {:6.4f}".format(selector.threshold))
    return features

In [3]:
def pcaSelection(X_train, X_test, n):
    """Project the train and test matrices onto ``n`` principal components.

    The PCA is fitted on ``X_train`` only and then applied to both splits.
    The resulting shapes and the summed explained-variance ratio are logged.

    Input:
    X_train - training feature matrix used to fit the PCA
    X_test - test feature matrix, transformed with the fitted PCA
    n - number of principal components to keep
    Output:
    two-element list [X_train_new, X_test_new] with the projected matrices
    """
    reducer = PCA(n_components=n)
    reducer.fit(X_train)

    # Same fitted projection for both splits, train first.
    projected = [reducer.transform(split) for split in (X_train, X_test)]

    logger.info("X_train size after PCA: {}".format(projected[0].shape))
    logger.info("X_test size after PCA: {}".format(projected[1].shape))
    explained = np.sum(reducer.explained_variance_ratio_)
    logger.info("Cumulative explained variation for {} principal components: {:6.4f}".format(n, explained))
    return projected

In [4]:
def tsneSelection(X_train, n, v):
    """Embed ``X_train`` into ``n`` dimensions with t-SNE.

    Input:
    X_train - dataset with dimension [P-samples x Q-features]
    n - number of dimensions of the embedding
    v - verbosity passed to TSNE (1 for verbose, 0 for silence)
    Output:
    embedded dataset with dimension [P-samples x n-features]; its shape is
    also written to the log
    """
    embedded = TSNE(n_components=n, verbose=v).fit_transform(X_train)
    logger.info("X_train size after tSNE: {}".format(embedded.shape))
    return embedded

# Scatter Plot Helpers
**2D Scatter Plot** - graph multi-class data with 2 features in a 2D plot<br>
**3D Scatter Plot** - graph multi-class data with 3 features in a 3D plot<br>

In [5]:
# Shared settings for the 2D & 3D scatter plots.

# Number of label values the plotting helpers loop over.
num_class = 36

# Display names of the classes.
# NOTE(review): this list has 35 entries while num_class is 36 - confirm which
# one matches the label encoding of the dataset.
classes_labels = [
    'Normal', 'Breast', 'Uterine Corpus', 'Head', 'Kidney Renal Clear',
    'Lung Adenocarcinoma', 'Brain', 'Thyroid', 'Prostate', 'Ovarian',
    'Lung Squamous', 'Skin', 'Colon', 'Stomach', 'Bladder', 'Liver',
    'Cervical', 'Kidney Renal Papillary', 'Leukemia', 'Sarcoma', 'Esophageal',
    'Pheochromocytoma', 'Pancreatic', 'Rectum', 'Testicular', 'Wilms',
    'Thymoma', 'Mesothelioma', 'Adrenocortical', 'Uveal', 'Kidney Chromophobe',
    'Uterine Carcinosarcoma', 'Lymphoid', 'Rhabdoid', 'Cholangiocarcinoma',
]

# Endless iterators handed out one item per trace by the scatter helpers.
colors = it.cycle([
    "aquamarine", "crimson", "darkseagreen", "deeppink", "wheat", "violet",
    "fuchsia", "turquoise", "ivory", "honeydew", "rosybrown", "red",
    "lemonchiffon", "darkorchid", "mintcream", "papayawhip", "beige",
    "darkcyan", "firebrick", "deepskyblue", "seashell", "mediumpurple",
    "goldenrod", "lightcoral", "limegreen", "cadetblue", "darkmagenta",
    "ghostwhite", "gainsboro", "paleturquoise", "teal", "peru", "maroon",
    "olivedrab", "springgreen", "yellowgreen",
])
classes = it.cycle(classes_labels)

def scatter2D(X_train_2d, labels=None):
    '''
    Function to generate traces for a 2D scatter plot, one trace per class.

    Args:
        X_train_2d: 2-feature X_train of dimension [?,2]; column 0 is plotted
            on the x axis and column 1 on the y axis.
        labels: class label of every row of X_train_2d.  Optional; when
            omitted the module-level y_train is used, as before.
    Return: list of num_class go.Scatter trace objects, ordered by label
        value 0 .. num_class-1.  A label with no rows yields an empty trace.
    '''
    if labels is None:
        labels = y_train
    labels = np.asarray(labels)

    n_names = len(classes_labels)
    data = []
    for label in range(num_class):
        # Row indices of the samples that carry this label.
        filtered_idx = np.argwhere(labels == label)[:, 0]
        trace = go.Scatter(
            x=X_train_2d[filtered_idx, 0],
            y=X_train_2d[filtered_idx, 1],
            mode='markers',
            marker=dict(
                size=5,
                line=dict(
                    # Still drawn from the shared colour cycle; it holds as
                    # many colours as num_class, so it is back at its first
                    # colour when this call returns.
                    color=next(colors),
                    width=0.1,
                ),
                opacity=0.5,
            ),
            # Look the legend name up by label instead of pulling it from the
            # shared `classes` cycle: that cycle holds fewer names than
            # num_class, so advancing it num_class times per call shifted
            # every legend name by one on each call after the first.  The
            # modulo keeps the wrap-around naming the first call produced.
            name=classes_labels[label % n_names],
        )
        data.append(trace)
    return data


def scatter3D(X_train_3d, labels=None):
    '''
    Function to generate traces for a 3D scatter plot, one trace per class.

    Args:
        X_train_3d: 3-feature X_train of dimension [?,3]; columns 0, 1 and 2
            are plotted on the x, y and z axes.
        labels: class label of every row of X_train_3d.  Optional; when
            omitted the module-level y_train is used, as before.
    Return: list of num_class go.Scatter3d trace objects, ordered by label
        value 0 .. num_class-1.  A label with no rows yields an empty trace.
    '''
    if labels is None:
        labels = y_train
    labels = np.asarray(labels)

    n_names = len(classes_labels)
    data = []
    for label in range(num_class):
        # Row indices of the samples that carry this label.
        filtered_idx = np.argwhere(labels == label)[:, 0]
        trace = go.Scatter3d(
            x=X_train_3d[filtered_idx, 0],
            y=X_train_3d[filtered_idx, 1],
            z=X_train_3d[filtered_idx, 2],
            mode='markers',
            marker=dict(
                size=5,
                line=dict(
                    # Still drawn from the shared colour cycle; it holds as
                    # many colours as num_class, so it is back at its first
                    # colour when this call returns.
                    color=next(colors),
                    width=0.1,
                ),
                opacity=0.5,
            ),
            # Look the legend name up by label instead of pulling it from the
            # shared `classes` cycle: that cycle holds fewer names than
            # num_class, so advancing it num_class times per call shifted
            # every legend name by one on each call after the first.  The
            # modulo keeps the wrap-around naming the first call produced.
            name=classes_labels[label % n_names],
        )
        data.append(trace)
    return data

NameError: name 'it' is not defined

# Model Helper

## Estimators
### Linear Estimators - model based on generalized linear models
* **Logistic Regression** - uses the logistic (sigmoid) function to model the relationship between the variables and the class probability, and uses the data to find the coefficients.<br>

### Ensemble Estimators
**In a nutshell** - weighted combinations of simple predictors (e.g. one-level decision trees), where correct predictors are given more weight. Boosting algorithms focus new learners on sample points that previous predictors got wrong. <br>

* **Random Forest Classifier** - divides the data into sub-samples and constructs a binary decision tree for each set of sub-samples. The Gini criterion is used to find the best "split" for each decision junction. Validation or test data is then passed through each decision tree, which outputs a probability for each class. The probabilities from all trees are then averaged for a final classification decision. <br>
* **Extra Trees Classifier** - similar to Random Forest, but a random value is selected for each split <br>
* **AdaBoost Classifier** - uses N 1-level decision trees (optimized by the Gini impurity cost function) as base classifiers. Wrongly-predicted samples are given larger weights for the next round of decision tree optimization. The contribution of each classifier is shrunk by learning_rate. At the end, the final classifier is a weighted combination of all N estimators.
 * By setting a high number of estimators and a low learning rate, the model will converge with high accuracy at the expense of long computation time. <br>
* **Gradient Boosting Classifier** - use simple regression estimator to fit data, use error residual(MSE) to iteratively fit training data. Simple regressors are added up in a stage-wise fashion to get a complex regression model. <br>

### SVM Estimator - binary linear classifier by a separating hyperplane
* **SVC** with RBF (radial basis function) kernel – uses squared Euclidean distance <br>

### Neural Network Estimator - model based on neural networks
* **Multi-Layer Perceptron (MLP) Classifier** - feedforward artificial neural network, optimizes log-loss! <br>

## K-fold Hyperparameter selection
**GridSearchCV** - exhaustively predicts and scores all parameter combinations for an estimator

In [6]:
def model_fit_predict(X_train, X_test, y_train, y_test, v):
    """Grid-search a fixed set of classifiers and score each on the test split.

    For every configured estimator a 2-fold ``GridSearchCV`` is fitted on the
    training data, the refitted best model predicts the test data, and the
    accuracy, confusion matrix, classification report and ROC points are
    collected.  Progress, the best parameters and the metrics are printed.

    Input:
    X_train, y_train - training features and labels
    X_test, y_test - test features and labels
    v - verbosity forwarded to every estimator and to GridSearchCV
    Output:
    five dicts keyed by model name: accuracies, confusion matrices,
    classification reports, false-positive rates and true-positive rates.
    Each fpr/tpr entry is itself a dict keyed by class index plus "micro".
    """
    # np.random.seed(2018)
    from sklearn.ensemble import (
        AdaBoostClassifier,
        ExtraTreesClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.neural_network import MLPClassifier
    from sklearn.preprocessing import label_binarize
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # (name, estimator, hyper-parameter grid), evaluated in this order.
    candidates = [
        (
            'LogisticRegression',
            LogisticRegression(random_state=0, multi_class='ovr', solver='lbfgs',
                               max_iter=10000, tol=0.001, verbose=v),
            {'C': [1]},
        ),
        (
            'ExtraTreesClassifier',
            ExtraTreesClassifier(random_state=0, verbose=v),
            {'n_estimators': [200, 500, 700, 1000]},
        ),
        (
            'RandomForestClassifier',
            RandomForestClassifier(random_state=0, verbose=v),
            {'n_estimators': [200, 500, 1000, 5000], 'min_samples_leaf': [1, 2, 3]},
        ),
        # Disabled configurations, kept for reference:
        # 'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),random_state=0)
        #     with { 'n_estimators': [500,1000,5000],'learning_rate': [0.05,0.2, 0.7]}
        # 'GradientBoostingClassifier': GradientBoostingClassifier(n_estimators = 5000, random_state=0, n_iter_no_change=10, verbose=v)
        #     with { 'learning_rate': [0.05,0.2, 0.7] }
        (
            'SVC',
            SVC(random_state=0, decision_function_shape='ovo', max_iter=10000,
                tol=0.001, verbose=v),
            {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        ),
        (
            'MLP',
            MLPClassifier(random_state=0, hidden_layer_sizes=(500, 100, 50),
                          solver='adam', max_iter=1000, verbose=v,
                          learning_rate='adaptive', activation='relu'),
            {'batch_size': [50, 200], 'alpha': [0.0001, 0.005, 0.01],
             'tol': [0.01, 0.001, 0.0001]},
        ),
    ]

    # Label values 0..35 used to one-hot encode truths and predictions.
    # NOTE(review): 36 labels are binarized here, but the per-class ROC loop
    # below stops at index num_class-2 and the report uses classes_labels -
    # confirm the intended number of classes.
    roc_labels = list(range(36))

    accuracies, confusion_mat, report = {}, {}, {}
    fpr_all, tpr_all = {}, {}
    for key, estimator, grid in candidates:
        print("Running", key, "...")
        search = GridSearchCV(estimator, grid, scoring=None, refit=True, cv=2, verbose=v)
        search.fit(X_train, y_train)
        print(search.best_params_)

        # Metrics on the held-out split, using the refitted best estimator.
        y_test_predict = search.predict(X_test)
        accuracies[key] = accuracy_score(y_test, y_test_predict)  # subset accuracy
        confusion_mat[key] = confusion_matrix(y_test, y_test_predict)
        report[key] = classification_report(y_test, y_test_predict, target_names=classes_labels)

        # ROC points per class and micro-averaged; they are derived from the
        # binarized hard predictions, not from decision scores.
        truth_bin = label_binarize(y_test, classes=roc_labels)
        predicted_bin = label_binarize(y_test_predict, classes=roc_labels)
        fpr, tpr = {}, {}
        for i in range(num_class - 1):
            fpr[i], tpr[i], _ = roc_curve(truth_bin[:, i], predicted_bin[:, i])
        fpr["micro"], tpr["micro"], _ = roc_curve(truth_bin.ravel(), predicted_bin.ravel())
        fpr_all[key] = fpr
        tpr_all[key] = tpr

        for metric in (accuracies, confusion_mat, report):
            print(metric[key])
    return accuracies, confusion_mat, report, fpr_all, tpr_all

# Pre-process Data
1. **Separate X (features) and y (labels)** <br>
2. **Split training (70%) and testing (30%) dataset** <br>
3. **Standardize data** - scale features such that they are:
  1. zero-mean
  2. one-variance

In [8]:
# Path of the CSV that holds the feature matrix together with the 'label' and
# 'file_id' columns.
data_file = "gdc-emr0/cpv_full_matrix.csv"


def _log_mean_and_std(mean_template, std_template, matrix):
    """Log the per-feature mean and std of ``matrix``, averaged over the features."""
    logger.info(mean_template.format(sum(matrix.mean(axis=0)) / num_features_orig))
    logger.info(std_template.format(sum(matrix.std(axis=0)) / num_features_orig))


# Load the table, split the labels off and drop the file identifier; every
# remaining column is treated as a feature.
df = pd.read_csv(data_file)
y_data = df.pop('label').values
df.pop('file_id')
columns = df.columns
X_data = df.values
num_features_orig = X_data.shape[1]
logger.info("Original dataset size: {}".format(X_data.shape[0]))
logger.info("Total feature num: {}".format(num_features_orig))

# 70 % / 30 % train / test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=0,
)
logger.info("Training dataset size: {}".format(X_train.shape[0]))
logger.info("Testing dataset size: {}".format(X_test.shape[0]))

# Standardize both splits with statistics estimated on the training split only.
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

_log_mean_and_std(
    "Mean of X-data features before standardization: {:6.4f}",
    "STD of X-data features before standardization: {:6.4f}",
    X_data,
)
_log_mean_and_std(
    "Mean of X-train features after standardization: {:6.4f}",
    "STD of X-train features after standardization: {:6.4f}",
    X_train,
)
_log_mean_and_std(
    "Mean of X-test features after standardization: {:6.4f}",
    "STD of X-test features after standardization: {:6.4f}",
    X_test,
)

# Graph layout shared by the plots: no margin on any side.
layout = go.Layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})

[2018-10-24 15:31:12,629 - GDC - INFO] Original dataset size: 11486
[2018-10-24 15:31:12,645 - GDC - INFO] Total feature num: 1881
[2018-10-24 15:31:13,922 - GDC - INFO] Training dataset size: 8040
[2018-10-24 15:31:13,945 - GDC - INFO] Testing dataset size: 3446
[2018-10-24 15:31:14,928 - GDC - INFO] Mean of X-data features before standardization: 2458.7368
[2018-10-24 15:31:15,186 - GDC - INFO] STD of X-data features before standardization: 4450.1113
[2018-10-24 15:31:15,220 - GDC - INFO] Mean of X-train features after standardization: -0.0000
[2018-10-24 15:31:15,381 - GDC - INFO] STD of X-train features after standardization: 0.9351
[2018-10-24 15:31:15,398 - GDC - INFO] Mean of X-test features after standardization: -0.0069
[2018-10-24 15:31:15,485 - GDC - INFO] STD of X-test features after standardization: 0.9125


# Fit Model after LASSO-Selected Features

In [9]:
# Fit and score every model on the LASSO-selected columns only.
# `n` is the upper bound on how many columns lassoSelection may keep.
n = 50
feaures_columns = lassoSelection(X_train=X_train, y_train=y_train, n=n)
# Hard-coded alternative to the call above (uncomment to skip the LASSO search):
# feaures_columns = [25, 92, 119, 163, 166, 168, 181, 187, 194, 216, 240, 241, 248, \
# 253, 271, 272, 273, 282, 285, 287, 295, 305, 306, 336, 337, 339, 341, 351, 352, 488, \
# 495, 503, 511, 544, 588, 593, 641, 764, 1063, 1090, 1100, 1126, 1395, 1461, 1509, 1523, 1834, 1848, 1872]
scores_lasso, mat_lasso, report_lasso, fpr_lasso, tpr_lasso = model_fit_predict(
    X_train=X_train[:, feaures_columns],
    X_test=X_test[:, feaures_columns],
    y_train=y_train,
    y_test=y_test,
    v=1,
)


You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.



n_features = 460
n_features = 411
n_features = 359
n_features = 330
n_features = 288
n_features = 263
n_features = 232
n_features = 210
n_features = 193
n_features = 173
n_features = 160
n_features = 153
n_features = 138
n_features = 128
n_features = 119
n_features = 105
n_features = 96
n_features = 89
n_features = 84
n_features = 78
n_features = 73
n_features = 69
n_features = 63
n_features = 54
n_features = 52
n_features = 49

[2018-10-24 15:34:32,833 - GDC - INFO] selected features are [25, 92, 119, 163, 166, 168, 181, 187, 194, 216, 240, 241, 248, 253, 271, 272, 273, 282, 285, 287, 295, 305, 306, 336, 337, 339, 341, 351, 352, 488, 495, 503, 511, 544, 588, 593, 641, 764, 1063, 1090, 1100, 1126, 1395, 1461, 1509, 1523, 1834, 1848, 1872]
[2018-10-24 15:34:32,857 - GDC - INFO] Features selected from Lasso using SelectFromModel with threshold 0.2600



Running LogisticRegression ...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    4.0s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:    6.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'C': 1}
0.8284968078932096
[[154   1   1 ...   0   0   0]
 [  2 291  12 ...   0   0   0]
 [  3  13 137 ...   0   0   0]
 ...
 [  0   0   0 ...  12   0   0]
 [  1   0   0 ...   0  18   0]
 [  0   1   0 ...   0   0   2]]
                        precision    recall  f1-score   support

                Normal       0.80      0.77      0.78       201
                Breast       0.89      0.90      0.90       324
        Uterine Corpus       0.77      0.79      0.78       174
                  Head       0.70      0.82      0.76       152
    Kidney Renal Clear       0.89      0.95      0.92       155
   Lung Adenocarcinoma       0.76      0.80      0.78       162
                 Brain       0.99      0.98      0.99       170
               Thyroid       0.97      0.95      0.96       149
              Prostate       0.90      1.00      0.95       155
               Ovarian       0.94      0.94      0.94       146
         Lung Squamous       0.65      0.70      0.68       142
           

[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Paralle

{'n_estimators': 500}


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.3s finished


0.8496807893209518
[[177   2   0 ...   0   0   0]
 [  2 306   4 ...   0   0   0]
 [  3  13 147 ...   0   0   0]
 ...
 [  0   0   0 ...  11   0   0]
 [  0   0   0 ...   0  19   0]
 [  2   0   0 ...   0   0   3]]
                        precision    recall  f1-score   support

                Normal       0.86      0.88      0.87       201
                Breast       0.87      0.94      0.91       324
        Uterine Corpus       0.79      0.84      0.82       174
                  Head       0.65      0.80      0.72       152
    Kidney Renal Clear       0.98      0.97      0.98       155
   Lung Adenocarcinoma       0.78      0.75      0.77       162
                 Brain       0.99      0.99      0.99       170
               Thyroid       0.96      0.99      0.97       149
              Prostate       0.94      0.99      0.96       155
               Ovarian       0.99      0.95      0.97       146
         Lung Squamous       0.63      0.75      0.69       142
                  Sk

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    5.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    9.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   10.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)

{'min_samples_leaf': 1, 'n_estimators': 1000}


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    5.2s finished


0.8528728961114336
[[174   4   0 ...   0   0   0]
 [  2 301   6 ...   0   0   0]
 [  3   8 146 ...   0   0   0]
 ...
 [  0   0   0 ...  13   0   0]
 [  0   0   0 ...   0  17   0]
 [  0   0   0 ...   0   0   4]]
                        precision    recall  f1-score   support

                Normal       0.90      0.87      0.88       201
                Breast       0.88      0.93      0.91       324
        Uterine Corpus       0.82      0.84      0.83       174
                  Head       0.73      0.82      0.77       152
    Kidney Renal Clear       0.96      0.97      0.97       155
   Lung Adenocarcinoma       0.76      0.72      0.74       162
                 Brain       0.98      0.99      0.99       170
               Thyroid       0.97      0.99      0.98       149
              Prostate       0.95      0.97      0.96       155
               Ovarian       1.00      0.95      0.98       146
         Lung Squamous       0.60      0.80      0.68       142
                  Sk

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.2min finished


[LibSVM]{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7811955890887986
[[147   4   0 ...   0   0   0]
 [  4 292  10 ...   0   0   0]
 [  6  29 122 ...   0   0   0]
 ...
 [  0   0   0 ...   9   0   0]
 [  0   0   0 ...   0  20   0]
 [  1   1   0 ...   0   0   0]]
                        precision    recall  f1-score   support

                Normal       0.69      0.73      0.71       201
                Breast       0.71      0.90      0.80       324
        Uterine Corpus       0.80      0.70      0.75       174
                  Head       0.64      0.71      0.67       152
    Kidney Renal Clear       0.87      0.92      0.89       155
   Lung Adenocarcinoma       0.71      0.68      0.69       162
                 Brain       0.99      0.97      0.98       170
               Thyroid       0.96      0.94      0.95       149
              Prostate       0.85      1.00      0.92       155
               Ovarian       0.97      0.95      0.96       146
         Lung Squamous       0.53      0.61      0.57       142
                  Sk

Iteration 6, loss = 0.48642139
Iteration 7, loss = 0.44184552
Iteration 8, loss = 0.39596461
Iteration 9, loss = 0.36058299
Iteration 10, loss = 0.32983389
Iteration 11, loss = 0.31530260
Iteration 12, loss = 0.29574077
Iteration 13, loss = 0.27172851
Iteration 14, loss = 0.25411811
Iteration 15, loss = 0.23668489
Iteration 16, loss = 0.21943742
Iteration 17, loss = 0.21382630
Iteration 18, loss = 0.19129552
Iteration 19, loss = 0.20037320
Iteration 20, loss = 0.17476539
Iteration 21, loss = 0.17170412
Iteration 22, loss = 0.15367627
Iteration 23, loss = 0.15755296
Iteration 24, loss = 0.16364769
Iteration 25, loss = 0.13679019
Iteration 26, loss = 0.12123811
Iteration 27, loss = 0.11463014
Iteration 28, loss = 0.11880908
Iteration 29, loss = 0.11403879
Iteration 30, loss = 0.09724425
Iteration 31, loss = 0.09683975
Iteration 32, loss = 0.11284309
Iteration 33, loss = 0.08695071
Iteration 34, loss = 0.08740459
Iteration 35, loss = 0.12836543
Iteration 36, loss = 0.12103091
Iteration 37

Iteration 1, loss = 3.10590613
Iteration 2, loss = 1.97243620
Iteration 3, loss = 1.32691300
Iteration 4, loss = 1.02304132
Iteration 5, loss = 0.86339185
Iteration 6, loss = 0.74698850
Iteration 7, loss = 0.65798262
Iteration 8, loss = 0.62055445
Iteration 9, loss = 0.58309668
Iteration 10, loss = 0.54428361
Iteration 11, loss = 0.50254810
Iteration 12, loss = 0.46998775
Iteration 13, loss = 0.45849831
Iteration 14, loss = 0.43541308
Iteration 15, loss = 0.39911702
Iteration 16, loss = 0.38075665
Iteration 17, loss = 0.36410580
Iteration 18, loss = 0.35634218
Iteration 19, loss = 0.34691361
Iteration 20, loss = 0.36088072
Iteration 21, loss = 0.31299513
Iteration 22, loss = 0.32063691
Iteration 23, loss = 0.34156494
Iteration 24, loss = 0.28091962
Iteration 25, loss = 0.26854552
Iteration 26, loss = 0.25775274
Iteration 27, loss = 0.26201176
Iteration 28, loss = 0.27753547
Iteration 29, loss = 0.24404713
Iteration 30, loss = 0.22903995
Iteration 31, loss = 0.22512872
Iteration 32, los

Iteration 140, loss = 0.01328884
Iteration 141, loss = 0.01332889
Iteration 142, loss = 0.01481705
Iteration 143, loss = 0.04611622
Iteration 144, loss = 0.02587858
Iteration 145, loss = 0.01923221
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 3.10449259
Iteration 2, loss = 1.95103608
Iteration 3, loss = 1.28358619
Iteration 4, loss = 0.98621675
Iteration 5, loss = 0.83273752
Iteration 6, loss = 0.72540794
Iteration 7, loss = 0.65012517
Iteration 8, loss = 0.58337300
Iteration 9, loss = 0.53784034
Iteration 10, loss = 0.52376399
Iteration 11, loss = 0.48804032
Iteration 12, loss = 0.45267893
Iteration 13, loss = 0.43638991
Iteration 14, loss = 0.40988782
Iteration 15, loss = 0.38818108
Iteration 16, loss = 0.35923772
Iteration 17, loss = 0.35202417
Iteration 18, loss = 0.32527102
Iteration 19, loss = 0.31623254
Iteration 20, loss = 0.30409510
Iteration 21, loss = 0.28638893
Iteration 22, loss = 0.27758301
Iteration 23, los

Iteration 8, loss = 0.42875640
Iteration 9, loss = 0.39397331
Iteration 10, loss = 0.36320843
Iteration 11, loss = 0.35216321
Iteration 12, loss = 0.32982702
Iteration 13, loss = 0.30553373
Iteration 14, loss = 0.28791133
Iteration 15, loss = 0.26992821
Iteration 16, loss = 0.25921259
Iteration 17, loss = 0.25027812
Iteration 18, loss = 0.24767171
Iteration 19, loss = 0.26373338
Iteration 20, loss = 0.23364278
Iteration 21, loss = 0.21920001
Iteration 22, loss = 0.20714917
Iteration 23, loss = 0.20702643
Iteration 24, loss = 0.19314991
Iteration 25, loss = 0.18046031
Iteration 26, loss = 0.17061632
Iteration 27, loss = 0.16724695
Iteration 28, loss = 0.16710119
Iteration 29, loss = 0.16388693
Iteration 30, loss = 0.15154311
Iteration 31, loss = 0.15651356
Iteration 32, loss = 0.14353281
Iteration 33, loss = 0.13356167
Iteration 34, loss = 0.13323113
Iteration 35, loss = 0.13416537
Iteration 36, loss = 0.13051147
Iteration 37, loss = 0.14444505
Iteration 38, loss = 0.19583101
Iteration 

Iteration 46, loss = 0.10039995
Iteration 47, loss = 0.09357899
Iteration 48, loss = 0.10142069
Iteration 49, loss = 0.10233597
Iteration 50, loss = 0.09101880
Iteration 51, loss = 0.09805759
Iteration 52, loss = 0.09856099
Iteration 53, loss = 0.09214155
Iteration 54, loss = 0.08706382
Iteration 55, loss = 0.08799589
Iteration 56, loss = 0.08258316
Iteration 57, loss = 0.09943912
Iteration 58, loss = 0.08569844
Iteration 59, loss = 0.08919988
Iteration 60, loss = 0.07561798
Iteration 61, loss = 0.09983266
Iteration 62, loss = 0.08084305
Iteration 63, loss = 0.08009097
Iteration 64, loss = 0.09636101
Iteration 65, loss = 0.11192461
Iteration 66, loss = 0.13124189
Iteration 67, loss = 0.22029828
Iteration 68, loss = 0.20528835
Iteration 69, loss = 0.12094841
Iteration 70, loss = 0.08899016
Iteration 71, loss = 0.08041618
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 2.26784195
Iteration 2, loss = 1.02565309
Iteration 3, los

Iteration 21, loss = 0.32136770
Iteration 22, loss = 0.32565462
Iteration 23, loss = 0.36315847
Iteration 24, loss = 0.29157025
Iteration 25, loss = 0.28202607
Iteration 26, loss = 0.27073374
Iteration 27, loss = 0.27284518
Iteration 28, loss = 0.29527626
Iteration 29, loss = 0.26196215
Iteration 30, loss = 0.24126618
Iteration 31, loss = 0.23775378
Iteration 32, loss = 0.33080927
Iteration 33, loss = 0.23477572
Iteration 34, loss = 0.23031639
Iteration 35, loss = 0.21875499
Iteration 36, loss = 0.19898430
Iteration 37, loss = 0.19315057
Iteration 38, loss = 0.18900622
Iteration 39, loss = 0.18831624
Iteration 40, loss = 0.17612031
Iteration 41, loss = 0.17405053
Iteration 42, loss = 0.17983515
Iteration 43, loss = 0.19629002
Iteration 44, loss = 0.17329152
Iteration 45, loss = 0.15916418
Iteration 46, loss = 0.15074835
Iteration 47, loss = 0.15627645
Iteration 48, loss = 0.19305463
Iteration 49, loss = 0.16449046
Iteration 50, loss = 0.17193551
Iteration 51, loss = 0.16515081
Iteratio

Iteration 1, loss = 3.11051107
Iteration 2, loss = 1.97825852
Iteration 3, loss = 1.33155141
Iteration 4, loss = 1.02703571
Iteration 5, loss = 0.86978959
Iteration 6, loss = 0.75191444
Iteration 7, loss = 0.66236142
Iteration 8, loss = 0.62677190
Iteration 9, loss = 0.58729781
Iteration 10, loss = 0.54788550
Iteration 11, loss = 0.50886969
Iteration 12, loss = 0.47815457
Iteration 13, loss = 0.46546836
Iteration 14, loss = 0.43926458
Iteration 15, loss = 0.40640625
Iteration 16, loss = 0.38619293
Iteration 17, loss = 0.36936002
Iteration 18, loss = 0.36348149
Iteration 19, loss = 0.35131415
Iteration 20, loss = 0.36532352
Iteration 21, loss = 0.32136770
Iteration 22, loss = 0.32565462
Iteration 23, loss = 0.36315847
Iteration 24, loss = 0.29157025
Iteration 25, loss = 0.28202607
Iteration 26, loss = 0.27073374
Iteration 27, loss = 0.27284518
Iteration 28, loss = 0.29527626
Iteration 29, loss = 0.26196215
Iteration 30, loss = 0.24126618
Iteration 31, loss = 0.23775378
Iteration 32, los

Iteration 94, loss = 0.06764427
Iteration 95, loss = 0.05423974
Iteration 96, loss = 0.04388842
Iteration 97, loss = 0.04523314
Iteration 98, loss = 0.04309304
Iteration 99, loss = 0.04256992
Iteration 100, loss = 0.03988442
Iteration 101, loss = 0.03949627
Iteration 102, loss = 0.04614541
Iteration 103, loss = 0.04092299
Iteration 104, loss = 0.03863844
Iteration 105, loss = 0.03738866
Iteration 106, loss = 0.03638097
Iteration 107, loss = 0.03468521
Iteration 108, loss = 0.03583376
Iteration 109, loss = 0.03400453
Iteration 110, loss = 0.03663637
Iteration 111, loss = 0.03403602
Iteration 112, loss = 0.05499844
Iteration 113, loss = 0.04417487
Iteration 114, loss = 0.03829468
Iteration 115, loss = 0.03349512
Iteration 116, loss = 0.03038775
Iteration 117, loss = 0.03055536
Iteration 118, loss = 0.03032103
Iteration 119, loss = 0.03208482
Iteration 120, loss = 0.03803458
Iteration 121, loss = 0.03822654
Iteration 122, loss = 0.04236763
Iteration 123, loss = 0.03900817
Iteration 124, l

Iteration 95, loss = 0.20853484
Iteration 96, loss = 0.15426781
Iteration 97, loss = 0.12618026
Iteration 98, loss = 0.12452029
Iteration 99, loss = 0.10770458
Iteration 100, loss = 0.10643200
Iteration 101, loss = 0.10574304
Iteration 102, loss = 0.09891713
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 2.29525439
Iteration 2, loss = 1.05509747
Iteration 3, loss = 0.82087039
Iteration 4, loss = 0.67470462
Iteration 5, loss = 0.59333327
Iteration 6, loss = 0.54616081
Iteration 7, loss = 0.50078800
Iteration 8, loss = 0.45710297
Iteration 9, loss = 0.42490063
Iteration 10, loss = 0.39765198
Iteration 11, loss = 0.38322921
Iteration 12, loss = 0.36714601
Iteration 13, loss = 0.34605059
Iteration 14, loss = 0.32757191
Iteration 15, loss = 0.31242153
Iteration 16, loss = 0.29563592
Iteration 17, loss = 0.29134158
Iteration 18, loss = 0.29187393
Iteration 19, loss = 0.29020155
Iteration 20, loss = 0.25575721
Iteration 21, loss =

Iteration 5, loss = 0.87603813
Iteration 6, loss = 0.75765872
Iteration 7, loss = 0.66889208
Iteration 8, loss = 0.63260883
Iteration 9, loss = 0.59395273
Iteration 10, loss = 0.55742274
Iteration 11, loss = 0.51774837
Iteration 12, loss = 0.48689546
Iteration 13, loss = 0.47264269
Iteration 14, loss = 0.44622377
Iteration 15, loss = 0.41311149
Iteration 16, loss = 0.39610808
Iteration 17, loss = 0.37773669
Iteration 18, loss = 0.37210999
Iteration 19, loss = 0.36217280
Iteration 20, loss = 0.37861884
Iteration 21, loss = 0.33086114
Iteration 22, loss = 0.33576215
Iteration 23, loss = 0.37155879
Iteration 24, loss = 0.30109690
Iteration 25, loss = 0.29191111
Iteration 26, loss = 0.28142918
Iteration 27, loss = 0.28226146
Iteration 28, loss = 0.29608763
Iteration 29, loss = 0.26751786
Iteration 30, loss = 0.24937671
Iteration 31, loss = 0.24691844
Iteration 32, loss = 0.33318503
Iteration 33, loss = 0.24520177
Iteration 34, loss = 0.24343050
Iteration 35, loss = 0.22987264
Iteration 36,

Iteration 28, loss = 0.25463120
Iteration 29, loss = 0.23717606
Iteration 30, loss = 0.24229930
Iteration 31, loss = 0.23049831
Iteration 32, loss = 0.21290389
Iteration 33, loss = 0.20623124
Iteration 34, loss = 0.20314348
Iteration 35, loss = 0.18990292
Iteration 36, loss = 0.19418960
Iteration 37, loss = 0.18468515
Iteration 38, loss = 0.18777966
Iteration 39, loss = 0.17549073
Iteration 40, loss = 0.17751697
Iteration 41, loss = 0.16815043
Iteration 42, loss = 0.16137748
Iteration 43, loss = 0.15658123
Iteration 44, loss = 0.15619186
Iteration 45, loss = 0.15444510
Iteration 46, loss = 0.15198352
Iteration 47, loss = 0.15075036
Iteration 48, loss = 0.15773874
Iteration 49, loss = 0.14699049
Iteration 50, loss = 0.13978792
Iteration 51, loss = 0.14255863
Iteration 52, loss = 0.13110464
Iteration 53, loss = 0.13014861
Iteration 54, loss = 0.12796735
Iteration 55, loss = 0.12506911
Iteration 56, loss = 0.12972296
Iteration 57, loss = 0.11889183
Iteration 58, loss = 0.10809435
Iteratio

Iteration 37, loss = 0.18468515
Iteration 38, loss = 0.18777966
Iteration 39, loss = 0.17549073
Iteration 40, loss = 0.17751697
Iteration 41, loss = 0.16815043
Iteration 42, loss = 0.16137748
Iteration 43, loss = 0.15658123
Iteration 44, loss = 0.15619186
Iteration 45, loss = 0.15444510
Iteration 46, loss = 0.15198352
Iteration 47, loss = 0.15075036
Iteration 48, loss = 0.15773874
Iteration 49, loss = 0.14699049
Iteration 50, loss = 0.13978792
Iteration 51, loss = 0.14255863
Iteration 52, loss = 0.13110464
Iteration 53, loss = 0.13014861
Iteration 54, loss = 0.12796735
Iteration 55, loss = 0.12506911
Iteration 56, loss = 0.12972296
Iteration 57, loss = 0.11889183
Iteration 58, loss = 0.10809435
Iteration 59, loss = 0.11574680
Iteration 60, loss = 0.10656156
Iteration 61, loss = 0.10276841
Iteration 62, loss = 0.10534101
Iteration 63, loss = 0.10240559
Iteration 64, loss = 0.10642241
Iteration 65, loss = 0.10899829
Iteration 66, loss = 0.12092938
Iteration 67, loss = 0.09980490
Iteratio

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 16.9min finished


Iteration 1, loss = 1.69956970
Iteration 2, loss = 0.79408992
Iteration 3, loss = 0.63626540
Iteration 4, loss = 0.55688471
Iteration 5, loss = 0.50608658
Iteration 6, loss = 0.46184332
Iteration 7, loss = 0.43826373
Iteration 8, loss = 0.41107718
Iteration 9, loss = 0.38681838
Iteration 10, loss = 0.37586424
Iteration 11, loss = 0.37866974
Iteration 12, loss = 0.33746118
Iteration 13, loss = 0.32501446
Iteration 14, loss = 0.31365388
Iteration 15, loss = 0.31213292
Iteration 16, loss = 0.30998473
Iteration 17, loss = 0.30217400
Iteration 18, loss = 0.28070476
Iteration 19, loss = 0.27143528
Iteration 20, loss = 0.26684413
Iteration 21, loss = 0.27788948
Iteration 22, loss = 0.25162147
Iteration 23, loss = 0.23925443
Iteration 24, loss = 0.23952484
Iteration 25, loss = 0.23152666
Iteration 26, loss = 0.22564842
Iteration 27, loss = 0.22117429
Iteration 28, loss = 0.22119791
Iteration 29, loss = 0.21112682
Iteration 30, loss = 0.22634597
Iteration 31, loss = 0.22204530
Iteration 32, los

# Fit Model with PCA-Reduced Features

In [None]:
# PCA feature reduction to n-components
n = 50 # 650 for ~80% representation
# pcaSelection and model_fit_predict are helpers defined earlier in the notebook.
X_train_pca, X_test_pca = pcaSelection(X_train, X_test, n)
# The five results (scores, matrices, reports, fpr, tpr) are indexed by model name in the evaluation cells below.
# NOTE(review): the meaning of the trailing positional argument 1 is not visible here - check model_fit_predict.
scores_pca, mat_pca, report_pca, fpr_pca, tpr_pca = model_fit_predict(X_train_pca,X_test_pca,y_train,y_test,1)

# Evaluation
**Accuracy** - the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true <br>
**Precision** - tp / (tp + fp) <br>
**Recall** - tp / (tp + fn) <br>
**F1-score** -  2 * (precision * recall) / (precision + recall) <br>
**Support** -  number of samples of the true response that lie in that class <br>
**ROC curve** - a graphical plot which shows the performance of a classification model at all classification thresholds <br>
**AUC** - aggregate measure of performance across all possible classification thresholds <br>

In [None]:
from sklearn.metrics import auc
models = {'LogisticRegression','ExtraTreesClassifier','RandomForestClassifier','AdaBoostClassifier',\
          'GradientBoostingClassifier','SVC','MLP'}
def ROCplot(fpr,tpr):
    '''
    Function to generate the line traces for one model's ROC curves
    Args: fpr, tpr - mappings for a single model, indexed by integer class
          label and by the key 'micro'; each entry is the x (fpr) / y (tpr)
          series of one curve
    Return: list of go.Scatter trace objects: the dashed chance diagonal first,
            then one trace per label in range(0,num_class-1), and the
            micro-average trace last
    Note: reads num_class and advances the colors and classes iterators with
          next(); none of these names are defined inside this function.
    '''
    data=[]
    lw = 1
    # Reference diagonal from (0,0) to (1,1)
    trace_mid = go.Scatter(x=[0, 1], y=[0, 1], 
                        mode='lines', 
                        line=dict(color='navy', width=lw, dash='dash'),
                        name='Random Classification')
    data.append(trace_mid)
    # NOTE(review): range(0,num_class-1) stops at num_class-2. If class labels run
    # 0..num_class-1 the last class gets no curve, and each call consumes one fewer
    # name from `classes` than there are classes - confirm this is intended.
    for label in range (0,num_class-1):
        # Legend entry is the next class name plus the area under this class's curve
        trace = go.Scatter(x=fpr[label], y=tpr[label], 
                            mode='lines', 
                            line=dict(color=next(colors), width=lw),
                            name= next(classes) + ' (area = %0.2f)' % auc(fpr[label], tpr[label])
                           )
        data.append(trace)
    # Micro-average curve is appended last, so callers can address it as data[-1]
    trace_avg = go.Scatter(x=fpr['micro'], y=tpr['micro'], 
                        mode='lines', 
                        line=dict(color=next(colors), width=lw),
                        name= 'micro_avg' + ' (area = %0.2f)' % auc(fpr['micro'], tpr['micro'])
                       )
    data.append(trace_avg)
    return data


# ROC figure for Logistic Regression.
# [:-1] leaves the last trace returned by ROCplot (the micro-average) out of this figure;
# the full list is kept in logistic_roc_data.
logistic_roc_data = ROCplot(fpr_pca['LogisticRegression'],tpr_pca['LogisticRegression'])
layout = go.Layout(title="Logistic Regression ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_1 = go.Figure(data=logistic_roc_data[:-1], layout=layout)
py.iplot(fig_roc_1)


In [None]:
# ROC figure for the Extra Trees classifier.
# [:-1] leaves the last trace returned by ROCplot (the micro-average) out of this figure.
extratrees_roc_data = ROCplot(fpr_pca['ExtraTreesClassifier'],tpr_pca['ExtraTreesClassifier'])
layout = go.Layout(title="Extra Trees Classifier ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_2 = go.Figure(data=extratrees_roc_data[:-1], layout=layout)
py.iplot(fig_roc_2)

In [None]:
# ROC figure for the Random Forest classifier.
# [:-1] leaves the last trace returned by ROCplot (the micro-average) out of this figure.
rf_roc_data = ROCplot(fpr_pca['RandomForestClassifier'],tpr_pca['RandomForestClassifier'])
layout = go.Layout(title="Random Forest Classifier ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_3 = go.Figure(data=rf_roc_data[:-1], layout=layout)
py.iplot(fig_roc_3)

In [None]:
# ROC figure for the AdaBoost classifier.
# NOTE(review): every trace from ROCplot is plotted here (no [:-1] slice), so this figure
# includes the micro-average curve while the Logistic/ExtraTrees/RandomForest/SVC/MLP figures
# omit it - confirm which is intended.
adaboost_roc_data = ROCplot(fpr_pca['AdaBoostClassifier'],tpr_pca['AdaBoostClassifier'])
layout = go.Layout(title="Adaboost Classifier ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_4 = go.Figure(data=adaboost_roc_data, layout=layout)
py.iplot(fig_roc_4)

In [None]:
# ROC figure for the Gradient Boosting classifier.
# NOTE(review): every trace from ROCplot is plotted here (no [:-1] slice), so this figure
# includes the micro-average curve - confirm this difference from the sliced figures is intended.
gd_roc_data = ROCplot(fpr_pca['GradientBoostingClassifier'],tpr_pca['GradientBoostingClassifier'])
layout = go.Layout(title="Gradient Boosting Classifier ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_5 = go.Figure(data=gd_roc_data, layout=layout)
py.iplot(fig_roc_5)

In [None]:
# ROC figure for the SVC classifier.
# [:-1] leaves the last trace returned by ROCplot (the micro-average) out of this figure.
svc_roc_data = ROCplot(fpr_pca['SVC'],tpr_pca['SVC'])
layout = go.Layout(title="SVC Classifier ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_6 = go.Figure(data=svc_roc_data[:-1], layout=layout)
py.iplot(fig_roc_6)

In [None]:
# ROC figure for the Multi-Layer Perceptron classifier.
# [:-1] leaves the last trace returned by ROCplot (the micro-average) out of this figure.
mlp_roc_data = ROCplot(fpr_pca['MLP'],tpr_pca['MLP'])
layout = go.Layout(title="Multi-Layer Perceptron Classifier ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_7 = go.Figure(data=mlp_roc_data[:-1], layout=layout)
py.iplot(fig_roc_7)

In [None]:
# 5 estimators
# Overlays the micro-average ROC curve of five models (AdaBoost and Gradient Boosting
# are not included) on one figure, each labelled with its area under the curve.

lw = 2
# Dashed reference diagonal from (0,0) to (1,1)
trace_mid = go.Scatter(x=[0, 1], y=[0, 1],
                    mode='lines', 
                    line=dict(color='navy', width=lw, dash='dash'),
                    name='Random Classification')
trace1 = go.Scatter(x=fpr_pca['LogisticRegression']['micro'], y=tpr_pca['LogisticRegression']['micro'], 
                    mode='lines', 
                    line=dict(color='aquamarine',width=lw),
                    name= 'Logistic Regression (area = %0.2f)' % auc(fpr_pca['LogisticRegression']['micro'], tpr_pca['LogisticRegression']['micro']))
trace2 = go.Scatter(x=fpr_pca['ExtraTreesClassifier']['micro'], y=tpr_pca['ExtraTreesClassifier']['micro'], 
                    mode='lines', 
                    line=dict(color='crimson',width=lw),
                    name= 'Extra Trees Classifier (area = %0.2f)' % auc(fpr_pca['ExtraTreesClassifier']['micro'], tpr_pca['ExtraTreesClassifier']['micro']))
trace3 = go.Scatter(x=fpr_pca['RandomForestClassifier']['micro'], y=tpr_pca['RandomForestClassifier']['micro'], 
                    mode='lines', 
                    line=dict(color='darkseagreen',width=lw),
                    name= 'Random Forest Classifier (area = %0.2f)' % auc(fpr_pca['RandomForestClassifier']['micro'], tpr_pca['RandomForestClassifier']['micro']))
trace4 = go.Scatter(x=fpr_pca['SVC']['micro'], y=tpr_pca['SVC']['micro'], 
                    mode='lines', 
                    line=dict(color='violet',width=lw),
                    name= 'SVC (area = %0.2f)' % auc(fpr_pca['SVC']['micro'], tpr_pca['SVC']['micro']))
trace5 = go.Scatter(x=fpr_pca['MLP']['micro'], y=tpr_pca['MLP']['micro'], 
                    mode='lines', 
                    line=dict(color='wheat',width=lw),
                    name= 'Multi-Layer Perceptron (area = %0.2f)' % auc(fpr_pca['MLP']['micro'], tpr_pca['MLP']['micro']))

data = [trace_mid,trace1,trace2,trace3,trace4,trace5]

layout = go.Layout(title="All Model ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_all = go.Figure(data=data, layout=layout)
py.iplot(fig_roc_all)

In [None]:
# 7 estimators
# Collects the last trace (the micro-average appended by ROCplot) from each model's
# trace list, so this cell needs all seven *_roc_data variables to exist already.
# Unlike the 5-estimator figure, no 'Random Classification' diagonal is added, and
# fig_roc_all is rebound here.
roc_micros=[logistic_roc_data[-1],extratrees_roc_data[-1],
              rf_roc_data[-1],adaboost_roc_data[-1],\
              gd_roc_data[-1],svc_roc_data[-1],\
              mlp_roc_data[-1]]
layout = go.Layout(title="All Model ROC curve",
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))
fig_roc_all = go.Figure(data=roc_micros, layout=layout)
py.iplot(fig_roc_all)

In [None]:
# 5 estimators
# Grouped bar chart comparing accuracy, precision, recall and F1-score of five models
# (AdaBoost and Gradient Boosting are not included).
acc_y = [scores_pca['LogisticRegression'], scores_pca['ExtraTreesClassifier'], scores_pca['RandomForestClassifier'],\
       scores_pca['SVC'],scores_pca['MLP']]

# The three lists below take 4-character slices of each report at fixed offsets.
# NOTE(review): presumably the offsets land on the precision / recall / f1-score columns
# of a summary row in the classification report text - verify; they depend on the exact
# report layout and will silently pick the wrong characters if it changes.
precision_y = [report_pca['LogisticRegression'][2335:2339],report_pca['ExtraTreesClassifier'][2335:2339],\
      report_pca['RandomForestClassifier'][2335:2339],report_pca['SVC'][2335:2339],\
      report_pca['MLP'][2335:2339]]

recall_y = [report_pca['LogisticRegression'][2345:2345+4],report_pca['ExtraTreesClassifier'][2345:2345+4],\
      report_pca['RandomForestClassifier'][2345:2345+4],report_pca['SVC'][2345:2345+4],\
      report_pca['MLP'][2345:2345+4]]

fscore_y = [report_pca['LogisticRegression'][2355:2355+4],report_pca['ExtraTreesClassifier'][2355:2355+4],\
      report_pca['RandomForestClassifier'][2355:2355+4],report_pca['SVC'][2355:2355+4],\
      report_pca['MLP'][2355:2355+4]]

accuracy = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','SVC','MLP'],
    y=acc_y,
    text= [ '%.2f' % elem for elem in acc_y ],
    textposition = 'outside',
    name='Accuracy'
)
precision = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','SVC','MLP'],
    y=precision_y,
    text= [ '%.2f' % float(elem) for elem in precision_y ],
    textposition = 'outside',
    name='Precision'
)
recall = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','SVC','MLP'],
    y=recall_y,
    text= [ '%.2f' % float(elem) for elem in recall_y ],
    textposition = 'outside',
    name='Recall'
)
# Fix: the bar heights previously used recall_y, so the 'F-1 Score' bars duplicated the
# recall bars while their labels showed the F1 values; heights now come from fscore_y.
f1score = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','SVC','MLP'],
    y=fscore_y,
    text= [ '%.2f' % float(elem) for elem in fscore_y ],
    textposition = 'outside',
    name='F-1 Score'
)
data = [accuracy, precision, recall,f1score]
layout = go.Layout(
    barmode='group',
    title="Evaluation Metric Comparison of Various Models"
)

bar_comp2 = go.Figure(data=data, layout=layout)
py.iplot(bar_comp2)

In [None]:
## 7 estimators
# NOTE(review): this cell looks unfinished. Only the accuracy bar has one value per model;
# the other three bars supply 3 y-values for 7 x-labels, and 12 / 18 / 29 are literal
# placeholders rather than values taken from the results.
accuracy = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','AdaBoost','GradientBoost','SVC','MLP'],
    y=[scores_pca['LogisticRegression'], scores_pca['ExtraTreesClassifier'], scores_pca['RandomForestClassifier'],\
       scores_pca['AdaBoostClassifier'],scores_pca['GradientBoostingClassifier'],scores_pca['SVC'],scores_pca['MLP']],
    name='Accuracy'
)
# NOTE(review): report_pca[...] is indexed here like a nested dict
# (['micro avg']['precision']), whereas other cells slice it by character position and
# concatenate it with "\n", i.e. treat it as a string. Both usages cannot work on the
# same object - confirm what model_fit_predict returns.
precision = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','AdaBoost','GradientBoost','SVC','MLP'],
    y=[report_pca['LogisticRegression']['micro avg']['precision'], 18, 29],
    name='Precision'
)
recall = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','AdaBoost','GradientBoost','SVC','MLP'],
    y=[12, 18, 29],
    name='Recall'
)
f1 = go.Bar(
    x=['LogisticRegression', 'ExtraTrees', 'RandomForest','AdaBoost','GradientBoost','SVC','MLP'],
    y=[12, 18, 29],
    name='F1-Score'
)
data = [accuracy, precision, recall,f1]
layout = go.Layout(
    barmode='group',
    title="Evaluation Metric Comparison of Various Models"
)

bar_comp = go.Figure(data=data, layout=layout)
py.iplot(bar_comp)

In [None]:
# Print each model's classification report under a header line naming the model.
for model_name in report_pca:
    report_text = report_pca[model_name]
    print("Classification Report: " + model_name)
    print(report_text + "\n")

# Data Visualization
## PCA 3D Scatter Plot

In [None]:
# PCA 3-component scatter plot
# Define Graph layout
# All four margins (left, right, bottom, top) are set to 0. This rebinds the notebook-level
# name `layout`, which the scatter-plot cells further down also pass to go.Figure.
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
        )
)
# pcaSelection and scatter3D are helpers defined earlier in the notebook.
# Only the training-set result is plotted; X_test_pca3 is not used in this cell.
X_train_pca3, X_test_pca3 = pcaSelection(X_train, X_test, 3)
pca3_traces = scatter3D(X_train_pca3)
fig1 = go.Figure(data=pca3_traces, layout=layout)
py.iplot(fig1, filename='PCA_3D_Scatter')

## t-SNE 3D Scatter Plot 

In [None]:
# tSNE 3-component scatter plot (PCA->tSNE)
# Input is X_train_pca, the PCA-reduced training set from the model-fitting cell.
# NOTE(review): the meaning of the trailing argument 1 is not visible here - check tsneSelection.
# `layout` is whatever the most recently executed cell bound to that name (the zero-margin
# layout of the PCA 3D cell when the notebook is run top to bottom).
X_train_tsne3 = tsneSelection(X_train_pca,3,1)
tsne3_traces = scatter3D(X_train_tsne3)
fig3 = go.Figure(data=tsne3_traces, layout=layout)
py.iplot(fig3, filename='tSNE_3D_Scatter')

## PCA 2D Scatter Plot

In [None]:
# PCA 2-component scatter plot
# Only the training-set result is plotted; X_test_pca2 is not used in this cell.
# `layout` is not defined in this cell: it is whatever the most recently executed cell bound
# to that name (the zero-margin layout of the PCA 3D cell when run top to bottom).
X_train_pca2, X_test_pca2 = pcaSelection(X_train, X_test, 2)
pca2_traces = scatter2D(X_train_pca2)
fig2 = go.Figure(data=pca2_traces, layout=layout)
py.iplot(fig2, filename='PCA_2D_Scatter')

## t-SNE 2D Scatter Plot

In [None]:
# tSNE 2-component scatter plot (PCA->tSNE)
# Input is X_train_pca, the PCA-reduced training set from the model-fitting cell.
# NOTE(review): the meaning of the trailing argument 1 is not visible here - check tsneSelection.
# `layout` is not defined in this cell: it is whatever the most recently executed cell bound
# to that name.
X_train_tsne2 = tsneSelection(X_train_pca,2,1)
tsne2_traces = scatter2D(X_train_tsne2)
fig4 = go.Figure(data=tsne2_traces, layout=layout)
py.iplot(fig4, filename='tSNE_2D_Scatter')