# Import libraries

In [None]:
import pandas as pd
import numpy as np
import hashlib
import os 
from utils import logger
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn import datasets

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from utils import logger
import sys

# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import axes3d

import plotly.plotly as py
import plotly.graph_objs as go
import itertools as it

# %matplotlib notebook

# Feature Selection
#### Lasso
#### PCA - Uses the correlation between dimensions to produce a minimal number of new variables (principal components) that keep the maximum amount of variation, i.e. information about how the original data is distributed.
#### tSNE - Looks at how the original data is distributed and finds the representation in fewer dimensions whose distribution best matches it.

In [None]:
def lassoSelection(X_train, y_train, n):
    '''
    Lasso feature selection.  Select at most n features.

    Fits a cross-validated Lasso (LassoCV) wrapped in SelectFromModel, then
    raises the selection threshold in steps of 0.01 until no more than n
    features remain selected.  Because the threshold moves in fixed steps,
    fewer than n features may be returned.

    Args:
        X_train: training feature matrix passed to SelectFromModel.fit
        y_train: training targets passed to SelectFromModel.fit
        n: maximum number of features to keep (must be >= 0)
    Return: list of column indices (into X_train) of the selected features
    Raises: ValueError if n is negative
    '''
    # A negative n can never be satisfied: once every feature has been
    # dropped the count stays at 0 and the loop below would never end.
    if n < 0:
        raise ValueError("n must be non-negative, got {}".format(n))

    clf = LassoCV(max_iter=10000, tol=0.001)
    sfm = SelectFromModel(clf, threshold=0)
    sfm.fit(X_train, y_train)

    # Count the selected features from the support mask instead of
    # transforming the whole matrix just to read its column count.
    n_features = int(np.count_nonzero(sfm.get_support()))
    while n_features > n:
        sfm.threshold += 0.01
        n_features = int(np.count_nonzero(sfm.get_support()))
        print("n_features =", n_features)

    features = np.flatnonzero(sfm.get_support()).tolist()
    logger.info("selected features are {}".format(features))
    return features

In [None]:
def pcaSelection(X_train, X_test, n):
    '''
    PCA feature reduction.  Project both data sets onto n principal components.

    The PCA is fitted on X_train only; the fitted projection is then applied
    to X_train and X_test alike.

    Args:
        X_train: training feature matrix used to fit the PCA
        X_test: test feature matrix, transformed with the fitted PCA
        n: number of principal components to keep
    Return: [transformed X_train, transformed X_test]
    '''
    reducer = PCA(n_components=n).fit(X_train)
    train_projected = reducer.transform(X_train)
    test_projected = reducer.transform(X_test)

    # Sum of the per-component ratios = share of variance the n components keep.
    explained = np.sum(reducer.explained_variance_ratio_)
    logger.info("X_train size after PCA: {}".format(train_projected.shape))
    logger.info("X_test size after PCA: {}".format(test_projected.shape))
    logger.info("Cumulative explained variation for {} principal components: {}".format(n, explained))
    return [train_projected, test_projected]

In [None]:
def tsneSelection(X_train, n, v):
    '''
    t-distributed Stochastic Neighbor Embedding of the training data.

    Args:
        X_train: feature matrix to embed
        n: number of embedding dimensions (TSNE n_components)
        v: verbosity level forwarded to TSNE
    Return: the embedded X_train
    '''
    embedding = TSNE(n_components=n, verbose=v).fit_transform(X_train)
    logger.info("X_train size after tSNE: {}".format(embedding.shape))
    return embedding

# Model

In [None]:
def model_fit_predict(X_train, X_test, y_train, y_test):
    '''
    Grid-search several classifiers on the training set and score each on the test set.

    Every candidate is tuned with a 10-fold GridSearchCV (default scoring,
    refit on the best parameters), then used to predict X_test.

    Args:
        X_train, y_train: training features and labels used for the grid search
        X_test, y_test: held-out features and labels used for scoring
    Return: dict mapping model name -> [precision, accuracy, f1, recall].
        Precision and F1 are micro-averaged, recall is macro-averaged.
    '''
    # np.random.seed(2018)
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    # from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import precision_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import recall_score

    # (name, estimator, parameter grid) in the order they are evaluated.
    # AdaBoostClassifier and GradientBoostingClassifier are currently disabled:
    #   'AdaBoostClassifier': { 'n_estimators': [16, 32] }
    #   'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.05] }
    #   'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] }
    # Alternative LogisticRegression grid:
    #   {'solver':['newton-cg','liblinear','sag'],'C': [1, 10]}
    candidates = [
        (
            'LogisticRegression',
            LogisticRegression(random_state=0, multi_class='ovr', solver='newton-cg',
                               max_iter=10000, tol=0.001),
            {'C': [1, 10]},
        ),
        (
            'ExtraTreesClassifier',
            ExtraTreesClassifier(),
            {'n_estimators': [100]},
        ),
        (
            'RandomForestClassifier',
            RandomForestClassifier(),
            {'n_estimators': [16, 32]},
        ),
        (
            'SVC',
            SVC(decision_function_shape='ovo', max_iter=10000, tol=0.001),
            {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        ),
    ]

    scores = {}
    for name, estimator, param_grid in candidates:
        print("Running", name, "...")
        search = GridSearchCV(estimator, param_grid, scoring=None, refit=True, cv=10)
        search.fit(X_train, y_train)
        predicted = search.predict(X_test)
        scores[name] = [
            precision_score(y_test, predicted, average='micro'),  # tp / (tp + fp)
            accuracy_score(y_test, predicted),  # subset accuracy
            f1_score(y_test, predicted, average='micro'),  # 2 * (precision * recall) / (precision + recall)
            recall_score(y_test, predicted, average='macro'),
        ]
    print(scores)
    return scores

# Scatterplot Function

In [None]:
# Scatter plot 2D & 3D
num_class = 36

# Marker outline colour for each class label, indexed by label.
COLOR_NAMES = [
    "aquamarine", "crimson", "darkseagreen", "deeppink", "wheat", "violet", "fuchsia", "turquoise",
    "ivory", "honeydew", "rosybrown", "red", "lemonchiffon", "darkorchid", "mintcream", "papayawhip",
    "beige", "darkcyan", "firebrick", "deepskyblue", "seashell", "mediumpurple", "goldenrod", "lightcoral",
    "limegreen", "cadetblue", "darkmagenta", "ghostwhite", "gainsboro", "paleturquoise", "teal", "peru",
    "maroon", "olivedrab", "springgreen", "yellowgreen",
]

# Display name for each class label, indexed by label.
# NOTE(review): there are only 35 names for num_class = 36 labels -- confirm
# which class name is missing and whether the order matches the label encoding.
CLASS_NAMES = [
    'Breast', 'Uterine Corpus', 'Head', 'Kidney Renal Clear', 'Lung Adenocarcinoma', 'Brain',
    'Thyroid', 'Prostate', 'Ovarian', 'Lung Squamous', 'Skin', 'Colon', 'Stomach', 'Bladder',
    'Liver', 'Cervical', 'Kidney Renal Papillary', 'Leukemia', 'Sarcoma', 'Esophageal',
    'Pheochromocytoma', 'Pancreatic', 'Rectum', 'Testicular', 'Wilms', 'Thymoma', 'Mesothelioma',
    'Adrenocortical', 'Uveal', 'Kidney Chromophobe', 'Uterine Carcinosarcoma', 'Lymphoid',
    'Rhabdoid', 'Cholangiocarcinoma', 'Glioblastoma',
]

# Kept for backward compatibility with code that calls next(colors) /
# next(classes).  The scatter functions below no longer consume these shared
# iterators: with 35 names and 36 labels, every call left the name iterator
# shifted by one, so every plot after the first was labelled with wrong names.
colors = it.cycle(COLOR_NAMES)
classes = it.cycle(CLASS_NAMES)


def class_display_name(label):
    '''
    Return the legend name for a class label.
    Labels without an entry in CLASS_NAMES get the generic name "Class <label>".
    '''
    if 0 <= label < len(CLASS_NAMES):
        return CLASS_NAMES[label]
    return "Class {}".format(label)


def _scatter_traces(points, labels, make_trace, axes):
    '''
    Build one marker trace per class label in range(num_class).
    Args:
        points: array whose columns are the plot coordinates
        labels: class label for each row of points
        make_trace: trace constructor (go.Scatter or go.Scatter3d)
        axes: coordinate keyword names, one per column used (e.g. "xy")
    Return: list of trace objects; a label with no rows yields an empty trace
    '''
    labels = np.asarray(labels)
    data = []
    for label in range(num_class):
        filtered_idx = np.argwhere(labels == label)[:, 0]
        coords = {axis: points[filtered_idx, col] for col, axis in enumerate(axes)}
        # NOTE(review): this colours the marker outline only (marker.line);
        # the fill colour is not set here -- confirm that is intended.
        trace = make_trace(
            mode='markers',
            marker=dict(
                size=5,
                line=dict(
                    color=COLOR_NAMES[label % len(COLOR_NAMES)],
                    width=0.1
                    ),
                opacity=0.5
                ),
            name=class_display_name(label),
            **coords
            )
        data.append(trace)
    return data


def scatter2D(X_train_2d, labels=None):
    '''
    Function to generate traces for 2D scatter plot
    Args:
        X_train_2d: 2-feature X_train of dimension [?,2]
        labels: class label per row of X_train_2d; defaults to the global y_train
    Return: list of scatter plot trace objects
    '''
    if labels is None:
        labels = y_train
    return _scatter_traces(X_train_2d, labels, go.Scatter, "xy")


def scatter3D(X_train_3d, labels=None):
    '''
    Function to generate traces for 3D scatter plot
    Args:
        X_train_3d: 3-feature X_train of dimension [?,3]
        labels: class label per row of X_train_3d; defaults to the global y_train
    Return: list of scatter plot trace objects
    '''
    if labels is None:
        labels = y_train
    return _scatter_traces(X_train_3d, labels, go.Scatter3d, "xyz")

# Pre-processing
### Separate X (features) and y (labels)
### Split training (70%) and testing (30%) dataset
### Standardize data -- scale features such that they are 1) zero-mean, 2) one-variance

In [None]:
# Input CSV: one 'label' column, one 'file_id' column, every other column is a feature.
data_file = "../data/miRNA.csv"

df = pd.read_csv(data_file)
# Take the target out of the frame and discard the identifier so that only
# feature columns are left in df.
y_data = df.pop('label').values
del df['file_id']
columns = df.columns
X_data = df.values

print("Original dataset size:", len(X_data))
print("Total feature num:", X_data.shape[1])

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=0,
)
print("Training dataset size:", len(X_train))
print("Testing dataset size:", len(X_test))

# Standardize the data (zero mean, unit variance).  The scaler is fitted on
# the training set only and then applied to both sets.
print("pre-processing data...")
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
logger.info("Mean of features after standardization: {}".format(X_train.mean(axis=0)))
logger.info("STD of features after standardization: {}".format(X_train.std(axis=0)))

# Shared plot layout with all margins set to zero.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))

# Feature Reduction/Selection
### Lasso
### PCA
### PCA + tSNE

In [None]:
# Lasso: keep at most n features, then benchmark the classifiers on those columns only.
n = 50
feaures_columns = lassoSelection(X_train, y_train, n)
# Hard-coded alternative (presumably the indices from an earlier run; verify before reuse):
# feaures_columns = [25, 92, 119, 163, 166, 168, 181, 187, 194, 216, 240, 241, 248, \
# 253, 271, 272, 273, 282, 285, 287, 295, 305, 306, 336, 337, 339, 341, 351, 352, 488, \
# 495, 503, 511, 544, 588, 593, 641, 764, 1063, 1090, 1100, 1126, 1395, 1461, 1509, 1523, 1834, 1848, 1872]
scores_lasso = model_fit_predict(
    X_train[:, feaures_columns],
    X_test[:, feaures_columns],
    y_train,
    y_test,
)

In [None]:
# PCA: project onto n principal components, then benchmark the classifiers on the projection.
n = 650
X_train_pca, X_test_pca = pcaSelection(X_train, X_test, n=n)
scores_pca = model_fit_predict(
    X_train_pca,
    X_test_pca,
    y_train,
    y_test,
)

In [None]:
# 3D scatter plot of the training set projected onto 3 principal components
X_train_pca3, X_test_pca3 = pcaSelection(X_train, X_test, n=3)
pca3_traces = scatter3D(X_train_pca3)
fig1 = go.Figure(
    data=pca3_traces,
    layout=layout,
)
py.iplot(fig1, filename='PCA_3D_Scatter')

In [None]:
# 3D scatter plot of a tSNE embedding computed on the PCA-reduced training set (PCA->tSNE)
X_train_tsne3 = tsneSelection(X_train_pca, n=3, v=1)
tsne3_traces = scatter3D(X_train_tsne3)
fig3 = go.Figure(
    data=tsne3_traces,
    layout=layout,
)
py.iplot(fig3, filename='tSNE_3D_Scatter')

In [None]:
# 2D scatter plot of the training set projected onto 2 principal components
X_train_pca2, X_test_pca2 = pcaSelection(X_train, X_test, n=2)
pca2_traces = scatter2D(X_train_pca2)
fig2 = go.Figure(
    data=pca2_traces,
    layout=layout,
)
py.iplot(fig2, filename='PCA_2D_Scatter')

In [None]:
# 2D scatter plot of a tSNE embedding computed on the PCA-reduced training set (PCA->tSNE)
X_train_tsne2 = tsneSelection(X_train_pca, n=2, v=1)
tsne2_traces = scatter2D(X_train_tsne2)
fig4 = go.Figure(
    data=tsne2_traces,
    layout=layout,
)
py.iplot(fig4, filename='tSNE_2D_Scatter')