# Import libraries

In [1]:
import pandas as pd 
import hashlib
import os 
from utils import logger
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn import datasets

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from utils import logger
import sys

# Feature Selection

In [2]:
def lassoSelection(X_train, y_train, n):
    '''
    Lasso feature selection.  Select n features. 
    '''
    #lasso feature selection
    #print (X_train)
    clf = LassoCV(max_iter=10000,tol=0.001)
    sfm = SelectFromModel(clf,threshold=0)
    sfm.fit(X_train, y_train)
    X_transform = sfm.transform(X_train)
    n_features = X_transform.shape[1]

    # 	print("n_features=",n_features)
    #print(n_features)
    while n_features > n:
        sfm.threshold += 0.01
        X_transform = sfm.transform(X_train)
        n_features = X_transform.shape[1]
        print ("n_features =",n_features)
    features = [index for index,value in enumerate(sfm.get_support()) if value == True  ]
    logger.info("selected features are {}".format(features))
    return features

In [3]:
def pcaSelection(X_train, X_test, n):
    '''
    PCA feature selection.  Select n features. 
    '''
    pca = PCA(n_components=n)
    pca.fit(X_train)
    X_train_new = pca.transform(X_train)
    X_test_new = pca.transform(X_test)
    logger.info("X_train size after PCA: {}".format(X_train_new.shape))
    logger.info("X_test size after PCA: {}".format(X_test_new.shape))
    return [X_train_new,X_test_new]

In [4]:
def tsneSelection(X_train, X_test, n):
    '''
    t-distributed Stochastic Neighbor Embedding feature selection.  Select n features. 
    '''
    tsne = TSNE(n_components=n)
    tsne.fit(X_train)
    X_train_new = tsne.transform(X_train)
    X_test_new = tsne.transform(X_test)
    logger.info("X_train size after PCA: {}".format(X_train_new.shape))
    logger.info("X_test size after PCA: {}".format(X_test_new.shape))
    return [X_train_new,X_test_new]

# Model

In [5]:
def model_fit_predict(X_train,X_test,y_train,y_test):

    # np.random.seed(2018)
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    # from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import precision_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import recall_score
    models = {
        'LogisticRegression': LogisticRegression(random_state=0, multi_class='ovr',solver='newton-cg',max_iter=10000,tol=0.001),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'RandomForestClassifier': RandomForestClassifier(),
    #   	# 'AdaBoostClassifier': AdaBoostClassifier(),
        # 'GradientBoostingClassifier': GradientBoostingClassifier(),
        'SVC': SVC(decision_function_shape='ovo',max_iter=10000,tol=0.001)
    }
    tuned_parameters = {
        'LogisticRegression':{'C': [1, 10]},
        # 'LogisticRegression':{'solver':['newton-cg','liblinear','sag'],'C': [1, 10]}
        'ExtraTreesClassifier': { 'n_estimators': [100] },
        'RandomForestClassifier': { 'n_estimators': [16, 32] },
    #   	'AdaBoostClassifier': { 'n_estimators': [16, 32] },
        # 'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.05] },
        # 'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
        'SVC': {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]}
    }
    scores= {}
    for key in models:
        print("Running",key,"...")
        clf = GridSearchCV(models[key], tuned_parameters[key], scoring=None,  refit=True, cv=10)
        clf.fit(X_train,y_train)
        y_test_predict = clf.predict(X_test)
        precision = precision_score(y_test, y_test_predict,average='micro') # tp / (tp + fp)
        accuracy = accuracy_score(y_test, y_test_predict) #subset accuracy
        f1 = f1_score(y_test, y_test_predict,average='micro') # F1 = 2 * (precision * recall) / (precision + recall)
        recall = recall_score(y_test, y_test_predict, average='macro') 
        # specificity = specificity_score(y_test, y_test_predict)
        scores[key] = [precision,accuracy,f1,recall]
    print(scores)
    return scores

# Pre-processing
### Separate X (features) and y (labels)
### Split training (70%) and testing (30%) dataset
### Standardize data -- scale features such that they are 1) zero-mean, 2) one-variance

In [None]:
data_file = "../data/miRNA.csv" # directory to miRNA_matrix.csv

df = pd.read_csv(data_file)
# print(df)
y_data = df.pop('label').values

df.pop('file_id')

columns =df.columns
#print (columns)
X_data = df.values

print ("Original dataset size:",X_data.shape[0])
print ("Total feature num:",X_data.shape[1])

# split the data to train and test set
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=0)
print("Training dataset size:",X_train.shape[0])
print("Testing dataset size:", X_test.shape[0])
# print(columns)
# print(X_train)
# print(y_train)

# standardize the data (zero-mean,uniform variance)
print ("pre-processing data...")
scaler = StandardScaler()
scaler = scaler.fit(X_train.astype(np.float64))
X_train = scaler.transform(X_train.astype(np.float64))
X_test = scaler.transform(X_test.astype(np.float64))
# print (X_train.mean(axis=0))
# print(X_train.std(axis=0))

# check the distribution of tumor and normal sampels in traing and test data set.
# logger.info("Percentage of tumor cases in training set is {}".format(sum(y_train)/len(y_train)))
# logger.info("Percentage of tumor cases in test set is {}".format(sum(y_test)/len(y_test)))


# Feature Reduction/Selection
### Lasso
### PCA
### PCA + tSNE

In [None]:
# LASSO feature selection
n = 50
feaures_columns = lassoSelection(X_train, y_train, n)
# feaures_columns = [25, 92, 119, 163, 166, 168, 181, 187, 194, 216, 240, 241, 248, \
# 253, 271, 272, 273, 282, 285, 287, 295, 305, 306, 336, 337, 339, 341, 351, 352, 488, \
# 495, 503, 511, 544, 588, 593, 641, 764, 1063, 1090, 1100, 1126, 1395, 1461, 1509, 1523, 1834, 1848, 1872]
scores_lasso = model_fit_predict(X_train[:,feaures_columns],X_test[:,feaures_columns],y_train,y_test)

In [None]:
# PCA feature reduction
n = 50
X_train_pca, X_test_pca = pcaSelection(X_train, X_test, n)
scores_pca = model_fit_predict(X_train_pca,X_test_pca,y_train,y_test)

In [10]:
# PCA+tSNE feature reduction
n1 = 50
n2 =3
X_train_pca, X_test_pca = pcaSelection(X_train, X_test, n1)
X_train_tsne, X_test_tsne = tsneSelection(X_train_pca, X_test_pca, n2)
scores_tsne = model_fit_predict(X_train_tsne,X_test_tsne,y_train,y_test)

[2018-10-19 17:20:37,095 - GDC - INFO] X_train size after PCA: (8040, 50)
[2018-10-19 17:20:37,096 - GDC - INFO] X_test size after PCA: (3446, 50)


AttributeError: 'TSNE' object has no attribute 'transform'