In [166]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time

import xgboost
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [182]:
def run_cross_val(X, y, classifiers, n_fold=5, random_state=1):
    kf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=random_state)
    result_df = pd.DataFrame()
    fold_num = 0
    print("fold", end=" ")
    for train, test in kf.split(X, y):
        fold_num += 1
        print(fold_num, end=", ")
        x_train = X.iloc[train]
        y_train = y[train]
        x_test = X.iloc[test]
        y_test = y[test]
        for classifier_name, clf in classifiers.items():
            t0 = time.time()
            clf.fit(x_train, y_train)
            prediction = clf.predict(x_test)
            if fold_num == 5:
                print("Fit and predict of classifier {0} takes {1} seconds".format(
                    classifier_name, time.time()-t0))
            accuracy = accuracy_score(prediction, y_test)
            result_df.loc[fold_num, classifier_name + "_accuracy"] = accuracy
            if len(set(y)) == 2:
                auc_score = roc_auc_score(prediction, y_test)
                result_df.loc[fold_num, classifier_name + "_auc"] = auc_score
    return result_df


def prep_data(df, label):
    X = df[df.columns[3:]]
    X.reset_index(drop=True, inplace=True)
    y_raw = df[label]
    le = LabelEncoder()
    y = le.fit_transform(y_raw)
    return X, y, le


Classifiers = {"Logistic Regression": LogisticRegression(), 
               "XGBoost": xgboost.XGBClassifier()}

### Tissue Predicter

In [184]:
for n_component in [20, 50, 200]:
    print("number of components from PCA used: ", n_component)
    df = pd.read_csv("./data/mRNA_PCA_{0}_components.csv".format(n_component), 
                     index_col="sample_id")
    X, y, le = prep_data(df, "label_tissue")
    result_df = run_cross_val(X, y, Classifiers, n_fold=5, random_state=0)
    print(result_df)

number of components from PCA used:  20
fold 1, 2, 3, 4, 5, Fit and predict of classifier Logistic Regression takes 3.400779962539673 seconds
Fit and predict of classifier XGBoost takes 4.109149217605591 seconds
   Logistic Regression_accuracy  XGBoost_accuracy
1                      0.900967          0.895440
2                      0.899815          0.890120
3                      0.899954          0.900417
4                      0.906221          0.893686
5                      0.898092          0.880875
number of components from PCA used:  50
fold 1, 2, 3, 4, 5, Fit and predict of classifier Logistic Regression takes 12.382299184799194 seconds
Fit and predict of classifier XGBoost takes 6.736157417297363 seconds
   Logistic Regression_accuracy  XGBoost_accuracy
1                      0.936435          0.923538
2                      0.936288          0.918283
3                      0.933766          0.913386
4                      0.939647          0.919220
5                      0.

### Tumor v.s. Normal  Predicter

In [179]:
for n_component in [20, 50, 200]:
    print("number of components from PCA used: ", n_component)
    df = pd.read_csv("./data/mRNA_PCA_{0}_components.csv".format(n_component), 
                     index_col="sample_id")
    X, y, le = prep_data(df, "label")
    result_df = run_cross_val(X, y, Classifiers, n_fold=5, random_state=0)
    print(result_df)

number of components from PCA used:  20
fold 1, 2, 3, 4, 5, Fit and predict of classifier Logistic Regression takes 0.270998477935791 seconds
Fit and predict of classifier XGBoost takes 0.4251375198364258 seconds
   Logistic Regression_accuracy  XGBoost_accuracy
1                      0.970370          0.975463
2                      0.968519          0.970833
3                      0.968056          0.972222
4                      0.965278          0.973148
5                      0.970820          0.977304
number of components from PCA used:  50
fold 1, 2, 3, 4, 5, Fit and predict of classifier Logistic Regression takes 1.1676404476165771 seconds
Fit and predict of classifier XGBoost takes 0.6924042701721191 seconds
   Logistic Regression_accuracy  XGBoost_accuracy
1                      0.982870          0.979630
2                      0.977315          0.977315
3                      0.975463          0.975926
4                      0.975000          0.976389
5                      

### Gender  Predicter

In [183]:
for n_component in [20, 50, 200]:
    print("number of components from PCA used: ", n_component)
    df = pd.read_csv("./data/mRNA_PCA_{0}_components.csv".format(n_component), 
                     index_col="sample_id")
    X, y, le = prep_data(df, "label_gender")
    result_df = run_cross_val(X, y, Classifiers, n_fold=5, random_state=0)
    print(result_df)

number of components from PCA used:  20
fold 1, 2, 3, 4, 5, Fit and predict of classifier Logistic Regression takes 0.08177328109741211 seconds
Fit and predict of classifier XGBoost takes 0.2570624351501465 seconds
   Logistic Regression_accuracy  Logistic Regression_auc  XGBoost_accuracy  \
1                      0.742249                 0.748444          0.737159   
2                      0.720963                 0.727201          0.721888   
3                      0.728578                 0.733513          0.742473   
4                      0.732747                 0.738929          0.723483   
5                      0.733210                 0.739775          0.743863   

   XGBoost_auc  
1     0.746131  
2     0.729845  
3     0.751429  
4     0.731008  
5     0.754895  
number of components from PCA used:  50
fold 1, 2, 3, 4, 5, Fit and predict of classifier Logistic Regression takes 0.3034799098968506 seconds
Fit and predict of classifier XGBoost takes 0.33362722396850586 seconds