In [26]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time

import xgboost
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [103]:
def _take_rows(data, positions):
    """Select rows of ``data`` by integer position (pandas objects or array-likes)."""
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def run_cross_val(X, y, classifiers, n_fold=5, random_state=0):
    """Score several classifiers with shuffled, stratified k-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix; rows are selected by integer position for each fold.
    y : array-like
        Class labels aligned positionally with the rows of ``X``.
    classifiers : dict
        Maps a display name to an object exposing ``fit(X, y)`` and
        ``predict(X)``. Each object is re-fitted in place on every fold.
    n_fold : int, default 5
        Number of folds passed to ``StratifiedKFold``.
    random_state : int, default 0
        Seed passed to ``StratifiedKFold`` for the shuffle.

    Returns
    -------
    pandas.DataFrame
        One row per fold (index starting at 1), one column per classifier
        name, each cell holding that fold's accuracy.

    Notes
    -----
    Prints the fold number as progress, and on the first fold only prints the
    combined fit + predict wall time of every classifier.
    """
    # Go through NumPy so fold indices are always positional: `y[idx]` on a
    # pandas Series with a non-default index would be a label lookup instead.
    y_arr = np.asarray(y)
    kf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=random_state)

    fold_scores = []
    for fold_num, (train_idx, test_idx) in enumerate(kf.split(X, y_arr), start=1):
        print(fold_num)
        x_train, x_test = _take_rows(X, train_idx), _take_rows(X, test_idx)
        y_train, y_test = y_arr[train_idx], y_arr[test_idx]

        scores = {}
        for classifier_name, clf in classifiers.items():
            # perf_counter is the clock intended for measuring elapsed time.
            t0 = time.perf_counter()
            clf.fit(x_train, y_train)
            prediction = clf.predict(x_test)
            if fold_num == 1:
                print("fit and predict of classifier {0} takes {1} seconds".format(
                    classifier_name, time.perf_counter() - t0))
            # accuracy_score's signature is (y_true, y_pred).
            scores[classifier_name] = accuracy_score(y_test, prediction)
        fold_scores.append(scores)

    # Build the table once instead of enlarging a DataFrame cell by cell.
    return pd.DataFrame(
        fold_scores,
        index=range(1, len(fold_scores) + 1),
        columns=list(classifiers),
    )


def prep_data(df, label, feature_start=3):
    """Split a table into a feature frame and integer-encoded labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    label : str
        Name of the column in ``df`` holding the raw class labels.
    feature_start : int, default 3
        Positional index of the first feature column; every column from this
        position onward is used as a feature. The default keeps the original
        behaviour of skipping the first three columns (presumably non-feature
        metadata/label columns -- confirm against the CSV layout).

    Returns
    -------
    X : pandas.DataFrame
        Feature columns with the index reset to 0..n-1.
    y : array-like
        Output of ``LabelEncoder.fit_transform`` on ``df[label]``.
    le : LabelEncoder
        The fitted encoder, returned alongside the encoded labels.
    """
    # Positional slice plus a non-mutating reset_index: avoids `inplace=True`
    # on a frame derived from `df`.
    X = df.iloc[:, feature_start:].reset_index(drop=True)
    le = LabelEncoder()
    y = le.fit_transform(df[label])
    return X, y, le

In [94]:
# Read the input table and use its "sample_id" column as the row index.
df = pd.read_csv(
    filepath_or_buffer="./data/mRNA_PCA_50_components.csv",
    index_col="sample_id",
)

### Tissue Predictor

In [99]:
# Features, encoded labels, and the fitted encoder for the "label_tissue" column.
X, y, le = prep_data(df, label="label_tissue")

In [104]:
# Models to compare; each key becomes a column name in the results table.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "XGBoost": xgboost.XGBClassifier(),
}

# Five folds with a fixed seed for the fold shuffle.
result_df = run_cross_val(
    X,
    y,
    classifiers,
    random_state=0,
    n_fold=5,
)

1
fit and predict of classifier Logistic Regression takes 11.938397884368896 seconds
fit and predict of classifier XGBoost takes 5.860821485519409 seconds
2
3
4
5


In [106]:
# Display the accuracy table returned by run_cross_val: one row per fold,
# one column per classifier.
result_df

Unnamed: 0,Logistic Regression,XGBoost
1,0.936435,0.923538
2,0.936288,0.918283
3,0.933766,0.913386
4,0.939647,0.91922
5,0.933923,0.914844
