In [1]:
import warnings
warnings.simplefilter(action='ignore')

import tensorflow as tf
import pandas as pd
import numpy as np
import time
import itertools
from pprint import pprint as pp
import multiprocessing

import xgboost
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import load_data
from matplotlib import pyplot as plt

from bokeh.models import Jitter
from bokeh.layouts import column
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from bokeh.models import FuncTickFormatter

%matplotlib inline
output_notebook()

In [2]:
def remove_nulls(X, y):
    """Drop every sample whose label equals 0.

    The same boolean mask is applied to both inputs, so the returned
    features and labels stay aligned row for row.
    (Presumably 0 encodes a missing label -- confirm against load_data.)

    Returns:
        Tuple ``(X_kept, y_kept)``.
    """
    keep = y != 0
    X_kept = X[keep]
    y_kept = y[keep]
    return X_kept, y_kept


def fit_one_model(X_train, y_train, X_val, y_val, label, fold_num, params):
    """Fit one multinomial logistic regression and score it on the validation split.

    Args:
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        label: name of the prediction task; copied into the result row.
        fold_num: index of the cross-validation fold; copied into the result row.
        params: keyword arguments forwarded to ``LogisticRegression`` on top of
            the fixed ``solver="saga"`` and ``multi_class="multinomial"``.

    Returns:
        A new dict holding every entry of ``params`` plus ``accuracy``,
        ``precision``, ``recall``, ``f1`` (the last three weighted-averaged
        across classes), ``fold_num`` and ``label``.
    """
    clf = LogisticRegression(**params, solver="saga", multi_class="multinomial")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # Build the result row on a copy: writing the metrics into `params` itself
    # would make a second call with the same dict pass `accuracy=...` etc. to
    # the LogisticRegression constructor and fail.
    row = dict(params)
    row["accuracy"] = accuracy_score(y_val, y_pred)
    row["precision"] = precision_score(y_val, y_pred, average="weighted")
    row["recall"] = recall_score(y_val, y_pred, average="weighted")
    row["f1"] = f1_score(y_val, y_pred, average="weighted")
    row["fold_num"] = fold_num
    row["label"] = label
    return row


def parameter_search_cv(data, random_state=0):
    """Grid-search logistic-regression hyper-parameters over every CV fold.

    For each ``(train, val)`` pair in ``data.cv`` and each of the labels
    ``gender``, ``tumor`` and ``tissue``, samples whose label is 0 are dropped
    and all 20 combinations of ``class_weight`` x ``penalty`` x ``C`` are
    fitted in parallel through ``fit_one_model``.

    Args:
        data: object exposing ``cv``, an iterable of ``(train, val)`` pairs
            whose members provide ``X`` and ``y_STL[label]``.
        random_state: accepted for interface compatibility; not used by the
            search itself.

    Returns:
        ``pandas.DataFrame`` with one row per (fold, label, parameter
        combination). A ``class_weight`` of ``None`` is reported as the string
        ``"unbalanced"``.
    """
    # The grid does not depend on the fold or the label, so build it once.
    param_names = ["class_weight", "penalty", "C"]
    param_grid = list(itertools.product(["balanced", None], ["l1", "l2"], [0.01, 0.1, 1, 10, 100]))

    rows = []
    for fold_num, (train, val) in enumerate(data.cv):
        print(fold_num, end=",")
        t0 = time.time()
        for label in ["gender", "tumor", "tissue"]:
            print(label, end=",")
            X_train, y_train = remove_nulls(train.X, train.y_STL[label])
            X_val, y_val = remove_nulls(val.X, val.y_STL[label])
            assert((X_train.shape[0] == y_train.shape[0]) and (X_val.shape[0] == y_val.shape[0]))
            # Each task gets its own fresh parameter dict.
            args_list = [[X_train, y_train, X_val, y_val, label, fold_num, dict(zip(param_names, combo))]
                         for combo in param_grid]
            # The context manager terminates the workers even if a fit raises,
            # so a failed run does not leave 16 orphaned processes behind.
            with multiprocessing.Pool(16) as p:
                rows += p.starmap(fit_one_model, args_list)
        print("it takes {0} seconds to run fold {1}".format(time.time()-t0, fold_num))

    result_df = pd.DataFrame(rows)
    # Relabel only the missing class_weight (None). A frame-wide replace would
    # also write the string into any metric cell that happened to be NaN.
    if "class_weight" in result_df.columns:
        result_df["class_weight"] = result_df["class_weight"].fillna("unbalanced")
    return result_df


def sample_data(X, y, frac=0.01, random_state=None, replace=True):
    """Draw a random subset of aligned rows from ``X`` and ``y``.

    Args:
        X: array indexable by an integer index array; ``X.shape[0]`` is the
            number of rows.
        y: labels indexable by the same integer index array.
        frac: fraction of rows to draw; the sample size is
            ``int(X.shape[0] * frac)``.
        random_state: seed for a private ``numpy.random.RandomState``. ``None``
            (default) draws from NumPy's global random state, as before.
        replace: sample with replacement (default, the historical behaviour,
            which may return the same row more than once). Pass ``False`` for
            a duplicate-free subset.

    Returns:
        Tuple ``(X_sample, y_sample)`` selected with the same indices.
    """
    n_rows = int(X.shape[0])
    n_samples = int(n_rows * frac)
    # The module-level `np.random` exposes the same `choice` API as a
    # RandomState instance, so both paths share one call.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sample_idx = rng.choice(n_rows, size=n_samples, replace=replace)
    return X[sample_idx], y[sample_idx]

### Predictions

In [None]:
# Run the grid search once per PCA-reduced dataset and write each result
# table to its own CSV, tagged with the variance level it came from.
for var in [0.2, 0.6, 0.7, 0.8, 0.9]:
    print("PCA variance:", var)
    data = load_data.read_data_sets(
        "./data/mRNA_PCA_{0}_variance_StandardScaled.csv".format(var),
        random_state=0,
    )
    # The PCA column lets the per-dataset files be concatenated later
    # without losing track of which dataset a row belongs to.
    result_df = parameter_search_cv(data, random_state=0).assign(PCA=var)
    result_df.to_csv("./results/LR/PCA_{0}_LR_gridsearch.csv".format(var), index=None)

In [4]:
# Read the per-dataset grid-search tables back and stack them into one frame.
filename = "./results/LR/PCA_{0}_LR_gridsearch.csv"
result_paths = [filename.format(i) for i in [0.2, 0.6, 0.7, 0.8, 0.9]]
result_df = pd.concat([pd.read_csv(path) for path in result_paths])

### Compare between different datasets

In [5]:
# Mean and fold-to-fold std of accuracy / F1 for every hyper-parameter setting,
# reduced to the per-(PCA, label) maximum. The maximum is taken column by
# column, so the four numbers in a row need not come from the same setting.
# `groupby(level=...).max()` replaces `.max(level=...)`, which was deprecated
# in pandas 1.3 and removed in 2.0.
(
    result_df
    .groupby(["PCA", "label", "class_weight", "penalty", "C"])[["accuracy", "f1"]]
    .agg(["mean", "std"])
    .groupby(level=[0, 1])
    .max()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,accuracy,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
PCA,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0.2,gender,0.530736,0.015205,0.527364,0.042556
0.2,tissue,0.308936,0.009691,0.256395,0.011142
0.2,tumor,0.876723,0.1087,0.821914,0.156832
0.6,gender,0.706645,0.006428,0.705414,0.006793
0.6,tissue,0.914166,0.008475,0.909099,0.007301
0.6,tumor,0.966746,0.062128,0.964138,0.042883
0.7,gender,0.738418,0.006582,0.737729,0.006666
0.7,tissue,0.936564,0.010086,0.933395,0.008714
0.7,tumor,0.971481,0.048624,0.96907,0.032713
0.8,gender,0.853647,0.009089,0.853668,0.009108


In [10]:
# Palette of named colours for plotting.
colors = ["red", "olive", "goldenrod", "skyblue", "orange", "salmon"]

# Tick formatter callback: maps an integer tick position (0-3) to the label of
# the PCA dataset plotted at that position.
# It takes no arguments and reads `tick`, which is not defined in this Python
# code. The function is written in the form expected by Bokeh's
# FuncTickFormatter.from_py_func, which translates the body to JavaScript where
# `tick` holds the tick value being formatted. Calling it directly from Python
# raises NameError unless a global `tick` exists.
def x_ticker_name():
    name_dict = {0:"PCA_0.6", 1:"PCA_0.7", 2:"PCA_0.8", 3:"PCA_0.9"}
    return name_dict[tick]


# One jittered strip plot per task: every grid-search accuracy (all folds and
# hyper-parameter settings) for each PCA dataset, one dataset per x position.
pca_variances = [0.6, 0.7, 0.8, 0.9]
# Derive the x-axis labels from the same list that drives the loop so the two
# cannot drift apart: {0: "PCA_0.6", 1: "PCA_0.7", ...}.
tick_labels = {pos: "PCA_{0}".format(v) for pos, v in enumerate(pca_variances)}

for label in ["gender", "tissue", "tumor"]:
    p = figure(plot_width=600, plot_height=400, title=label, y_axis_label="accuracy")
    for i, var in enumerate(pca_variances):
        y = result_df[(result_df['PCA'] == var) & (result_df["label"]==label)]['accuracy']
        # Horizontal jitter spreads overlapping points; glyphs are drawn grey.
        p.circle(x={'value': i, 'transform': Jitter(width=0.5)}, y=y, color="grey")
    p.xaxis.ticker = list(tick_labels)
    # Static label overrides replace FuncTickFormatter.from_py_func, which is
    # deprecated, needs the optional pscript package, and was removed in Bokeh 2.0.
    p.xaxis.major_label_overrides = tick_labels
    show(p)