In [2]:

import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# fix random seed for reproducibility
np.random.seed(42)

Using TensorFlow backend.


In [3]:
%%time
# Load training set
X_glio = pd.read_csv("data/glioblastomaExpression.csv",header = 0,index_col=0)
Y_glio = pd.read_csv("data/glioblastomaLabels.csv",header = 0,index_col=0)

CPU times: user 4.13 s, sys: 6.8 s, total: 10.9 s
Wall time: 10.9 s


In [6]:
# Convert rna prep method into numerical values for training
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y_glio["tr_method_value"] = pd.Series(encoder.fit_transform(Y_glio["tr_method"]), index=Y_glio.index)

In [7]:
y_glio = np.array((Y_glio["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0

# Predict with Logistic Regression
- Fit Logistic Regression model on glioma samples
- predict with sklearn using ALL labels and features (281 samples)
- see results<br>



In [8]:
Y_glio.shape

(233, 3)

In [9]:
X_glio.shape

(58581, 233)

In [10]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_glio.T, y_glio)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Load test set
X_test = pd.read_csv("data/ALLeukemiaExpression.csv",header=0,index_col=0)
Y_test = pd.read_csv("data/ALLeukemiaLabels.csv",header=0,index_col=0)

# Accuracy of Logistic Regression
- RiboD = 1, PolyA = 0
- present total accuracy over new test data (single disease=ALL)
- present riboD accuracy
- present polyA accuracy

### Logistic Regression is 5.6-85% accurate in general with test set

In [16]:
highestVarianceScore = -10000
higestAccuracy = 0
for a in range(10):
    print('new prediction on ALL:', a+1)
    # Randomize test set
    X_testRand = X_test.sample(frac=1)
    Y_testRandLong = Y_test.T[['tr_method','disease']].reindex(X_testRand.T.index)
    y_testRand = np.array((Y_testRandLong["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0
    Y_testRandWide = Y_testRandLong.T
    # Predict riboD or polyA from test set
    log_reg_predict = log_reg.predict(X_testRand.T)

#     # The coefficients
#     print('Coefficients: \n', log_reg.coef_, '\nlength of Coefficients: \n',len(log_reg.coef_[0]))
#     # The mean squared error
#     print("Mean squared error: %.2f"
#           % mean_squared_error(y_testRand, log_reg_predict))
#     # Explained variance score: 1 is perfect prediction
#     print('Variance score (1 is perfect prediction): %.2f' % r2_score(y_testRand, log_reg_predict))

    truthTest = []
    for i in range(len(log_reg_predict)):
        truthTest.append(log_reg_predict[i]==y_testRand[i])
#     print('\n\nAccuracy:',truthTest.count(True)/len(truthTest))
    truthTable = pd.DataFrame(list(zip(Y_testRandWide,y_testRand,log_reg_predict,truthTest)),\
                 columns=['th_sampleid','truth','prediction','predictionIsCorrect'])
#     print('PolyA correct:',list(truthTable[truthTable['truth']==0]['predictionIsCorrect']).count(True)/len(truthTable[truthTable['truth']==0]))   
#     print('RiboD correct:',list(truthTable[truthTable['truth']==1]['predictionIsCorrect']).count(True)/len(truthTable[truthTable['truth']==1]))
#     print('\n\n')
    if r2_score(y_testRand, log_reg_predict) > highestVarianceScore:
        highestVarianceScore = r2_score(y_testRand, log_reg_predict)
        highestAccuracy = truthTest.count(True)/len(truthTest)
print("highestVarianceScore:",highestVarianceScore)
print("highest Accuracy: ", highestAccuracy)
truthTest = []
for i in range(len(log_reg_predict)):
    truthTest.append(log_reg_predict[i]==y_testRand[i])
print('\n\nAccuracy:',truthTest.count(True)/len(truthTest))
truthTable = pd.DataFrame(list(zip(Y_testRandWide,y_testRand,log_reg_predict,truthTest)),\
             columns=['th_sampleid','truth','prediction','correct'])
truthTable

new prediction on ALL: 1
new prediction on ALL: 2
new prediction on ALL: 3
new prediction on ALL: 4
new prediction on ALL: 5
new prediction on ALL: 6
new prediction on ALL: 7
new prediction on ALL: 8
new prediction on ALL: 9
new prediction on ALL: 10
highestVarianceScore: -0.12665094339622662
highest Accuracy:  0.9395017793594306


Accuracy: 0.05693950177935943


Unnamed: 0,th_sampleid,truth,prediction,correct
0,TH01_0125_S01,1,1,True
1,TARGET-10-PASFXA-04,0,1,False
2,TARGET-10-PANSBR-09,0,1,False
3,TARGET-10-PARBRK-04,0,1,False
4,THR08_0162_S02,0,1,False
5,THR08_0200_S01,0,1,False
6,TARGET-10-PANCVR-04,0,1,False
7,TARGET-10-PAPDUV-09,0,1,False
8,THR08_0203_S01,0,1,False
9,TARGET-10-PASCIU-03,0,1,False


## (Future) Predicted low accuracy
- Fit whole compendium labels and features without ALL
- predict with sklearn using ALL labels and features (281 samples)
- see trainingCompendium notebook