In [1]:

import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# fix random seed for reproducibility
np.random.seed(42)

Using TensorFlow backend.


In [3]:
%%time
# Load training set
X_glio = pd.read_csv("data/glioblastomaExpression14kgenes.csv",header = 0,index_col=0)
Y_glio = pd.read_csv("data/glioblastomaLabels.csv",header = 0,index_col=0)

CPU times: user 1.49 s, sys: 596 ms, total: 2.08 s
Wall time: 2.08 s


In [59]:
# Convert rna prep method into numerical values for training
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y_glio["tr_method_value"] = pd.Series(encoder.fit_transform(Y_glio["tr_method"]), index=Y_glio.index)

In [60]:
y_glio = np.array((Y_glio["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0

# Predict with Logistic Regression
- Fit Logistic Regression model on glioma samples
- predict with sklearn using ALL labels and features (281 samples)
- see results<br>



In [61]:
Y_glio.shape

(233, 3)

In [62]:
X_glio.shape

(14645, 233)

In [63]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_glio.T, y_glio)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
# Load test set
X_test = pd.read_csv("data/ALLeukemiaExpressionVar.csv",header=0,index_col=0)
Y_test = pd.read_csv("data/ALLeukemiaLabels.csv",header=0,index_col=0)

# Accuracy of Logistic Regression
- RiboD = 1, PolyA = 0
- present total accuracy over new test data (single disease=ALL)
- present riboD accuracy
- present polyA accuracy

### Logistic Regression is 5%-85% accurate in general with test set

In [67]:
for a in range(3):
    print('new prediction on ALL:', a+1)
    # Randomize test set
    X_testRand = X_test.sample(frac=1)
    Y_testRandLong = Y_test.T[['tr_method','disease']].reindex(X_testRand.T.index)
    y_testRand = np.array((Y_testRandLong["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0
    Y_testRandWide = Y_testRandLong.T
    # Predict riboD or polyA from test set
    log_reg_predict = log_reg.predict(X_testRand.T)

#     # The coefficients
    print('Coefficients: \n', log_reg.coef_, '\nlength of Coefficients: \n',len(log_reg.coef_[0]))
#     # The mean squared error
    print("Mean squared error: %.2f"
          % mean_squared_error(y_testRand, log_reg_predict))
#     # Explained variance score: 1 is perfect prediction
    print('Variance score (1 is perfect prediction): %.2f' % r2_score(y_testRand, log_reg_predict))

    truthTest = []
    for i in range(len(log_reg_predict)):
        truthTest.append(log_reg_predict[i]==y_testRand[i])
    print('\n\nAccuracy:',truthTest.count(True)/len(truthTest))
    truthTable = pd.DataFrame(list(zip(Y_testRandWide,y_testRand,log_reg_predict,truthTest)),\
                 columns=['th_sampleid','truth','prediction','predictionIsCorrect'])
    truthTable.groupby('predictionIsCorrect').size()
    print('PolyA correct:',list(truthTable[truthTable['truth']==0]['predictionIsCorrect']).count(True)/len(truthTable[truthTable['truth']==0]))   
    print('RiboD correct:',list(truthTable[truthTable['truth']==1]['predictionIsCorrect']).count(True)/len(truthTable[truthTable['truth']==1]))
    print('\n\n')

new prediction on ALL: 1
Coefficients: 
 [[-0.00418181 -0.00122375  0.00012804 ...  0.00028804 -0.00077866
   0.00059957]] 
length of Coefficients: 
 14645
Mean squared error: 0.94
Variance score (1 is perfect prediction): -16.50


Accuracy: 0.060498220640569395
PolyA correct: 0.0037735849056603774
RiboD correct: 1.0



new prediction on ALL: 2
Coefficients: 
 [[-0.00418181 -0.00122375  0.00012804 ...  0.00028804 -0.00077866
   0.00059957]] 
length of Coefficients: 
 14645
Mean squared error: 0.35
Variance score (1 is perfect prediction): -5.49


Accuracy: 0.6512455516014235
PolyA correct: 0.6339622641509434
RiboD correct: 0.9375



new prediction on ALL: 3
Coefficients: 
 [[-0.00418181 -0.00122375  0.00012804 ...  0.00028804 -0.00077866
   0.00059957]] 
length of Coefficients: 
 14645
Mean squared error: 0.93
Variance score (1 is perfect prediction): -16.23


Accuracy: 0.07473309608540925
PolyA correct: 0.018867924528301886
RiboD correct: 1.0





## Predicted low and high accuracy
- This is just trained on the glioblastoma samples and tested on only the ALL samples.
- Since the only thing changing between predictions is the order of ALL samples, the logistic regression model is predicting the order of the samples rather than the expression values itself. 

- Fit whole compendium labels and features without ALL
- predict with sklearn using ALL labels and features (281 samples)
- see trainingCompendium notebook