In [515]:

import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# fix random seed for reproducibility
np.random.seed(42)

In [516]:
%%time
# Load training set
X_glio = pd.read_csv("data/glioblastomaExpression.csv",header = 0,index_col=0)
Y_glio = pd.read_csv("data/glioblastomaLabels.csv",header = 0,index_col=0)

CPU times: user 4.95 s, sys: 692 ms, total: 5.64 s
Wall time: 5.64 s


In [517]:
# Convert rna prep method into numerical values for training
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y_glio["tr_method_value"] = pd.Series(encoder.fit_transform(Y_glio["tr_method"]), index=Y.index)

In [518]:
y_glio = np.array((Y_glio["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0

# Predict with Logistic Regression
- Fit Logistic Regression model on glioma samples
- predict with sklearn using ALL labels and features (281 samples)
- see results<br>



In [519]:
Y_glio.shape

(233, 3)

In [520]:
X_glio.shape

(58581, 233)

In [535]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_glio.T, y_glio)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [522]:
# Load test set
X_test = pd.read_csv("data/ALLeukemiaExpression.csv",header=0,index_col=0)
Y_test = pd.read_csv("data/ALLeukemiaLabels.csv",header=0,index_col=0)

In [527]:
# Randomize test set
X_testRand = X_test.sample(frac=1)
Y_testRandLong = Y_test.T[['tr_method','disease']].reindex(X_testRand.T.index)
y_testRand = np.array((Y_testRandLong["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0

# Predict riboD or polyA from test set
log_reg_predict = log_reg.predict(X_testRand.T)

# The coefficients
print('Coefficients: \n', log_reg.coef_, '\nlength of Coefficients: \n',len(log_reg.coef_[0]))
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_testRand, log_reg_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_testRand, log_reg_predict))



Coefficients: 
 [[ -2.49636328e-04  -6.39874176e-05   1.19150902e-04 ...,   0.00000000e+00
   -9.16648913e-05  -1.29530621e-04]] 
length of Coefficients: 
 58581
Mean squared error: 0.94
Variance score: -16.56


# Accuracy of Logistic Regression
- RiboD = 1, PolyA = 0
- present total accuracy over new test data (single disease=ALL)
- present riboD accuracy
- present polyA accuracy

### Logistic Regression is 5.6% accurate in general with test set

In [529]:
truthTest = []
for i in range(len(log_reg_predict)):
    truthTest.append(log_reg_predict[i]==y_testRand[i])
print('\n\nAccuracy:',truthTest.count(True)/len(truthTest))
truthTable = pd.DataFrame(list(zip(Y_testRandWide,y_testRand,log_reg_predict,truthTest)),\
             columns=['th_sampleid','truth','prediction','correct'])
truthTable



Accuracy: 0.05693950177935943


Unnamed: 0,th_sampleid,truth,prediction,correct
0,THR08_0182_S01,1,1,True
1,THR08_0187_S01,0,1,False
2,THR08_0161_S01,0,1,False
3,TARGET-10-PAPMVB-04,0,1,False
4,TH01_0128_S01,0,1,False
5,TARGET-10-PASCIU-03,0,1,False
6,TH01_0131_S01,0,1,False
7,TARGET-10-PARIAD-04,0,1,False
8,TARGET-10-PAPGNC-03,0,1,False
9,TARGET-10-PANSDA-09,0,1,False


### Within PolyA, 0% accuracy (len = 265)

In [525]:
list(truthTable[truthTable['truth']==0]['correct']).count(True)/len(truthTable[truthTable['truth']==0])

0.0

### Within RiboD, 100% accuracy (len=16)

In [526]:
list(truthTable[truthTable['truth']==1]['correct']).count(True)/len(truthTable[truthTable['truth']==1])

1.0

## Predicted low accuracy
- Fit whole compendium labels and features without ALL
- predict with sklearn using ALL labels and features (281 samples)
- see trainingCompendium notebook