In [30]:

import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# fix random seed for reproducibility
np.random.seed(42)

In [31]:
%%time
# Load training set
X_glio = pd.read_csv("data/glioblastomaExpression.csv",header = 0,index_col=0)
Y_glio = pd.read_csv("data/glioblastomaLabels.csv",header = 0,index_col=0)

CPU times: user 5.29 s, sys: 1.5 s, total: 6.8 s
Wall time: 6.81 s


In [32]:
# Load training set
X_hd5 = pd.read_hdf("data/gliomablastomaTrain.h5", "expression")
Y_hd5 = pd.read_hdf("data/gliomablastomaTrain.h5", "labels")

In [33]:
# Convert rna prep method into numerical values for training
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y_glio["tr_method_value"] = pd.Series(encoder.fit_transform(Y_glio["tr_method"]), index=Y_glio.index)
Y_hd5["tr_method_value"] = pd.Series(encoder.fit_transform(Y_hd5["tr_method"]), index=Y_hd5.index)

In [34]:
y_glio = np.array((Y_glio["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0
y_hd5 = np.array((Y_glio["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0

# Predict with Logistic Regression
- Use ALL labels and features (281 samples)
- predict with sklearn
- see results

In [35]:
Y_glio[Y_glio['tr_method']=='PolyA'].info()
Y_hd5[Y_glio['tr_method']=='PolyA'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, TCGA-41-2571-01 to TCGA-41-2572-01
Data columns (total 3 columns):
tr_method          192 non-null object
disease            192 non-null object
tr_method_value    192 non-null int64
dtypes: int64(1), object(2)
memory usage: 6.0+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, TCGA-41-2571-01 to TCGA-41-2572-01
Data columns (total 3 columns):
tr_method          192 non-null object
disease            192 non-null object
tr_method_value    192 non-null int64
dtypes: int64(1), object(2)
memory usage: 6.0+ KB


In [36]:
X_glio

Unnamed: 0,TCGA-41-2571-01,TCGA-28-2513-01,THR14_0319_S01,TCGA-06-0211-02,TCGA-06-2559-01,TCGA-26-5135-01,TCGA-06-0210-02,TCGA-06-0644-01,THR21_0532_S01,TCGA-06-0174-01,...,TCGA-14-0789-01,TH06_0616_S01,THR14_0301_S01,THR14_0311_S01,TCGA-27-2528-01,TCGA-06-2561-01,TCGA-32-1970-01,TCGA-12-3650-01,THR21_0540_S01,TCGA-41-2572-01
5S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,1.273512,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5_8S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7SK,0.000000,0.061245,0.000000,0.000000,0.065849,0.000000,0.063572,0.000000,0.197446,0.061245,...,0.000000,0.000000,0.000000,0.000000,0.056631,0.000000,0.000000,0.000000,0.238787,0.000000
A1BG,5.990746,5.687935,3.192194,6.235162,5.099273,4.697643,4.514772,5.543833,2.606442,5.186631,...,5.103545,4.281698,5.514122,5.242983,4.537348,3.212619,6.072972,4.897200,2.432959,6.587368
A1BG-AS1,2.604084,2.711477,0.214125,3.235750,2.657677,2.257066,2.364634,2.687107,1.443607,2.378513,...,2.978198,2.871844,4.040893,2.448901,2.639266,1.321971,4.136664,1.150645,1.232661,3.244872
A1CF,0.000000,0.000000,0.000000,0.000000,0.070525,0.000000,0.056724,0.056724,0.014355,0.000000,...,0.000000,0.014355,0.014355,0.000000,0.000000,0.000000,0.000000,0.000000,0.111031,0.000000
A2M,6.216219,9.167109,2.687061,8.791856,7.635368,8.134139,9.357698,8.443048,6.244507,7.400755,...,9.669072,6.080657,8.977251,8.718259,7.433662,8.236287,7.365567,7.311900,7.017031,8.096078
A2M-AS1,0.344950,0.687133,0.464668,0.807444,0.575411,0.782505,1.028655,0.565692,0.815575,0.344950,...,1.575342,1.097611,1.124328,1.843984,0.807444,1.589803,0.495795,0.275124,1.782409,0.687133
A2ML1,0.084203,0.678148,0.464668,0.575411,0.565692,0.575411,0.650870,0.084203,0.042644,0.310455,...,0.536153,1.495695,0.201634,0.070389,0.765625,2.813548,3.090849,0.251076,0.321928,0.704956
A2ML1-AS1,0.097748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.097611,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [37]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_hd5, y_hd5)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
# Load test set
X_test = pd.read_csv("data/ALLeukemiaExpressionVar.csv",header=0,index_col=0)
Y_test = pd.read_csv("data/ALLeukemiaLabels.csv",header=0,index_col=0)

In [39]:
# Randomize test set
# X_testRand = X_test.sample(frac=1)
# Y_testRandLong = Y_test.T.reindex(X_testRand.T.index)
# Y_testRandLong = Y_testRandLong.T.reindex(Y_test.iloc[:,0:1])
# Y_testRandLong.head()

X_testRand = X_test.sample(frac=1)
Y_testRandLong = Y_test.T[['tr_method','disease']].reindex(X_testRand.T.index)
y_testRand = np.array((Y_testRandLong["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0


y_testRand = np.array((Y_testRandLong["tr_method"]=='RiboMinus').astype(np.int)) # 1 if RiboMinus, else 0

# Predict riboD or polyA from test set
log_reg_predict = log_reg.predict(X_testRand.T)

# The coefficients
print('Coefficients: \n', log_reg.coef_, '\nlength of Coefficients: \n',len(log_reg.coef_[0]))
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_testRand, log_reg_predict))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_testRand, log_reg_predict))



Coefficients: 
 [[-0.00418181 -0.00122375  0.00012804 ...  0.00028804 -0.00077866
   0.00059957]] 
length of Coefficients: 
 14645
Mean squared error: 0.93
Variance score: -16.23


# Accuracy of Logistic Regression
- RiboD = 1, PolyA = 0
- present total accuracy over new test data (single disease=ALL)
- present riboD accuracy
- present polyA accuracy

### Logistic Regression is 74.7% accurate in general with test set

In [48]:
Y_testRandWide = Y_testRandLong.T
truthTest = []
for i in range(len(log_reg_predict)):
    truthTest.append(log_reg_predict[i]==y_testRand[i])
print('\n\nAccuracy:',truthTest.count(True)/len(truthTest))
truthTable = pd.DataFrame(list(zip(Y_testRandWide,y_testRand,log_reg_predict,truthTest)),\
             columns=['th_sampleid','truth','prediction','correct'])
truthTable



Accuracy: 0.07473309608540925


Unnamed: 0,th_sampleid,truth,prediction,correct
0,TH01_0125_S01,1,1,True
1,TARGET-10-PASFXA-04,0,1,False
2,TARGET-10-PANSBR-09,0,1,False
3,TARGET-10-PARBRK-04,0,1,False
4,THR08_0162_S02,0,1,False
5,THR08_0200_S01,0,1,False
6,TARGET-10-PANCVR-04,0,1,False
7,TARGET-10-PAPDUV-09,0,1,False
8,THR08_0203_S01,0,1,False
9,TARGET-10-PASCIU-03,0,1,False


### Within PolyA, 2.6% accuracy (len = 265)

In [46]:
list(truthTable[truthTable['truth']==0]['correct']).count(True)/len(truthTable[truthTable['truth']==0])

0.026415094339622643

### Within RiboD, 87.5% accuracy (len=16)

In [47]:
list(truthTable[truthTable['truth']==1]['correct']).count(True)/len(truthTable[truthTable['truth']==1])

0.875