<a href="https://colab.research.google.com/github/jtwang1027/cancer_omics/blob/master/2_pipeline_multi_cell_line.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

In [0]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow 2.x selected.
Found GPU at: /device:GPU:0


In [0]:
dir="/content/drive/My Drive/complete_cell_lines"
file_list=[]
for file in os.listdir(dir):
    if file.endswith(".csv"):
        file_list.append(os.path.join(dir, file))
print(file_list)

['/content/drive/My Drive/complete_cell_lines/184a1.csv', '/content/drive/My Drive/complete_cell_lines/bt20.csv', '/content/drive/My Drive/complete_cell_lines/bt474.csv', '/content/drive/My Drive/complete_cell_lines/bt549.csv', '/content/drive/My Drive/complete_cell_lines/cal148.csv', '/content/drive/My Drive/complete_cell_lines/cal51.csv', '/content/drive/My Drive/complete_cell_lines/cal851.csv', '/content/drive/My Drive/complete_cell_lines/du4475.csv', '/content/drive/My Drive/complete_cell_lines/efm192a.csv', '/content/drive/My Drive/complete_cell_lines/evsat.csv', '/content/drive/My Drive/complete_cell_lines/hbl100.csv', '/content/drive/My Drive/complete_cell_lines/hcc1187.csv', '/content/drive/My Drive/complete_cell_lines/hcc1395.csv', '/content/drive/My Drive/complete_cell_lines/hcc1419.csv', '/content/drive/My Drive/complete_cell_lines/hcc1500.csv', '/content/drive/My Drive/complete_cell_lines/hcc1569.csv', '/content/drive/My Drive/complete_cell_lines/hcc1599.csv', '/content/dri

In [0]:
#what columns are common to all data?
col=set()
for f in file_list:
  curr=pd.read_csv(f, index_col=0, nrows=10)
  curr.dropna(axis=1, inplace=True) 
  curr=curr.columns.tolist()
  curr=set(curr)
  # print(f)
  if col==set():
    col=curr # set as baseline columns
  else:
    col.intersection(curr)
    # print(f)
col= list(col)

In [0]:
col

['cleavedCas',
 'Ki.67',
 'fileID',
 'p.JNK',
 'p.SRC',
 'b.CATENIN',
 'time',
 'p.p38',
 'p.PDPK1',
 'p.BTK',
 'p.MKK3.MKK6',
 'p.SMAD23',
 'p.H3',
 'p.STAT3',
 'p.Akt.Ser473.',
 'p.STAT5',
 'IdU',
 'p.AKT.Thr308.',
 'p.MEK',
 'p.NFkB',
 'p.GSK3b',
 'p.S6K',
 'cellID',
 'p.ERK',
 'cell_line',
 'p.MKK4',
 'p.MAP2K3',
 'p.MAPKAPK2',
 'p.AMPK',
 'p.HER2',
 'p.STAT1',
 'p.PLCg2',
 'p.p53',
 'p.4EBP1',
 'p.RB',
 'p.CREB',
 'GAPDH',
 'p.S6',
 'p.FAK',
 'CyclinB',
 'p.p90RSK']

In [0]:
#which cell lines have treatment data
file_tr=[]
for f in file_list:
  curr=pd.read_csv(f, index_col=0, nrows=0).columns.tolist()
  # print(f)
  if 'p.STAT5' in curr:
    file_tr.append(f)
# file_tr

In [0]:
from sklearn.preprocessing import MinMaxScaler

In [0]:
def process(filenames, predictor= 'p.STAT5'):
  ''' input filename, downloads csv, preprocessing, minmaxscaling
  returns X and y for train/test/split ; predictor is the predictor column'''
  
  df=pd.DataFrame()
  for fi in filenames:
    temp= pd.read_csv(fi)
    df=pd.concat([df,temp],axis=0)
    
  df.dropna(axis=1, inplace=True)
  #should add in synchronization for treatment times
  
  if 'treatment' in df.columns: # will likely get dropped, not in all datasets
    encoded1=pd.get_dummies(df.treatment, drop_first=True)
    df=df.join(encoded1)
#   if 'cell_line' in df.columns: #one hot encode cell line
#     print(f'# unique lines: {df.cell_line.unique()}')
#     encoded2=pd.get_dummies(df['cell_line'], drop_first=False)
#     print(encoded2)
#     print('cell line encoded')
#     df=df.join(encoded2)
  
  df=df.select_dtypes(include=['float64'])#,'uint8'])
  
#   predictor=df.columns[-1] #make it the last one
  
  print(f'predictor column: {predictor}')
  
  X= df.drop([predictor], axis=1)
  
  #scale remaining columns
  scaler=MinMaxScaler()
  X[X.columns]=scaler.fit_transform(X)

  
  
  y= df[predictor]

  return(X,y)

In [0]:
X,y= process(file_list[:4])

predictor column: p.STAT5


In [0]:
temp1=pd.read_csv(file_list[0])
temp2=pd.read_csv(file_list[1])

In [0]:
temp=pd.concat([temp1,temp2], axis=0)
temp.columns

Index(['treatment', 'cell_line', 'time', 'cellID', 'fileID', 'b.CATENIN',
       'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67', 'p.4EBP1',
       'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK', 'p.CREB', 'p.ERK',
       'p.FAK', 'p.GSK3b', 'p.H3', 'p.HER2', 'p.JNK', 'p.MAP2K3', 'p.MAPKAPK2',
       'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB', 'p.p38', 'p.p53',
       'p.p90RSK', 'p.PDPK1', 'p.PLCg2', 'p.RB', 'p.S6', 'p.S6K', 'p.SMAD23',
       'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'],
      dtype='object')

In [0]:
temp.cell_line.unique()

array(['184A1', 'BT20'], dtype=object)

In [0]:
encoded2=pd.get_dummies(df['cell_line'], drop_first=True)

In [0]:
encoded2=pd.get_dummies(temp['cell_line'], drop_first=False)

In [0]:
temp.cell_line.unique()

array(['184A1', 'BT20'], dtype=object)

In [0]:
encoded2

Unnamed: 0,184A1,BT20
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
412576,0,1
412577,0,1
412578,0,1
412579,0,1


In [0]:
# temp.join(encoded2)
pd.merge(temp, encoded2, left_index=True, right_index=True)


Unnamed: 0,treatment,cell_line,time,cellID,fileID,b.CATENIN,cleavedCas,CyclinB,GAPDH,IdU,Ki.67,p.4EBP1,p.Akt.Ser473.,p.AKT.Thr308.,p.AMPK,p.BTK,p.CREB,p.ERK,p.FAK,p.GSK3b,p.H3,p.HER2,p.JNK,p.MAP2K3,p.MAPKAPK2,p.MEK,p.MKK3.MKK6,p.MKK4,p.NFkB,p.p38,p.p53,p.p90RSK,p.PDPK1,p.PLCg2,p.RB,p.S6,p.S6K,p.SMAD23,p.SRC,p.STAT1,p.STAT3,p.STAT5,184A1,BT20
0,EGF,184A1,0.0,1,2656,0.224784,1.584511,2.41223,1.586472,5.09198,1.59085,0.993163,0.550685,1.986163,2.103893,1.552072,0.342839,1.945108,1.974801,1.10118,2.65877,4.030961,1.727544,1.412370,0.499066,0.934936,1.178910,1.26944,1.763922,1.505088,0.798554,2.88592,0.634713,1.998328,6.12288,4.63975,0.331558,2.533238,0.776630,1.242880,0.704653,-0.577393,1,0
0,EGF,184A1,0.0,1,2656,0.224784,1.584511,2.41223,1.586472,5.09198,1.59085,0.993163,0.550685,1.986163,2.103893,1.552072,0.342839,1.945108,1.974801,1.10118,2.65877,4.030961,1.727544,1.412370,0.499066,0.934936,1.178910,1.26944,1.763922,1.505088,0.798554,2.88592,0.634713,1.998328,6.12288,4.63975,0.331558,2.533238,0.776630,1.242880,0.704653,-0.577393,0,1
0,EGF,BT20,0.0,1,173,1.473732,3.442083,1.53231,3.154426,4.25020,3.90564,3.913590,4.685157,3.520813,2.940114,2.973216,1.327159,4.246362,2.993972,4.81318,1.71397,5.097469,1.437622,1.760643,2.345361,2.128692,2.959242,4.26799,3.348944,2.470509,2.739179,4.30421,1.933920,3.557209,3.53145,4.46999,2.019960,2.416691,3.938022,3.128716,2.849716,2.469874,1,0
0,EGF,BT20,0.0,1,173,1.473732,3.442083,1.53231,3.154426,4.25020,3.90564,3.913590,4.685157,3.520813,2.940114,2.973216,1.327159,4.246362,2.993972,4.81318,1.71397,5.097469,1.437622,1.760643,2.345361,2.128692,2.959242,4.26799,3.348944,2.470509,2.739179,4.30421,1.933920,3.557209,3.53145,4.46999,2.019960,2.416691,3.938022,3.128716,2.849716,2.469874,0,1
1,EGF,184A1,0.0,1,2689,1.595264,3.411190,2.72302,4.005737,5.01509,1.66281,4.947062,3.085023,3.101794,3.716224,3.051249,1.335840,2.688881,2.978988,3.10473,3.97202,5.105626,2.814350,2.191086,2.889448,2.521620,3.098738,4.72070,3.355354,2.951474,3.142957,3.60416,3.624425,2.937221,4.46909,4.76504,2.479759,3.402019,3.276655,2.784750,3.121248,2.679670,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412576,iPKC,BT20,60.0,10943,203,0.224784,2.621299,1.20489,0.565534,6.15714,1.69387,1.262963,4.072052,1.079260,2.652408,0.543688,1.330433,3.510651,1.404442,1.41261,2.03113,4.099871,1.502687,1.161247,0.754879,3.349719,2.221470,5.11187,2.093402,1.228809,0.798554,3.79872,1.449668,2.996980,2.20794,5.19038,2.568317,2.437066,1.627965,1.072230,1.729814,2.447808,0,1
412577,iPKC,BT20,60.0,10944,203,0.224784,1.620657,2.17016,0.565534,4.25020,1.65883,2.826974,4.256552,1.737076,3.681191,2.286686,0.886092,4.431916,2.462415,3.61944,1.03591,4.837756,0.791972,0.364099,3.555782,4.056356,1.888923,5.04185,2.787857,3.746544,2.093396,4.25233,0.516309,2.144964,2.56546,5.83472,2.428914,2.028794,2.665232,3.332157,2.142585,3.319022,0,1
412578,iPKC,BT20,60.0,10945,203,0.224784,1.814268,1.40306,1.653685,5.24270,1.59085,1.049438,3.200674,0.895138,2.980461,1.929770,2.191442,4.277874,2.606349,2.36089,2.75427,4.511062,1.422224,1.931830,2.528625,3.471817,1.717419,4.97150,3.077000,2.081375,2.449693,4.37475,1.602473,2.657207,2.64744,5.86601,2.957850,1.859629,2.000075,3.596318,1.050310,2.818219,0,1
412579,iPKC,BT20,60.0,10946,203,0.450334,1.828815,1.42798,1.938115,4.92441,2.22142,2.952410,4.310807,1.862176,3.833275,0.841376,0.646114,4.676619,3.093699,3.82837,3.01109,5.881139,1.933004,0.364099,2.867965,4.143543,1.883298,5.44465,3.030463,3.474432,1.538356,4.60995,1.602103,3.075064,2.43223,6.58014,2.227953,1.988299,2.725758,3.896554,1.885875,2.761818,0,1


In [0]:
encoded2.dtypes

184A1    uint8
BT20     uint8
dtype: object

In [0]:
temp.cell_line.unique()

array(['184A1', 'BT20'], dtype=object)

In [0]:
X.head()

Unnamed: 0,time,b.CATENIN,cleavedCas,CyclinB,GAPDH,IdU,Ki.67,p.4EBP1,p.Akt.Ser473.,p.AKT.Thr308.,p.AMPK,p.BTK,p.CREB,p.ERK,p.FAK,p.GSK3b,p.H3,p.HER2,p.JNK,p.MAP2K3,p.MAPKAPK2,p.MEK,p.MKK3.MKK6,p.MKK4,p.NFkB,p.p38,p.p53,p.p90RSK,p.PDPK1,p.PLCg2,p.RB,p.S6,p.S6K,p.SMAD23,p.SRC,p.STAT1,p.STAT3
0,0.0,0.0,0.086165,0.177054,0.192988,0.091187,0.0,0.0,0.0,0.139917,0.276021,0.194083,0.0,0.225248,0.303534,0.0,0.179983,0.371509,0.19069,0.138618,0.0,0.153991,0.090401,0.033644,0.276346,0.146509,0.0,0.26212,0.018636,0.193378,0.535957,0.283681,0.0,0.254186,0.035064,0.233791,0.008185
1,0.0,0.247093,0.315549,0.222631,0.650303,0.082858,0.011928,0.534501,0.334143,0.28299,0.494447,0.482627,0.09934715,0.336725,0.505064,0.272547,0.325628,0.498271,0.390215,0.241591,0.361818,0.382905,0.346091,0.473866,0.46083,0.347042,0.360702,0.365048,0.489208,0.372768,0.309552,0.299524,0.446373,0.365618,0.42891,0.461356,0.38691
2,0.0,0.130996,0.160565,0.174623,0.551576,0.119837,0.0,0.205114,0.196928,0.257694,0.498823,0.468817,1.000474e-07,0.227379,0.270447,0.0,0.182626,0.404544,0.265458,0.277046,0.214959,0.398163,0.194793,0.341215,0.355895,0.165638,0.140817,0.0,0.314762,0.240401,0.351938,0.253003,0.412886,0.192961,0.198114,0.402057,0.120641
3,0.0,0.125813,0.0,0.019761,0.385106,0.078605,0.0,0.384213,0.29638,0.032518,0.349477,0.275986,0.03299393,0.430958,0.387031,0.307907,0.232261,0.375387,0.350791,0.194166,0.293676,0.451711,0.350188,0.313392,0.397242,0.29968,0.0,0.396328,0.314104,0.316921,0.239866,0.532708,0.502227,0.268803,0.318338,0.455991,0.050687
4,0.0,0.31533,0.151789,0.0,0.188116,0.017979,0.444844,0.062137,0.20821,0.091852,0.521913,0.331514,0.1425027,0.382907,0.377538,0.0,0.237277,0.253561,0.189693,0.139113,0.0,0.210086,0.156386,0.274211,0.571772,0.249623,0.070745,0.292693,0.268644,0.0,0.438189,0.356578,0.0,0.166972,0.151538,0.247885,0.249143


In [0]:
from tensorflow.python.keras.layers import Dense, BatchNormalization
from tensorflow.python.keras import Sequential
import tensorflow.python.keras


In [0]:
 def build_model():
  model = Sequential([
    Dense(24, activation='relu', input_shape=[len(X.columns)]),
    Dense(12, activation='relu'),
    BatchNormalization(),
    Dense(12, activation='relu'),
    Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

model=build_model()

In [0]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25, random_state=1234)

In [19]:
# model2=build_model()
EPOCHS = 1
# batch_size= 
# with tf.device('/device:GPU:0'):
history=model.fit(X_train, y_train, epochs=EPOCHS, validation_split=0.2, verbose=2)#, callbacks=[ tfdocs.modeling.EpochDots() ])


Train on 3387751 samples, validate on 846938 samples
3387751/3387751 - 399s - loss: 0.1837 - mae: 0.3259 - mse: 0.1837 - val_loss: 0.1816 - val_mae: 0.3230 - val_mse: 0.1816


In [0]:
#test set performance (from same cell lines)
test_predictions=model.predict(X_test)
rms = sqrt(mean_squared_error(test_predictions, y_test))
rms



NameError: ignored

In [0]:
#how well does this model perform on other cell lines?
Xf,yf = process(file_list[-5:])


In [0]:
#test set performance (from different cell lines)
test_predictions=model.predict(Xf)
rms = sqrt(mean_squared_error(test_predictions, yf))
rms


