### Script to process extracted LINCS signatures into a format suitable for DoRothEA etc.

In [3]:
# import packages
import pandas as pd
from glob import glob

In [5]:
# load data
## collect the path of every file in the calculon export folder
# NOTE: glob() makes no promise about the order of the returned paths
calculon_pattern = "../Transcriptomics_Data/from_calculon/*"
calculon_data = glob(calculon_pattern)
print(calculon_data)

['../Transcriptomics_Data/from_calculon/spiperone_HEPG2_6 h_10 µM.csv', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_10 µM_No_Replicates.txt', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_10 µM_Consensus.txt', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_3 µM_Consensus.txt', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_3 µM_No_Replicates.txt', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_10 µM_No_Replicates.txt', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_3 µM_Consensus.txt', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_10 µM_Consensus.txt', '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_3 µM_No_Replicates.txt']


In [66]:
## open the replicate matrix first
# glob() returns paths in no guaranteed order, so the replicate-level file is
# picked by its .csv extension rather than by assuming it sits at position 0
# (the LINCS summary files in the same folder are tab-separated .txt files).
replicate_files = [path for path in calculon_data if path.endswith(".csv")]
if len(replicate_files) != 1:
    raise ValueError(
        "expected exactly one replicate .csv file, found: " + repr(replicate_files)
    )

# the first column of the file becomes the row index
rep_mat = pd.read_csv(replicate_files[0], index_col=0)
rep_mat

Unnamed: 0_level_0,CPC004_HEPG2_6H:BRD-K55468218-001-16-2:10,CPC016_HEPG2_6H:BRD-K55468218-001-16-2:10,CVD001_HEPG2_6H:BRD-K55468218-001-16-2:10
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10007,0.224226,-0.174466,-3.415558
1001,-0.108408,-0.398970,-0.711983
10013,0.177987,1.592620,-0.914362
10038,0.376075,0.769336,-0.553934
10046,-2.353033,1.609043,-0.669830
...,...,...,...
994,-0.300577,0.409873,-1.831151
9943,0.407846,-0.307383,0.162897
9961,0.079676,0.180630,-2.365148
998,-0.718072,-0.369251,-0.206287


In [67]:
## give the three replicate columns short <time>_<dose>_<replicate> names
# the list is applied positionally, so it relies on the column order of the file
replicate_labels = ['6h_10uM_1', '6h_10uM_2', '6h_10uM_3']
rep_mat.columns = replicate_labels
rep_mat.head()

Unnamed: 0_level_0,6h_10uM_1,6h_10uM_2,6h_10uM_3
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10007,0.224226,-0.174466,-3.415558
1001,-0.108408,-0.39897,-0.711983
10013,0.177987,1.59262,-0.914362
10038,0.376075,0.769336,-0.553934
10046,-2.353033,1.609043,-0.66983


In [13]:
## now read in all other files as a loop, extract column, change col name and add to full matrix
# select the LINCS summary files by their .txt extension instead of slicing
# off position 0: glob() gives no ordering guarantee, so a positional slice
# could keep the replicate .csv and drop one of the .txt files.
data_files = [path for path in calculon_data if path.endswith(".txt")]
data_files

['../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_10 µM_No_Replicates.txt',
 '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_10 µM_Consensus.txt',
 '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_3 µM_Consensus.txt',
 '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_3 µM_No_Replicates.txt',
 '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_6 h_10 µM_No_Replicates.txt',
 '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_3 µM_Consensus.txt',
 '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_10 µM_Consensus.txt',
 '../Transcriptomics_Data/from_calculon/LINCS_all_HEPG2_24 h_3 µM_No_Replicates.txt']

In [79]:
# Build one single-column frame per LINCS file that contains spiperone.
# Each frame is indexed by the file's column labels and its only column is
# named after the condition parsed from the file name, e.g. "24h_10uM".
dfcols = []
seen_labels = set()
for f in data_files:

    # read it in; rows are looked up by compound name below
    raw = pd.read_csv(f, index_col=0, sep="\t")

    # skip files without a spiperone signature
    if "spiperone" not in raw.index:
        continue

    # subset just for spiperone *before* transposing, so only that row is
    # flipped into a column rather than the whole matrix
    mat = raw.loc[["spiperone"]].transpose()

    # part of the path after "HEPG2_", e.g. "24 h_10 µM_No_Replicates.txt"
    condition = f.split("HEPG2_")[1]

    # extract the time point ("24 h_..." -> "24")
    tp = condition.split(" h")[0]

    # extract the dose ("..._10 µM_..." -> "10")
    d = condition.split("_")[1].split(" ")[0]

    # change the col name
    colname = tp + "h_" + d + "uM"
    mat.columns = [colname]

    # Consensus and No_Replicates files of the same condition map to the same
    # label; flag a clash instead of silently producing duplicate columns
    if colname in seen_labels:
        print("Warning: more than one file produced column '" + colname + "': " + f)
    seen_labels.add(colname)

    dfcols.append(mat)

In [80]:
## bind the per-condition columns side by side, then make the row labels integers
df = pd.concat(dfcols, axis=1, sort=False)

# the labels come from file headers; cast them to int before binding to rep_mat
# (presumably so they line up with rep_mat's row index — confirm its dtype)
integer_ids = df.index.astype(int)
df.index = integer_ids

In [84]:
# put the per-condition columns and the replicate columns next to each other;
# rows are aligned on the index of the two frames
frames_to_bind = [df, rep_mat]
full_mat = pd.concat(frames_to_bind, axis=1)
full_mat

Unnamed: 0,24h_10uM,6h_10uM,6h_3uM,24h_3uM,6h_10uM_1,6h_10uM_2,6h_10uM_3
16,-5.007989,-0.679437,-0.419172,0.379737,-0.361913,3.030506,1.281102
23,2.402371,-0.364950,1.018829,0.041429,-0.883590,-0.691907,-0.592128
25,0.728516,0.636101,-0.701791,-0.837763,0.871811,1.561861,1.245187
30,-0.211073,0.380537,-0.060052,1.080040,0.449291,0.375457,0.387258
39,0.083360,-0.139425,0.764955,0.496611,0.236847,0.759870,-0.919301
...,...,...,...,...,...,...,...
200081,-1.905147,-0.263175,-0.551930,0.471927,0.846493,1.728014,0.248843
200734,0.194317,0.019805,-1.054226,0.152152,-0.592179,-0.504057,0.865518
256364,1.178730,-0.352025,-1.563118,-0.201832,-0.011850,1.381040,-0.904996
375346,-0.615308,-0.451846,0.054882,-0.229413,0.858698,-0.668128,0.350354


In [85]:
# write the combined matrix to disk as CSV
# NOTE(review): "speripone" in the file name looks like a misspelling of
# "spiperone"; left unchanged in case other code reads this exact path
output_csv = "../Transcriptomics_Data/processed_signatures/speripone_matrix.csv"
full_mat.to_csv(output_csv)