In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Set naming parameters (same as in part 1)
DATE = '08272023'
NAME = 'MRGvsM'
SELSET_FILE = './data/Rat_info_072023/MRGvsM_atp0.05_312.csv' # Replace with selected set file
G_FILE = './data/sigcom-lincs-L1000toRNAseq/Level5_pred_RNA-Seq-like-L1000/LINCS_DCIC_2021_AntibodyPert_PredictedRNAseq_ChDir_Sigs.gctx'

# Define output
OUTPUT_DIRECTORY = './data/Rat_info_072023/MRGvsM_results_08272023/'
if not os.path.exists(OUTPUT_DIRECTORY):
    print('Directory does not exist: ' + OUTPUT_DIRECTORY)

In [2]:
# Load in data and check targets
df = pd.read_pickle(OUTPUT_DIRECTORY + NAME + 'xeL1000_corrES_plt01_' + DATE + '.pkl')
sig_dict = pd.read_csv('sigcom_perttype_dict_07212023.csv', index_col=0) # Dictionary of signatures to perturbation types
df_sigs = pd.merge(sig_dict, df, on='sig_id', how='right')
df_sigs = df_sigs.drop_duplicates()


# Chemical perturbation section
chidx = df_sigs['pert_type'] == 'ChemicalPert'
chem_ids = df_sigs.loc[chidx, 'sig_id'].str.split('_', expand=True)[4]


# Import file with perturbation information for small molecules
LINCS_sm = pd.read_csv('./data/L1000toRNAseq/LINCS_small_molecules.tsv', sep='\t')
LINCS_sm_filt = LINCS_sm[LINCS_sm['target'] != '-']

# Combine the filtered small molecule pert_name dictionary with the targets 
chem_sigs = df_sigs[df_sigs['pert_type'] == 'ChemicalPert']
chem_sigs.loc[:,'pert_name'] = chem_sigs.loc[:,'sig_id'].str.split('_', expand=True)[4]

#chem_sigs = chem_sigs.drop(columns='target')
chem_wtargets = pd.merge(chem_sigs, LINCS_sm_filt[['pert_name', 'target']], left_on='pert_name', right_on='pert_name')
chem_wtargets.to_csv(OUTPUT_DIRECTORY + NAME + 'xeL1000_chemwtargets_corrES_plt01_' + DATE + '.csv')


#df_sigs = df_sigs.drop(columns=['target','target_x','target_y'])
df_sigs = pd.merge(df_sigs, chem_wtargets[['sig_id', 'pert_name', 'target']], left_on='sig_id', right_on='sig_id', how='left')
#df_sigs = df_sigs.drop(columns=['pert_name_x','pert_name_y'])

## For the shRNAPert the last item is the target
shidx = df_sigs['pert_type'] == 'shRNAPert'
df_sigs.loc[shidx,'target'] = df_sigs.loc[shidx]['sig_id'].apply(lambda x: x.split('_')[-1])


## OverexpressionPert
oidx = df_sigs['pert_type'] == 'OverexpressionPert'

# Overexpression perturbations tend to end in the target unless they end in a dosage in which case the target is one sooner
df_sigs.loc[oidx,'target'] = df_sigs.loc[oidx,'sig_id'].apply(lambda x: x.split('_')[-1] if not any(char.islower() for char in x.split('_')[-1]) else x.split('_')[-2])

## For the CRISPRPert the last item is sometimes the target
cidx = df_sigs['pert_type'] == 'CRISPRPert'
df_sigs.loc[cidx,'target'] = df_sigs.loc[cidx]['sig_id'].apply(lambda x: x.split('_')[-1])


## For the shRNAPert the last item is the target
shidx = df_sigs['pert_type'] == 'shRNAPert'
df_sigs.loc[shidx,'target'] = df_sigs.loc[shidx]['sig_id'].apply(lambda x: x.split('_')[-1])


## OverexpressionPert
oidx = df_sigs['pert_type'] == 'OverexpressionPert'

# Overexpression perturbations tend to end in the target unless they end in a dosage in which case the target is one sooner
df_sigs.loc[oidx,'target'] = df_sigs.loc[oidx,'sig_id'].apply(lambda x: x.split('_')[-1] if not any(char.islower() for char in x.split('_')[-1]) else x.split('_')[-2])

## For the CRISPRPert the last item is sometimes the target
cidx = df_sigs['pert_type'] == 'CRISPRPert'
df_sigs.loc[cidx,'target'] = df_sigs.loc[cidx]['sig_id'].apply(lambda x: x.split('_')[-1])


# Check overall result
df_sigs = df_sigs.dropna()
df_sigs.to_pickle(OUTPUT_DIRECTORY + NAME + 'xeL1000_corrES_wtargets_plt01_'  + DATE + '.pkl')

df_sorted_s = df_sigs.sort_values('pearson', ascending=False)
df_dd_s = df_sorted_s.drop_duplicates(subset='target', keep='first')
df_dd_s = df_dd_s.reset_index(drop=True)
df_dd_s.to_csv(OUTPUT_DIRECTORY + NAME + 'xeL1000_corrES_wtargets_sortedbyPearson-duplicatesdropped_plt01_' + DATE + '.csv')
print('File saved to: ' + OUTPUT_DIRECTORY + NAME + 'xeL1000_corrES_wtargets_sortedbyPearson-duplicatesdropped_plt01_' + DATE + '.csv')

In [3]:
df_dd_s

Unnamed: 0,sig_id,pert_type,pearson,spearman,ES,pert_name,target
0,CPC001_VCAP_24H_N13_SB-205607_10uM,ChemicalPert,0.631691,0.525836,0.347249,SB-205607,OPRD1
1,LKCP002_U2OS_48H_H04_cilengitide_10uM,ChemicalPert,0.599193,0.521385,0.285579,cilengitide,"ITGAV, ITGB3"
2,LJP007_MCF10A_24H_A11_XMD-885_0.12uM,ChemicalPert,0.597751,0.476661,0.363378,XMD-885,"MAPK7, LRRK2"
3,ASG003_U2OS_6H_M24_YM-155_0.12uM,ChemicalPert,0.597642,0.533652,0.288425,YM-155,BIRC5
4,CPC006_PL21_6H_A06_temozolomide_10uM,ChemicalPert,0.582085,0.543747,0.328273,temozolomide,MGMT
...,...,...,...,...,...,...,...
1096,CPD002_PC3_24H_H16_thiamine_10uM,ChemicalPert,0.008340,0.013569,0.089184,thiamine,SLC19A2
1097,PAC003_U2OS_6H_E12_2-iodomelatonin_10uM,ChemicalPert,-0.036372,0.036908,0.075901,2-iodomelatonin,"NQO2, MTNR1B, MTNR1A"
1098,PAC002_U2OS_6H_A04_phenacemide_10uM,ChemicalPert,-0.125025,-0.071103,-0.098672,phenacemide,SCN1A
1099,PAC002_U2OS_6H_G10_BAY-11-7085_10uM,ChemicalPert,-0.153183,0.005862,-0.113852,BAY-11-7085,NFKBIA


In [None]:
# Optional:
# Drop the duplicates that have the lower ES scores (optional)
df_sorted = df_sigs.sort_values('ES')
df_dd_ES = df_sorted.drop_duplicates(subset='target', keep='first')
df_dd_ES = df_dd_ES.reset_index(drop=True)
df_dd_ES.to_csv(OUTPUT_DIRECTORY + NAME + 'xeL1000_corrES_wtargets_sortedbyES-duplicatesdropped_plt01_' + DATE + '.csv')
