In [2]:
### This is a simple script formatting the PC9 data to data preprocessing
# Author: Yiyun
import pandas as pd
import os
import shutil

***
### Drop PC9 in 19Q3

In [None]:
### q3 and q4 input and output folder path
q3_dir = '../data/DepMap/19Q3'
q4_dir = '../data/DepMap/19Q4'

out_q3_dir = '../data/DepMap_DROP_PC9/19Q3'
out_q4_dir = '../data/DepMap_DROP_PC9/19Q4'
if not os.path.exists(out_q3_dir):
    shutil.copytree(q3_dir,out_q3_dir)
if not os.path.exists(out_q4_dir):
    shutil.copytree(q4_dir,out_q4_dir)

In [None]:
# Read 19Q3 and 19Q4 file, use column names as reference for genes 
df_ref19q3 = pd.read_csv(os.path.join(q3_dir,'Achilles_gene_effect.csv'), index_col = 0)
df_ref19q4 = pd.read_csv(os.path.join(q4_dir,'Achilles_gene_effect.csv'), index_col = 0)

In [None]:
# Delete PC9 data from gene effect file
df_ref19q3_m = df_ref19q3.drop(['ACH-000030'])
df_ref19q4_m = df_ref19q4.drop(['ACH-000030'])

In [None]:
# Save file
# df_ref19q3_m.to_csv(os.path.join(out_q3_dir,'Achilles_gene_effect.csv'), sep = ',')
# df_ref19q4_m.to_csv(os.path.join(out_q4_dir,'Achilles_gene_effect.csv'), sep = ',')

***
### Format PC9 and To data as q3 input

In [3]:
### Read files
# Need to read 19Q3 files for mapping the gene name + gene code
q3_dir = '../data/DepMap/19Q3'
q4_dir = '../data/DepMap/19Q4'
skmel28_dir = '../data/ceres_external/SKMEL28'
colo704_dir = '../data/ceres_external/COLO704'
pc9_dir = '../data/ceres_external/PC9_corrected'
to_dir = '../data/ceres_external/To'


df_ref19q3 = pd.read_csv(os.path.join(q3_dir,'Achilles_gene_effect.csv'), index_col = 0)
df_pc9 = pd.read_csv(os.path.join(pc9_dir,'BatchCorrectedPritchard.csv'), index_col = 0)
df_to = pd.read_csv(os.path.join(to_dir,'ToCellCERES.csv'), index_col = 0)
df_colo704 = pd.read_csv(os.path.join(colo704_dir,'COLO704_CERES.csv'), index_col = 0)
df_skmel28 = pd.read_csv(os.path.join(skmel28_dir,'SKMEL28_CERES.csv'), index_col = 0)

In [9]:
### Assign random cell line names and transpose the dataframe to let columns be genes
df_pc9.columns = ['ACH-000113'] # Assign a random ID for PC9
df_pc9= df_pc9.T
df_to.columns = ['ACH-000113','ACH-000067','ACH-000070','ACH-000075',\
                 'ACH-000108','ACH-000228','ACH-000233','ACH-000414']
df_to= df_to.T

In [41]:
df_colo704.columns = ['ACH-000113']
df_skmel28.columns = ['ACH-000113']
df_colo704= df_colo704.T
df_skmel28= df_skmel28.T

In [42]:
### Map 19q3 column names to PC9 and To data
# Create dictionary for gene names and id in 19Q3 file
dict_code = {}
for i in range(0, len(df_ref19q3.T)):
    name,idx = df_ref19q3.columns[i].split(' ')[0:2]
    if name not in dict_code:
        dict_code[name] = idx

# Map gene in PC9 and To data
def map_gene_names(df):
    global dict_code
    for names in df.columns:
        if names in dict_code:
            new_name = names + ' ' + dict_code[names]
            df.rename(columns = {names:new_name}, inplace = True)
        else:
            df = df.drop(columns=[names])
    
    df = df.dropna(axis = 1)
    return df

# df_pc9 = map_gene_names(df_pc9)
# df_to = map_gene_names(df_to)
df_colo704 = map_gene_names(df_colo704)
df_skmel28 = map_gene_names(df_skmel28)

In [9]:
# df_pc9.to_csv(os.path.join(pc9_dir,'gene_effect.csv'), sep = ',')
# df_to.to_csv(os.path.join(to_dir,'gene_effect.csv'), sep = ',')
df_skmel28.to_csv(os.path.join(skmel28_dir,'gene_effect.csv'), sep = ',')
df_colo704.to_csv(os.path.join(colo704_dir,'gene_effect.csv'), sep = ',')

***
### L200 standalone score - test version before getting gene names

In [None]:
### Read the l200 standalone gene and score
pc9_dir = '../data/ceres_external/PC9_corrected'
df_l200 = pd.read_csv(os.path.join(pc9_dir,'PC9AvL200_CERES.csv'))

In [None]:
### Process landmark.pseudo to possible format with no gene id
l200_gene = [i[0] for i in df_l200['landmark.pseudo'].str.split(' ()')]
df_l200['landmark.pseudo'] = l200_gene

In [32]:
### Assign gene id to landmark.pseudo
### q3 and q4 input and output folder path
q3_dir = '../data/DepMap/19Q3'
q4_dir = '../data/DepMap/19Q4'
    
# Read 19Q3 and 19Q4 file, use column names as reference for genes 
df_ref19q3 = pd.read_csv(os.path.join(q3_dir,'Achilles_gene_effect.csv'), index_col = 0)
df_ref19q4 = pd.read_csv(os.path.join(q4_dir,'Achilles_gene_effect.csv'), index_col = 0)

In [33]:
### map gene id to gene name
dict_code = {}
for i in range(0, len(df_ref19q3.T)):
    name,idx = df_ref19q3.columns[i].split(' ')[0:2]
    if name not in dict_code:
        dict_code[name] = idx

# Map gene in PC9 and To data
for i in range(0,len(df_l200)):
    name = df_l200.loc[i,'landmark.pseudo']
    if name in dict_code:
        new_name = name + ' ' + dict_code[name]
        df_l200.loc[i,'landamrk.pseudo'] = new_name
    else:
        print(name)
        df_l200.drop(i)

NPIPB5
HIAT1
ATP5I
NUDT4
KIAA0907
FAM21C
KIAA1279
PTPLB


In [39]:
### Change gene names in 19q3 and 14(dropped pc9) -- read data
out_q3_dir = '../data/DepMap_DROP_PC9/19Q3'
out_q4_dir = '../data/DepMap_DROP_PC9/19Q4'
df_ref19q3_pc9 = pd.read_csv(os.path.join(out_q3_dir,'Achilles_gene_effect.csv'), index_col = 0)
df_ref19q4_pc9 = pd.read_csv(os.path.join(out_q4_dir,'Achilles_gene_effect.csv'), index_col = 0)

In [41]:
### Change gene names in 19q3 and 14(dropped pc9) -- read data
df_l200

Unnamed: 0,PC9.1,PC9.2,mean,gene.type,scaled.score,landmark.pseudo,landamrk.pseudo
0,0.038043,0.026153,0.032098,cond.ess,-0.370860,MARS2,MARS2 (92935)
1,0.381959,0.233241,0.307600,cond.ess,-0.237248,NRAS,NRAS (4893)
2,-1.411134,-1.353202,-1.382168,essntl,-1.056740,SDHAF1,SDHAF1 (644096)
3,-1.950500,-1.546198,-1.748349,essntl,-1.234328,IRF9,IRF9 (10379)
4,0.541585,0.308363,0.424974,cond.ess,-0.180325,ING5,ING5 (84289)
...,...,...,...,...,...,...,...
195,0.333017,0.231041,0.282029,cond.ess,-0.249650,AHCYL1,AHCYL1 (10768)
196,0.224058,0.264806,0.244432,cond.ess,-0.267883,GMNN,GMNN (51053)
197,-0.322791,0.135472,-0.093659,cond.ess,-0.431848,SNRPB2,SNRPB2 (6629)
198,-0.030267,-0.087209,-0.058738,cond.ess,-0.414913,UBE2D3,UBE2D3 (7323)
