In [32]:
import pandas as pd
import statsmodels.api as sm


# Load data

In [33]:
fn='/Users/joe/code/projects/ets/4-ets-opt-in-pub-mpra/data/saturation-kircher-2019/GRCh38_ALL.tsv'
df=pd.read_csv(fn,sep='\t')
df.head()

Unnamed: 0,Chromosome,Position,Ref,Alt,Tags,DNA,RNA,Value,P-Value,Element
0,2,60494940,C,-,32,577,1345,-0.34,0.00546,BCL11A
1,2,60494940,C,A,146,2785,6772,-0.05,0.38889,BCL11A
2,2,60494940,C,G,60,975,2436,-0.13,0.13721,BCL11A
3,2,60494940,C,T,1084,8543,16057,-0.7,0.0,BCL11A
4,2,60494941,C,A,596,9425,23430,-0.08,0.00413,BCL11A


In [35]:
# correct p values
df['P-Value']=sm.stats.multipletests(df['P-Value'],method='fdr_bh')[1]
df.head()

Unnamed: 0,Chromosome,Position,Ref,Alt,Tags,DNA,RNA,Value,P-Value,Element
0,2,60494940,C,-,32,577,1345,-0.34,0.020485,BCL11A
1,2,60494940,C,A,146,2785,6772,-0.05,0.591536,BCL11A
2,2,60494940,C,G,60,975,2436,-0.13,0.289893,BCL11A
3,2,60494940,C,T,1084,8543,16057,-0.7,0.0,BCL11A
4,2,60494941,C,A,596,9425,23430,-0.08,0.01599,BCL11A


In [37]:
df['Element'].unique()

array(['BCL11A', 'F9', 'FOXE1', 'GP1BA', 'HBB', 'HBG1', 'HNF4A', 'IRF4',
       'IRF6', 'LDLR', 'LDLR.2', 'MSMB', 'MYCrs11986220', 'MYCrs6983267',
       'PKLR-24h', 'PKLR-48h', 'RET', 'SORT1', 'SORT1-flip', 'SORT1.2',
       'TCF7L2', 'TERT-GAa', 'TERT-GBM', 'TERT-GSc', 'TERT-HEK', 'UC88',
       'ZFAND3', 'ZRSh-13', 'ZRSh-13h2'], dtype=object)

In [38]:
df['chrom']=df.Chromosome.apply(lambda s: f'chr{s}')
df['pos']=df.Position
df['ref']=df.Ref
df['alt']=df.Alt
df['effect']=df.Value
df['p-value']=df['P-Value']
df['cse']=df.apply(lambda row: '_'.join([row['chrom'],str(row['pos']),row['ref'],row['alt']]),axis=1)

In [39]:
df.columns.tolist()

['Chromosome',
 'Position',
 'Ref',
 'Alt',
 'Tags',
 'DNA',
 'RNA',
 'Value',
 'P-Value',
 'Element',
 'chrom',
 'pos',
 'ref',
 'alt',
 'effect',
 'p-value',
 'cse']

In [40]:
df.Element.unique()

array(['BCL11A', 'F9', 'FOXE1', 'GP1BA', 'HBB', 'HBG1', 'HNF4A', 'IRF4',
       'IRF6', 'LDLR', 'LDLR.2', 'MSMB', 'MYCrs11986220', 'MYCrs6983267',
       'PKLR-24h', 'PKLR-48h', 'RET', 'SORT1', 'SORT1-flip', 'SORT1.2',
       'TCF7L2', 'TERT-GAa', 'TERT-GBM', 'TERT-GSc', 'TERT-HEK', 'UC88',
       'ZFAND3', 'ZRSh-13', 'ZRSh-13h2'], dtype=object)

In [43]:
outcols=[
 'cse',
 'chrom',
 'pos',
 'ref',
 'alt',
 'effect',
 'p-value',
 'Element']
df.loc[:,outcols].to_csv(f'kircher-hg38-allens-correctPvals=True.tsv',index=None,sep='\t')
df.loc[df.Element.isin(['ZRSh-13','ZRSh-13h2']),outcols].to_csv(f'kircher-hg38-zrs-correctPvals=True.tsv',index=None,sep='\t')


In [44]:
df.head(1)

Unnamed: 0,Chromosome,Position,Ref,Alt,Tags,DNA,RNA,Value,P-Value,Element,chrom,pos,ref,alt,effect,p-value,cse
0,2,60494940,C,-,32,577,1345,-0.34,0.020485,BCL11A,chr2,60494940,C,-,-0.34,0.020485,chr2_60494940_C_-


In [46]:
noDels=(df.Alt!='-')
outcols=[
 'cse',
 'chrom',
 'pos',
 'ref',
 'alt',
 'effect',
 'p-value',
 'Element']
df.loc[(noDels),outcols].to_csv(f'kircher-hg38-allens-nodels-correctPvals=True.tsv',index=None,sep='\t')
df.loc[df.Element.isin(['ZRSh-13','ZRSh-13h2']) & (noDels),outcols].to_csv(f'kircher-hg38-zrs-nodels-correctPvals=True.tsv',index=None,sep='\t')
