# Looking at disease associated SNVs wrt ETS affinity

In [None]:
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
import pickle
import pybedtools
import js
import os

#########################################################################################
# These methods can be found on my GitHub
# see https://github.com/jsolvason/js/tree/6623ca2c3ee871eca8defbac5b85fcb4036eb7a4
#    for methods used here at time of publication

import jsGenome as jsg


In [None]:
def setprint(s,n):
    """Print the leading items of an iterable, one per line.

    Iteration stops as soon as more than ``n`` items have been printed, so at
    most ``n + 1`` items are shown (and at least one, if ``s`` is non-empty).
    """
    for shown,item in enumerate(s,start=1):
        print(item)
        if shown>n:
            break

In [None]:
####################################################################################
# Download data from Wei 2010,
# http://the_brain.bwh.harvard.edu/uniprobe/detailsDef.php?id=414
# Mouse ETS1 pbm data

# Ets1
ref  ='/Users/joe/code/ref/binding_affinity/ets/parsed_Ets1_8mers.txt'
# Map each sequence (tab-separated column 1) to its affinity value (column 2).
# The file is read inside a `with` block so the handle is closed promptly, and
# each line is split only once.
with open(ref,'r') as affinityFile:
    Seq2EtsAff  = {fields[0]:float(fields[1])
                   for fields in (line.split('\t') for line in affinityFile)}


In [None]:


# Reference genomes keyed by build name. Later cells index the loaded object
# as hg2chr2seq[build][chrom][position].
hg2chr2seq={}

# Build name -> pickled chrom-to-sequence dict, loaded in this order
genomePickles={
    'hg38':'/Users/joe/code/ref/genomes/human/hg38/hg38.fa.chr2seq.pydict.pickle',
    'hg19':'/Users/joe/code/ref/genomes/human/hg19/hg19.fa.chr2seq.pydict.pickle',
}

for build,file_genome in genomePickles.items():
    hg2chr2seq[build] = jsg.pklLoadGenome(file_genome)


# Read Datasets

In [None]:
def getAll8mers(seq,p,ref,mut):
    """Yield every 8-mer overlapping position ``p``, with and without the variant.

    Parameters
    ----------
    seq : str
        Sequence containing the variant (e.g. a whole chromosome).
    p : int
        0-based index of the variant within ``seq``.
    ref, mut : str
        Single-character reference and variant alleles. The base at ``p`` is
        replaced by ``ref`` in the first element of each pair and by ``mut``
        in the second, whatever ``seq`` itself holds at that position.

    Yields
    ------
    (kmer8, kmer8mut) : tuple of str
        One pair per 8-mer window containing ``p``, starting with the window
        whose last base is ``p`` and ending with the window whose first base
        is ``p``. Windows that would extend past either end of ``seq`` are
        skipped, so fewer than eight pairs are produced near the edges.
    """
    k=8

    # Clamp the first window at 0: a negative slice start would be read by
    # Python as "count from the end" and silently return the wrong bases.
    firstStart=max(0,p-(k-1))
    # The last window may neither start after p nor run past the end of seq.
    lastStart=min(p,len(seq)-k)

    for start in range(firstStart,lastStart+1):
        offset=p-start                     # position of the variant inside this window
        window=seq[start:start+k]
        left,right=window[:offset],window[offset+1:]
        yield left+ref+right,left+mut+right


#                       0123456789012345678
print(list(getAll8mers('NNNNNNGGAANNNNNNNNN',10,'G','X')))

In [None]:
def ets(kmer4):
    """Return True when ``kmer4`` is one of the accepted ETS core 4-mers.

    Accepted cores are GGAA and GGAT together with their reverse complements
    TTCC and ATCC. The comparison is exact (case-sensitive).
    """
    return kmer4 in ('GGAA','GGAT','TTCC','ATCC')

In [None]:
# Classify a variant by whether an ETS core is present before and after the
# substitution. Keys are (before, after) flags; the booleans returned by
# ets() hash and compare equal to the 0/1 used here, so they can be used
# directly as lookup keys.
BeforeAfter2EtsLogic={
    (0,0):'Ets Never',    # no core either way
    (1,1):'Ets Always',   # core present with both alleles
    (0,1):'Ets Created',  # core only with the variant allele
    (1,0):'Ets Deleted',  # core only with the reference allele
}

In [None]:

###################################################################################
# Input files
###################################################################################

# All data must be hg38 or lifted to hg38
# gwas datasets already filtered for pval ≤ 5e-8
Data2Fn={
    'gwas_uk_pulse_rate'           :f'./0-data/0-snvs/ukbiobank-gwas-hg19/hg38/pvalues-≤5e-8/continuous-102-both_sexes-irnt-hg38-pval≤5e-08.tsv',
    'gwas_uk_qrs_duration'         :f'./0-data/0-snvs/ukbiobank-gwas-hg19/hg38/pvalues-≤5e-8/continuous-12340-both_sexes-irnt-hg38-pval≤5e-08.tsv',
    'gwas_uk_pulse_pressure'       :f'./0-data/0-snvs/ukbiobank-gwas-hg19/hg38/pvalues-≤5e-8/continuous-PP-both_sexes-combined_medadj_irnt-hg38-pval≤5e-08.tsv',
    'gwas_uk_afib'                 :f'./0-data/0-snvs/ukbiobank-gwas-hg19/hg38/pvalues-≤5e-8/icd10-I48-both_sexes-hg38-pval≤5e-08.tsv',
    'gwas_uk_myocardial_infarction':f'./0-data/0-snvs/ukbiobank-gwas-hg19/hg38/pvalues-≤5e-8/phecode-411-hg38-pval≤5e-08.tsv',
}

# Per-dataset results, all keyed by dataset name:
Data2Tuple2Info={}    # {(chrom,idx0,genome,ref,alt): set of info values}
Data2SampleSize={}    # row tallies: all / skipped / chosen / duplicate
Data2Allele2Count={}  # how often the genome base equals ref, alt, or neither

###################################################################################
# Read every dataset
###################################################################################

# When sanityCheck is True, stop reading a file once more than
# lineCount_sanityCheck rows have been seen.
sanityCheck=False
lineCount_sanityCheck=100000

for dataName,fn in Data2Fn.items():

    print(dataName)

    # Fresh containers for this dataset (aliases point at the same objects
    # that are stored in the module-level dicts)
    tuple2info  =Data2Tuple2Info[dataName]  ={}
    sampleSize  =Data2SampleSize[dataName]  ={'all':0,'skipped':0,'chosen':0,'duplicate':0}
    allele2count=Data2Allele2Count[dataName]={'all':0,'ref':0,'alt':0,'neither':0}

    lc=0
    for row in js.read_tsv(fn,pc=False,header=True):
        lc+=1
        sampleSize['all']+=1

        # Row layout: chromosome, 1-based position, ref allele, alt allele, info
        chrom,idx1,ref,alt,info = row
        idx0=int(idx1)-1
        if 'chr' not in chrom: chrom=f'chr{chrom}'
        genome='hg38'

        # Single-nucleotide substitutions only; anything else is an indel
        if not (len(ref)==1 and len(alt)==1):
            sampleSize['skipped']+=1
            continue

        # Tally whether the genome carries the ref allele, the alt allele,
        # or neither at this position
        chromSeq=hg2chr2seq[genome][chrom]
        snvNtInGenome=chromSeq[idx0]
        allele2count['all']+=1
        if   ref==snvNtInGenome: matchedAllele='ref'
        elif alt==snvNtInGenome: matchedAllele='alt'
        else:                    matchedAllele='neither'
        allele2count[matchedAllele]+=1
        if sanityCheck==True and lc>lineCount_sanityCheck: break

        # Keep the variant only if some overlapping 8-mer has an ETS core at
        # positions 2-5 both before and after the substitution ('Ets Always')
        etsChanged=False
        for kmer8,kmer8mut in getAll8mers(chromSeq,idx0,ref,alt):
            core   =kmer8[2:6]
            coremut=kmer8mut[2:6]

            ba=(ets(core),ets(coremut)) # before after logic
            etsLogic=BeforeAfter2EtsLogic[ba]

            if etsLogic=='Ets Always':
                etsChanged=True
                break

        if not etsChanged:
            continue

        # Record the variant once; repeated rows only contribute their info
        dataTuple=(chrom,idx0,genome,ref,alt)
        if dataTuple in tuple2info:
            sampleSize['duplicate']+=1
        else:
            sampleSize['chosen']+=1
            tuple2info[dataTuple]=set()
        tuple2info[dataTuple].add(info)

    print(dataName, f'{len(tuple2info):,} variants added')

In [None]:
#####################################################
# Sanity check that REF or ALT is at expected loc
#####################################################

# For each dataset, report how often the genome base at the variant position
# matched the ref allele, the alt allele, or neither.
for data,allele2count in Data2Allele2Count.items():
    print('--------'*5,'\n\n')
    print(data)
    print()

    n=allele2count['all']
    if n==0:
        print('No variants in dataset, skipping\n')
        continue

    nRef =allele2count['ref']
    nAlt =allele2count['alt']
    nNone=allele2count['neither']

    refRate,altRate,noneRate = nRef/n, nAlt/n, nNone/n

    print(f'\tRef Rate     = {js.percent(refRate, 2)}')
    print(f'\tAlt Rate     = {js.percent(altRate, 2)}')
    print(f'\tNeither Rate = {js.percent(noneRate, 2)}')
    print(f'\tTotal %      = {js.percent(refRate+altRate+noneRate, 2)}')
    print()

In [None]:
# Report, per dataset, how many rows were read and how many variants were kept.
for d,sampleSize in Data2SampleSize.items():
    nAll  =sampleSize['all']
    nSkip =sampleSize['skipped']
    nChose=sampleSize['chosen']

    print('-----------'*5)
    print(d)

    if not nAll>0:
        print('None found, skipping')
        continue

    print(f'\tN        = {nAll:,}')
    print(f'\tChosen   = {nChose:,}')
    print(f'\t% Chosen = {js.percent(nChose/nAll,2)}')

In [None]:
# Show a sample of the kept variants for every dataset.
for d,tuple2info in Data2Tuple2Info.items():
    print('-----------'*5)
    print(d)
    js.dprint(tuple2info,2)

# Analysis

In [None]:
################################################################
# Set up datatable
################################################################

# One output row is produced per (variant, 8-mer) pair in which an ETS core
# is present with both alleles.
columns=[
    'dataset',
    'chrom',
    'idx0',
    'idx1',
    'ref',
    'alt',
    'ref8mer',
    'risk8mer',
    'refAff',
    'riskAff',
    'change-in-affinity',
    'info-list',
]
c2v={c:[] for c in columns}


################################################################
# Reset the allele tally dict (not filled again in this cell)
################################################################

Data2Allele2Count={}

################################################################
# For each dataset
################################################################

for data in Data2Tuple2Info:
    print(data)

    ############################################################
    # For each snp; keys are (chrom,idx0,genome,ref,alt)
    ############################################################
    for (c,p,genome,ref,alt),infoList in Data2Tuple2Info[data].items():

        ############################################################
        # Consider each 8mer of the snp
        ############################################################
        for kmer8,kmer8mut in getAll8mers(hg2chr2seq[genome][c],p,ref,alt):
            core   =kmer8[2:6]
            coremut=kmer8mut[2:6]

            ba=(ets(core),ets(coremut)) # before after logic
            etsLogic=BeforeAfter2EtsLogic[ba]

            # 'Ets Never', 'Ets Created' and 'Ets Deleted' 8-mers are
            # deliberately not recorded; only 8-mers that keep their ETS
            # core are tabulated.
            if etsLogic!='Ets Always':
                continue

            aff=Seq2EtsAff[kmer8]
            if ('?' in kmer8mut) or ('*' in kmer8mut):
                # Variant allele is a placeholder character: no affinity to
                # look up, so False marks the missing values
                affmut=False
                delta=False
            else:
                affmut=Seq2EtsAff[kmer8mut]
                delta=affmut-aff

            record={
                'dataset'           :data,
                'chrom'             :c,
                'idx0'              :p,
                'idx1'              :p+1,
                'ref'               :ref,
                'alt'               :alt,
                'ref8mer'           :kmer8,
                'risk8mer'          :kmer8mut,
                'refAff'            :aff,
                'riskAff'           :affmut,
                'change-in-affinity':delta,
                'info-list'         :infoList,
            }
            for field,value in record.items():
                c2v[field].append(value)
    

# Create DF


In [None]:
# Assemble the collected columns into a DataFrame
df=pd.DataFrame(c2v)

def _variantKey(row):
    """Return the (chrom, 1-based position, ref, alt) tuple identifying a row's variant."""
    return (row['chrom'],row['idx1'],row['ref'],row['alt'])

df['chrom-pos1-ref-alt-tuple']=df.apply(_variantKey,axis=1)

# Generate Columns

## REF - Load reference data

### REF - Func to print % True

In [None]:
def percentTrue(l,col):
    """Print how many entries of ``l`` differ from False, as a count and a percentage.

    An entry counts as "True" when ``entry != False``; values equal to False
    (including 0) do not count. ``col`` is only used as a label in the
    printed line.
    """
    boolList=[bool(i!=False) for i in l]
    n=sum(boolList)
    p=js.percent(n / len(boolList),2)
    print(f'{n:,} True ({p}) in {col}')

percentTrue([False,1,1,1],'test')

### REF - GeneID to Name

In [None]:
# Pickled lookup tables used by the annotation steps
eqtlPicklePath    ='./0-data/0-snvs/dantonio-eqtl-hg19/hg19/table_s2_SnpChromPos2Genes.pydict.pickle'
geneNamePicklePath='/Users/joe/code/ref/genomes/human/hg19/other/gencode.v34lift37.GeneId2Name.pydict.pickle'

with open(eqtlPicklePath,'rb') as f:
    EqtlChromPos2Genes=pickle.load(f)

# GeneId2Name was previously loaded from
# './0-data/0-snvs/dantonio-eqtl-hg19/hg19/table_s2_GeneId2Name.pydict.pickle';
# the gencode-derived table is used instead.
with open(geneNamePicklePath,'rb') as f:
    GeneId2Name=pickle.load(f)

js.dprint(GeneId2Name,0)

In [None]:
# def getGeneName(geneId):
#     geneIdBasename=geneId.split('.')[0]
#     if geneIdBasename in GeneId2Name:
#         geneName=GeneId2Name[geneIdBasename]
#         # Don't give nonsense names - currently set up for hg38.ncbiRefSeq.gtf
#         if 'LOC' not in geneName:
#             return geneName
#         else:
#             return 'NoName'
#     else:
#         return 'NoName'
    
# getGeneName('ENSG00000128309.13_6'),getGeneName('ENSG000001')

def getGeneName(geneId):
    """Return the name stored for ``geneId`` in GeneId2Name, or 'NoName'.

    'NoName' is returned both when the id is missing from the table and when
    the stored name contains 'LOC' (treated as an uninformative placeholder).
    """
    if geneId not in GeneId2Name:
        return 'NoName'

    geneName=GeneId2Name[geneId]
    # Don't give nonsense names - currently set up for hg38.ncbiRefSeq.gtf
    return 'NoName' if 'LOC' in geneName else geneName

getGeneName('ENSG00000223972.5_3')

In [None]:
# Inspect the eQTL lookup loaded from the dantonio-eqtl pickle
# (js.dprint presumably prints a sample of the dict — confirm in the js module).
js.dprint(EqtlChromPos2Genes,0)

In [None]:
# Inspect the gene-id -> name table (same call as the one made right after loading it).
js.dprint(GeneId2Name,0)

## AFF

### AFF: Delta

In [None]:
# Prepare for sorting by affinity change: rows whose risk 8-mer could not be
# scored carry False as a placeholder, which is replaced by 0 here.
def _falseToZero(delta):
    """Return 0 for values equal to False, otherwise the value unchanged."""
    return delta if delta!=False else 0

df['change-in-affinity']=df['change-in-affinity'].apply(_falseToZero)

### AFF: Large Delta?

In [None]:
# Create columns holding the risk/ref affinity ratio and whether that ratio
# reaches the "dramatic increase" threshold
foldChangeThreshold=3

def _affinityFoldChange(row):
    """Return riskAff/refAff for a row, or False when either affinity is not a float."""
    refAff,riskAff=row['refAff'],row['riskAff']
    # isinstance rather than type(...)==float: depending on how pandas builds
    # the row, values may arrive as numpy.float64 (a subclass of float), which
    # an exact type comparison would reject. The False placeholder used for
    # unscored 8-mers is a bool and is still excluded.
    if isinstance(refAff,float) and isinstance(riskAff,float):
        return riskAff/refAff
    return False

df['affinity-fold-change']=df.apply(_affinityFoldChange,axis=1)
# False >= threshold evaluates to False, so unscored rows are never flagged
df['dramatic-affinity-increase']=df['affinity-fold-change'].apply(lambda foldchange: foldchange>=foldChangeThreshold)


In [None]:
# Count and share of rows flagged as a dramatic affinity increase
col='dramatic-affinity-increase'
nNotFalse=sum(df[col]!=False)
f'{nNotFalse:,}',js.percent(nNotFalse / len(df),2),f'True in {col}'

In [None]:
# Count and share of rows with a computed (non-False) fold change
col='affinity-fold-change'
nNotFalse=sum(df[col]!=False)
f'{nNotFalse:,}',js.percent(nNotFalse / len(df),2),f'True in {col}'

In [None]:
# Group the (chrom, pos) keys of hg38_ChromTss2GeneSet by chromosome so that
# lookups can be restricted to a single chromosome.
Chrom2AllChromTss={}
for chrom,pos in hg38_ChromTss2GeneSet:
    Chrom2AllChromTss.setdefault(chrom,set()).add((chrom,pos))

In [None]:
# Inspect the (chrom, TSS position) -> gene set lookup.
# NOTE(review): hg38_ChromTss2GeneSet is not defined in any cell shown in this
# notebook export — presumably loaded elsewhere; confirm before running.
js.dprint(hg38_ChromTss2GeneSet,10)

In [None]:
def get_nearbyGenes(queryChromPos,window,ChromTss2GeneSet,Chrom2AllChromTss,geneLimit=10):
    """List genes with a TSS within ``window`` bp of a query position, nearest first.

    Parameters
    ----------
    queryChromPos : (chrom, pos)
        Query chromosome name and position; ``pos`` must be convertible to int.
    window : number
        Maximum distance (bp, inclusive) between the query and a TSS; it is
        truncated to an int.
    ChromTss2GeneSet : dict
        Maps ``(chrom, tssPos)`` to a collection of gene names.
    Chrom2AllChromTss : dict
        Maps a chromosome name to the set of ``(chrom, tssPos)`` keys on it.
    geneLimit : int or numpy.inf
        Maximum number of genes returned; ``numpy.inf`` disables the cap.

    Returns
    -------
    list of (str, str) or False
        ``('<d> kb', gene)`` tuples, where ``d`` is ``query - tss`` in whole
        kilobases (truncated toward zero), ordered by increasing absolute
        distance. Each gene appears once, with the distance of its nearest
        TSS. Names starting with 'LOC' are left out. An empty list is
        returned when nothing is in range or the chromosome is unknown;
        False is returned when the query position is not a valid integer.
    """
    qc,qp=queryChromPos       # chrom and pos of query
    window=int(window)        # how far to look from tss

    try:               qp=int(qp)
    except ValueError: return False

    # Scan only the TSSs known on this chromosome instead of materialising
    # every base in the window (2*window+1 tuples per query). An unknown
    # chromosome simply has no candidates.
    candidates=[(qp-tssPos,tssPos)
                for (tssChrom,tssPos) in Chrom2AllChromTss.get(qc,())
                if tssChrom==qc and abs(qp-tssPos)<=window]

    # Nearest TSS first (ties broken by coordinate) so that a gene with
    # several TSSs is reported with its closest one, and so that the output
    # does not depend on set iteration order.
    candidates.sort(key=lambda cand: (abs(cand[0]),cand[1]))

    nearGenes=set()
    nearGenesWithDistance=[]
    for distance,tssPos in candidates:
        for gene in sorted(ChromTss2GeneSet[(qc,tssPos)]):

            # Skip conditions
            if gene[:3]=='LOC':   continue # placeholder name => currently set up for 'hg38.ncbiRefSeq.gtf.ChrTss2GeneSet.pydict.pickle'
            if gene in nearGenes: continue # already reported from a closer TSS

            nearGenes.add(gene)
            # Distance is reported in whole kb
            nearGenesWithDistance.append((f'{int(distance/1000):,} kb',gene))

    if geneLimit!=np.inf:
        nearGenesWithDistance=nearGenesWithDistance[:geneLimit]

    return nearGenesWithDistance
    
    


# Example query: up to 10 genes with a TSS within 500 kb of position 6534517 on chr12
get_nearbyGenes(('chr12',6534517),500e3,hg38_ChromTss2GeneSet,Chrom2AllChromTss,geneLimit=10)


### NEAREST GENE: Nearest genes

In [None]:
# For every variant flagged as a dramatic affinity increase, list all genes
# with a TSS within `window` bp (no cap on the number of genes). Rows that are
# not flagged, or that have no gene in range, get False.
nGenes=20       # not used in this cell
window=500e3

nearGeneDfRow=[]
x=0                                               # flagged rows processed so far
n=sum((df['dramatic-affinity-increase']==True))   # flagged rows in total
start=time.time()

tempSet=set()   # not used in this cell

for idx in df.index:
    chrom=df.at[idx,'chrom']
    dnvPos=df.at[idx,'idx0']

    # Determine if you will check this variant
    worthChecking=df.at[idx,'dramatic-affinity-increase']
    if worthChecking!=True:
        nearGeneDfRow.append(False)
        continue

    # Progress report every 100 flagged rows. The estimate is written as
    # rowsLeft * (elapsed / rowsDone) so it never divides by the elapsed time,
    # and the percentage is kept in a plain variable rather than being stored
    # as an attribute on the imported `js` module.
    x+=1
    if x%100==0:
        sec=time.time()-start
        rowsLeft=n-x
        secLeft=rowsLeft*sec/x
        minLeft=round(secLeft/60,1)
        percentDone=round(100*x/n)
        print(f'{percentDone}% ({minLeft} min left)',end=', ')

    nearGenesWithDistance=get_nearbyGenes((chrom,dnvPos),window,hg38_ChromTss2GeneSet,Chrom2AllChromTss,geneLimit=np.inf)
    # An empty list (no gene in range) is stored as False, like unflagged rows
    nearGeneDfRow.append(nearGenesWithDistance or False)

df['near-genes-500kb']=nearGeneDfRow

In [None]:
# Report how many rows received a nearby-gene list (entries other than False)
col='near-genes-500kb'
percentTrue(df[col],col)