Predicting possible TF-TF interactions based on homology

In [15]:
import xlrd
import itertools
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import requests


In [65]:
#!python -m wget http://www.interactome-atlas.org/data/HuRI.tsv
#!python -m wget http://humantfs.ccbr.utoronto.ca/download/v_1.01/DatabaseExtract_v_1.01.txt
!python -m wget http://www.interactome-atlas.org/data/Lit-BM.tsv


Saved under Lit-BM.tsv


In [96]:
# Function to Parse FASTA
def parse_file(file):
    with open(file, "r") as f1:
        header, seqs = [], []
        for line in f1:
            line = line.rstrip()
            if line.startswith(">"):
                if header: yield(header,''.join(seqs))
                header, seqs = line, []
            else:
                seqs.append(line)
        if header: yield(header,''.join(seqs))


In [52]:
Lambert = pd.read_csv('DatabaseExtract_v_1.01.txt', delimiter = '\t')[ ['Ensembl ID', 'HGNC symbol', 'DBD', 'Is TF?']]
Lambert = Lambert[ Lambert['Is TF?'] == 'Yes']
Lambert

Unnamed: 0,Ensembl ID,HGNC symbol,DBD,Is TF?
0,ENSG00000137203,TFAP2A,AP-2,Yes
1,ENSG00000008196,TFAP2B,AP-2,Yes
2,ENSG00000087510,TFAP2C,AP-2,Yes
3,ENSG00000008197,TFAP2D,AP-2,Yes
4,ENSG00000116819,TFAP2E,AP-2,Yes
...,...,...,...,...
2629,ENSG00000151500,THYN1,Unknown,Yes
2644,ENSG00000144747,TMF1,Unknown,Yes
2656,ENSG00000197579,TOPORS,Unknown,Yes
2698,ENSG00000102804,TSC22D1,Unknown,Yes


In [86]:
LitBM = pd.read_csv('Lit-BM.tsv', sep="\t",    header = None)
LitBM = LitBM.rename(columns = {0: 'ENSG_A', 1:'ENSG_B'})
LitBM = LitBM[ ( LitBM['ENSG_A'].isin(Lambert['Ensembl ID']) ) & ( LitBM['ENSG_B'].isin(Lambert['Ensembl ID']) ) ]
LitBM['GENE_A'] = LitBM['ENSG_A'].map(Lambert.set_index('Ensembl ID')['HGNC symbol'])
LitBM['GENE_B'] = LitBM['ENSG_B'].map(Lambert.set_index('Ensembl ID')['HGNC symbol'])

print(LitBM)

                ENSG_A           ENSG_B  GENE_A GENE_B
0      ENSG00000001167  ENSG00000066136    NFYA   NFYC
1      ENSG00000001167  ENSG00000120837    NFYA   NFYB
31     ENSG00000029153  ENSG00000134852  ARNTL2  CLOCK
79     ENSG00000066136  ENSG00000120837    NFYC   NFYB
81     ENSG00000067066  ENSG00000067066   SP100  SP100
...                ...              ...     ...    ...
13330  ENSG00000084676  ENSG00000143257   NCOA1  NR1I3
13360  ENSG00000122691  ENSG00000168610  TWIST1  STAT3
13373  ENSG00000105516  ENSG00000175197     DBP  DDIT3
13412  ENSG00000168610  ENSG00000168610   STAT3  STAT3
13414  ENSG00000132170  ENSG00000204231   PPARG   RXRB

[511 rows x 4 columns]


In [91]:
LitBM_dict = {}

# Make Dictionary
for TF in LitBM['GENE_A']:
    indexs = [ind for ind, x in enumerate(LitBM['GENE_A']) if x == TF]
    tf_2 = [LitBM['GENE_B'].to_list()[i] for i in indexs]
    LitBM_dict[TF] = tf_2

# Considering HURI all interactions
for k in LitBM_dict.keys():
    for tf_pair in LitBM_dict[k]:
        if tf_pair in LitBM_dict and k not in LitBM_dict[tf_pair]:
            LitBM_dict[tf_pair] = LitBM_dict[tf_pair] + [k]

LitBM_dict

{'NFYA': ['NFYC', 'NFYB', 'ZHX1', 'SRF'],
 'ARNTL2': ['CLOCK'],
 'NFYC': ['NFYB', 'MYC', 'ATF6', 'NFYA'],
 'SP100': ['SP100', 'ARID3A'],
 'FOSL2': ['JUN'],
 'RARB': ['RXRG', 'NCOA1'],
 'ZNF446': ['ZNF397'],
 'HIF1A': ['ARNTL', 'MAFG', 'ARNT', 'HIF1A'],
 'ZNF174': ['ZSCAN32'],
 'CREB3': ['CREB3', 'JUN'],
 'ZBTB16': ['ZBTB16', 'RARA', 'BCL6', 'ZBTB32'],
 'VDR': ['RXRG', 'RXRA', 'SMAD3', 'NCOA2', 'VDR', 'NCOA3', 'RXRB', 'NCOA1'],
 'BCL6': ['BCL6', 'BCL6B', 'ZBTB7A', 'JUN', 'ZBTB16', 'PATZ1'],
 'TFDP2': ['E2F4', 'E2F1'],
 'ATF2': ['FOS', 'SMAD4', 'ATF2', 'JUN', 'SMAD3', 'ATF3'],
 'EPAS1': ['ARNTL', 'ARNT', 'ARNT2', 'EPAS1', 'SMAD3'],
 'NFE2L2': ['MAFG', 'MAFK', 'MAFF'],
 'TCF21': ['TCF12', 'TCF4', 'TCF3'],
 'MXI1': ['MAX'],
 'MAX': ['MYC',
  'MXD3',
  'MAX',
  'MYCN',
  'MGA',
  'SMAD3',
  'MXI1',
  'MNT',
  'MXD1',
  'MXD4'],
 'NR1H2': ['RXRG', 'RXRB', 'RXRA'],
 'RARA': ['RXRG',
  'RXRB',
  'HMGA1',
  'RXRA',
  'NCOA2',
  'CLOCK',
  'NPAS2',
  'ZBTB16',
  'SRF',
  'IRX4',
  'NCOA1',
  'NC

In [97]:
tf_list = [i.rstrip() for i in Lambert['HGNC symbol'].to_list()]

# Creating dictionary { TF name: TF sequence}
header_tf = {}
for header, seq in parse_file("mart_export.txt"):
    name = header.split("|")[-1]
    for tf in tf_list:
        if tf == name:
            cond_1 = seq.startswith("M")
            cond_2 = seq[-1] == "*"
            if cond_1 and cond_2:
                if name in header_tf:
                    if len(seq) >= len(header_tf[name]):
                        header_tf[name] = seq
                else:
                    header_tf[name] = seq

# Writting all 1619 TF_name / Sequences to go CLUSTAL OMEGA to align and get Matrix Distance
with open("sequences.fasta", "w") as f1:
    for k,v in header_tf.items():
        f1.write(">" + str(k) + "\n")
        f1.write(str(v[:-1]) + "\n")


In [100]:
# After submitting fasta file to multiple alignment in CLUSTAL OMEGA

# Open Matrix Distance
data_with_zero = []
data = []
i = 3 # First column: index ; Second column: name TF
matrix_distances = []
matrix_tf = []
with open("pim.txt","r") as f1:
    for line in f1:
        line = line.rstrip().split()
        if "#" not in line and len(line)>0:
            matrix_distances.append(line[2:])
            matrix_tf.append(line[1])
            j = i
            for x in line[j:]:
                if x == '-nan':
                    continue
                elif x == "0.00":
                    data_with_zero.append(float(x))
                else:
                    data.append(float(x))
                    data_with_zero.append(float(x))
            i+=1
percentiles = [0.90, 0.95, 0.975, 0.99, 0.995, 0.999]

import numpy as np

for i in percentiles:
    print(i, np.quantile(data_with_zero,i))

percentiles = [0.90]


0.9 26.92
0.95 38.51
0.975 44.03
0.99 48.79
0.995 51.33
0.999 68.83


In [107]:
# list of TF in LitBM
TF_LitBM = list(set(LitBM['GENE_A'].to_list() + LitBM['GENE_B'].to_list()))

In [112]:

# Get index of TF LitBM in Matrix_TF:
index_tf_LitBM = []
for i in TF_LitBM:
    j = matrix_tf.index(i)
    index_tf_LitBM.append(j)

# 99% = 48.79x
# 99.5% = 51.33
# 99.9% = 68.83

identity_tf = {}

## Calculate the most similar TFs for each TF in LitBM
for i in index_tf_LitBM:
    for j in range(len(matrix_distances[i])):
        if matrix_distances[i][j] == 'nan':
            continue
        elif float(matrix_distances[i][j]) >= 68.83 and matrix_tf[j] in TF_LitBM and matrix_tf[j] != matrix_tf[i]:
            if matrix_tf[i] not in identity_tf:
                identity_tf[matrix_tf[i]] = [matrix_tf[j]]
            else:
                identity_tf[matrix_tf[i]] += [matrix_tf[j]]


total_interactions= []
for TF1 in LitBM_dict.keys(): # For each TF in LitBM
    for TF2 in LitBM_dict[TF1]: # For each list of reported interactors of such TF in LitBM
        if TF1 in identity_tf and TF2 in identity_tf:
            first_tf = identity_tf[TF1][:]
            first_tf.append(TF1)
            second_tf = identity_tf[TF2][:]
            second_tf.append(TF2)
            new_interactions = [i for i in itertools.product(first_tf,second_tf)]
        elif TF1 in identity_tf:
            first_tf = identity_tf[TF1][:]
            first_tf.append(TF1)
            second_tf = [TF2]
            new_interactions = [i for i in itertools.product(first_tf, second_tf)]
        elif TF2 in identity_tf:
            first_tf = [TF1]
            second_tf = identity_tf[TF2][:]
            second_tf.append(TF2)
            new_interactions = [i for i in itertools.product(first_tf, second_tf)]
        try:
            total_interactions += new_interactions
        except:
            continue


print("Second Approach")

print("Number of total interactions", len(total_interactions))

f_total_interactions = []
for i in total_interactions:
    if f_total_interactions:
        if (i[0],i[1]) in f_total_interactions or (i[1],i[0]) in f_total_interactions:
            continue
        else:
            f_total_interactions.append(i)
    else:
        f_total_interactions.append(i)

print("Number of unique interactions", len(f_total_interactions))

new_intx = []
for pair in f_total_interactions:
    if pair[0] in LitBM_dict and pair[1] in LitBM_dict[pair[0]]:
        continue
        #print("%s and %s are ready in LitBM" % (pair[0],pair[1]))
    else:
        #print("%s and %s are new pairs" % (pair[0], pair[1]))
        new_intx.append((pair[0], pair[1]))
        
print("Number of unique interactions which are not in LitBM database", len(new_intx)) # 2173

with open("Second_approach.txt", "w") as f1:
    for pair in new_intx:
        tf_1, tf_2 = pair[0], pair[1]
        f1.write(tf_1 + "\t" + tf_2 + "\n")


Second Approach
Number of total interactions 1997
Number of unique interactions 338
Number of unique interactions which are not in LitBM database 192
