# JASPAR Analysis

In [2]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
matplotlib.pylab.rcParams['savefig.dpi'] = 200
sys.setrecursionlimit(3000)
import pickle
import time
import scipy
import Bio
sns.set_context('notebook')

In [3]:
# Run every later cell from the analysis directory so that the relative
# './...' input and output paths used below resolve against it.
# NOTE(review): absolute, user-specific path — must be edited to run elsewhere.
workingDirectory = '/home/jtao/analysis/jaspar_analysis/'
os.chdir(workingDirectory)

### Read in Motif ID and Index mapping

In [4]:
# Load the matrix table and build two lookups onto its first column (the
# matrix index, kept as a string): one keyed by the value in the third
# whitespace-separated column (motif ID) and one keyed by the fifth (name).
# NOTE(review): if several rows share a motif ID or name, the last row wins.
with open('./MATRIX_2016.txt') as f:
    data = list(f)

motifID_index_dict, motifName_index_dict = {}, {}
for line in data:
    tokens = line.strip().split()
    index, motif_id, name = tokens[0], tokens[2], tokens[4]
    motifID_index_dict[motif_id] = index
    motifName_index_dict[name] = index

### Read in Protein ID

In [5]:
# Map each matrix index (first column) to the protein accession found in the
# second whitespace-separated column of the matrix/protein table.
# NOTE(review): an index listed on several rows keeps only its last accession.
with open('./MATRIX_PROTEIN_2016.txt') as f:
    data = list(f)

index_uniprot_dict = {}
for line in data:
    tokens = line.strip().split()
    index, uniprot = tokens[0], tokens[1]
    index_uniprot_dict[index] = uniprot

### Read in Annotations (Family)

In [9]:
# Ad-hoc inspection of `tokens`, i.e. whatever the most recently executed
# parsing loop left behind in the kernel.
# NOTE(review): depends on execution order; the saved output looks like it came
# from a different split than the loops in this notebook — consider deleting.
tokens

['10763', 'tfe_id  325']

In [15]:
# Build two lookups from the annotation table: matrix index -> family and
# matrix index -> class. Each row is read as "<index> <tag> <value words...>";
# only rows tagged 'family' or 'class' are kept.
with open('./MATRIX_ANNOTATION_2016.txt') as f:
    data = f.readlines()

index_family_dict = {}
index_class_dict = {}
for line in data:
    tokens = line.strip().split()
    if len(tokens) < 2:
        # Blank or truncated row: there is no tag to dispatch on.
        continue
    index = tokens[0]
    data_type = tokens[1]
    # Multi-word values are re-joined with underscores. A separate name is used
    # so the list being iterated (`data`) is not rebound inside its own loop.
    annotation = '_'.join(tokens[2:])
    if data_type == 'family':
        index_family_dict[index] = annotation
    elif data_type == 'class':
        index_class_dict[index] = annotation

### Read JASPAR PFMs and UniProt Gene Mapping

In [18]:
# Read the raw position-frequency-matrix file, one list entry per line
# (newlines retained); the next cell filters out the empty lines.
with open('./pfm_vertebrates_jaspar_2016.txt') as f:
    data = list(f)

In [19]:
# Keep only lines that still contain text after stripping whitespace.
# Testing the stripped text (rather than the raw length) also drops
# whitespace-only and '\r\n' lines, which would otherwise be stored as ''
# and shift the fixed 5-line record grouping used by the cells below.
nonempty_lines = []
for line in data:
    stripped = line.strip()
    if stripped:
        nonempty_lines.append(stripped)

In [20]:
# Map protein accession (first whitespace-separated column) to gene name
# (second column); any further columns on a row are ignored.
with open('./uniprot_gene_mapping.txt') as f:
    data = list(f)

uniprot_geneName_dict = {}
for line in data:
    tokens = line.strip().split()
    uniprot, geneName = tokens[0], tokens[1]
    uniprot_geneName_dict[uniprot] = geneName

### Homer Format 

In [21]:
# Convert every PFM record in `nonempty_lines` (a header line followed by the
# A, C, G and T count rows) into a tab-separated '<motif_name>.motif' file:
# one header line, then one row of A/C/G/T frequencies per motif position.
output_path = './converted_motifs/'
os.makedirs(output_path, exist_ok=True)

for i in range(0, len(nonempty_lines), 5):
    name_line, A_line, C_line, G_line, T_line = nonempty_lines[i:i + 5]

    # [4:-2] trims a fixed-width prefix and suffix from each count row before
    # the numbers are parsed — presumably the 'A  [' label and the closing
    # ' ]'; confirm against the PFM file layout.
    freqs = np.array([[float(x) for x in row_line[4:-2].split()]
                      for row_line in (A_line, C_line, G_line, T_line)])
    # Normalise each position (column) to sum to 1, then transpose so that
    # rows are positions and columns are A, C, G, T.
    normed_freqs = (freqs / freqs.sum(axis=0)).T

    name_tokens = name_line.strip().split()
    motif_name = name_tokens[-1]
    # Drop the first character of the ID token and anything after the first '.'.
    motif_id = name_tokens[-2][1:].split('.')[0]

    # Resolve motif -> matrix index -> protein accession -> gene name.
    index = motifID_index_dict[motif_id]
    uniprot = index_uniprot_dict.get(index)
    geneName = uniprot_geneName_dict.get(uniprot)
    if geneName is None:
        # Report the unmapped motif and fall back to an explicit placeholder,
        # so the previous motif's gene is never reused (and the first record
        # cannot hit an undefined name).
        print(motif_name)
        geneName = 'unknown'

    # Motifs without a family annotation use their own name as the family.
    family = index_family_dict.get(index, motif_name)

    with open(os.path.join(output_path, motif_name + '.motif'), 'w') as out_file:
        # The '\n' is joined as a final field, so the header ends with a tab
        # before the newline; kept as-is to leave the file format unchanged.
        out_file.write('\t'.join(['>'+motif_id, motif_name, family, geneName, '\n']))
        for position_freqs in normed_freqs:
            out_file.write('\t'.join([str(x) for x in position_freqs]) + '\n')

### JASPAR Format

In [22]:
# Split the combined PFM listing into one '<motif_name>.jaspar' file per
# 5-line record (header + A/C/G/T rows) and collect the motif names, in file
# order, in `all_motif_names` for the metadata cell below.
output_path = './individual_motifs/'
os.makedirs(output_path, exist_ok=True)

all_motif_names = []
for i in range(0, len(nonempty_lines), 5):
    name_line, A_line, C_line, G_line, T_line = nonempty_lines[i:i + 5]
    # The motif name is the second whitespace-separated token of the header.
    motif_name = name_line.split()[1]
    all_motif_names.append(motif_name)

    # `with` guarantees the handle is closed even if a write fails.
    with open(os.path.join(output_path, motif_name + '.jaspar'), 'w') as out_file:
        for record_line in (name_line, A_line, C_line, G_line, T_line):
            out_file.write(record_line + '\n')

## Create MetaData Mapping

In [29]:
# Assemble one metadata row per motif (name, gene, family, class) and write
# the table to './metadata.tsv'. Any field that cannot be resolved through
# the lookup dictionaries is recorded as 'unknown'.
name_list, gene_list, class_list, family_list = [], [], [], []

for motifName in all_motif_names:
    index = motifName_index_dict[motifName]
    # A missing accession yields None, which is never a key of the gene map,
    # so the lookup falls through to 'unknown'.
    uniprot = index_uniprot_dict.get(index)
    gene = uniprot_geneName_dict.get(uniprot, 'unknown')
    family = index_family_dict.get(index, 'unknown')
    motif_class = index_class_dict.get(index, 'unknown')

    name_list.append(motifName)
    gene_list.append(gene)
    class_list.append(motif_class)
    family_list.append(family)

frame = pd.DataFrame({
    'Name': name_list,
    'Family': family_list,
    'Class': class_list,
    'Gene': gene_list,
})
# Fix the column order used in the exported table.
metadata_frame = frame[['Name', 'Gene', 'Family', 'Class']]
metadata_frame.to_csv('./metadata.tsv', sep='\t', index=False)

### Score Motifs

In [24]:
%%bash
# Clear any previous results, then score every individual motif file.
# Options are placed before the operand (portable across rm implementations)
# and -f keeps a first run, where the directory does not exist yet, quiet.
# NOTE(review): the script path is absolute and user-specific.
rm -rf ./motif_similarity_scores/
/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/score_motifs.py ./motif_similarity_scores -num_procs 48 ./individual_motifs/*jaspar

Reading motif files...
Calculating alignments between motifs and scoring motifs
Creating visualizations...


### Cluster

In [26]:
%%bash
# Clear any previous clustering output, then run the clustering script on the
# correlation table with the value 0.9 and every individual motif file.
# Options are placed before the operand (portable across rm implementations)
# and -f keeps a first run, where the directory does not exist yet, quiet.
# NOTE(review): the script path is absolute and user-specific.
rm -rf ./clustered_motifs/

/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/threshold_cluster_motifs.py ./motif_similarity_scores/correlation.tsv ./clustered_motifs 0.9 ./individual_motifs/*jaspar

## Check Human TFome

In [27]:
# Load the Human TFome list (one entry per line) and normalise each entry to
# stripped lower case for the case-insensitive comparison further down.
with open('./Human-TFome.txt') as f:
    data = list(f)

human_tfs = [tf_line.strip().lower() for tf_line in data]

In [31]:
# Lower-cased gene column of the metadata table, in row order.
motif_genes = [gene.lower() for gene in metadata_frame['Gene'].tolist()]

In [38]:
# Print every Human TFome entry that has no matching motif gene, followed by
# the number of such entries. Membership is tested against a set so each
# lookup is constant-time instead of a scan over the whole gene list.
# NOTE(review): this is an exact lower-case string match; the placeholder
# 'unknown' genes in the metadata table can never match a TF.
motif_gene_set = set(motif_genes)
missing_tfs = [tf for tf in human_tfs if tf not in motif_gene_set]
for tf in missing_tfs:
    print(tf)
count = len(missing_tfs)
print(count)

adnp
adnp2
aebp2
aff1
aff3
aff4
ahctf1
ahr
ahrr
aire
akna
ankzf1
ap5z1
argfx
arhgap35
arid5b
arnt2
arntl2
ascl1
ascl3
ascl4
ascl5
atf2
atf5
atf6
atf6b
atmin
atoh7
atoh8
atxn7
bach1
bach2
barx2
batf
batf2
bbx
bcl11a
bcl11b
bclaf1
bhlha9
bhlhb9
bmp2
bnc1
bnc2
bola1
bola2
bola2b
bola3
c20orf194
camta1
camta2
carhsp1
casz1
cbfb
cdc5l
cdx4
cebpz
cenpt
cers3
cers6
champ1
cic
ciz1
cpxcr1
cramp1
creb3l3
creb3l4
crebl2
crebrf
crebzf
ctcfl
cxxc1
dach1
dach2
dbx1
dbx2
ddit3
deaf1
dlx5
dmrt1
dmrt2
dmrta1
dmrta2
dmrtb1
dmrtc2
dmtf1
dnajc2
dnajc21
dpf1
dpf2
dprx
dr1
drap1
drgx
e2f5
e4f1
ebf3
ebf4
elf2
elmsan1
epas1
ervk3-1
etf1
ets2
etv3l
etv7
ezh1
ezh2
fam170a
ferd3l
fezf1
fezf2
fiz1
fosb
foxa3
foxb2
foxd4
foxd4l1
foxd4l3
foxd4l4
foxd4l6
foxe1
foxe3
foxf1
foxi2
foxi3
foxj1
foxk2
foxl2
foxm1
foxn1
foxn2
foxn3
foxn4
foxp4
foxr1
foxr2
foxs1
gabpb1
gata6
gatad1
gli1
gli3
gli4
gon4l
grhl2
grhl3
gtf3a
gzf1
hand1
hand2
hbp1
hdx
helt
hes3
hes4
hes6
heyl
hhex
hif3a
hivep1
hivep2
hivep3
hkr1
hlx
hmga1
homez


## Convert CISBP Motifs

In [None]:
for 