# Jaspar Analysis

In [1]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
sys.setrecursionlimit(3000)
import pickle
import time
import scipy
import Bio
sns.set_context('notebook')



In [2]:
workingDirectory = '/home/jtao/analysis/jaspar_analysis/'
os.chdir(workingDirectory)

## Separate JASPAR motifs into individual files

In [3]:
with open('./pfm_vertebrates_jaspar_2016.txt') as f:
    data = f.readlines()

In [4]:
nonempty_lines = []
for line in data:
    if len(line) > 1:
        nonempty_lines.append(line.strip())

In [5]:
output_path = './individual_motifs/'
if not os.path.isdir(output_path):
    os.mkdir(output_path)
all_motif_names = []
for i in range(0, len(nonempty_lines), 5):
    name_line = nonempty_lines[i + 0]
    motif_name = name_line.split()[1]
    all_motif_names.append(motif_name)
    A_line = nonempty_lines[i + 1]
    C_line = nonempty_lines[i + 2]
    G_line = nonempty_lines[i + 3]
    T_line = nonempty_lines[i + 4]
  
    out_file = open(output_path + '/' + motif_name + '.jaspar', 'w')
    out_file.write(name_line + '\n')
    out_file.write(A_line + '\n')
    out_file.write(C_line + '\n')
    out_file.write(G_line + '\n')
    out_file.write(T_line + '\n')

    out_file.close()

## Read Jaspar Metadata

### Read in Motif ID and Index mapping

In [6]:
with open('./MATRIX_2016.txt') as f:
    data = f.readlines()

motifID_index_dict = {}
motifName_index_dict = {}
for line in data:
    tokens = line.strip().split()
    index = tokens[0]
    motif_id = tokens[2]
    name = tokens[4]
    
    motifID_index_dict[motif_id] = index
    motifName_index_dict[name] = index

### Read in Protein ID

In [7]:
with open('./MATRIX_PROTEIN_2016.txt') as f:
    data = f.readlines()

index_uniprot_dict = {}
for line in data:
    tokens = line.strip().split()
    
    index = tokens[0]
    uniprot = tokens[1]
    
    index_uniprot_dict[index] = uniprot
#     print(uniprot)

### Read in Annotations (Family)

In [8]:
with open('./MATRIX_ANNOTATION_2016.txt') as f:
    data = f.readlines()

index_family_dict = {}
index_class_dict = {}
for line in data:
    tokens = line.strip().split()
    index = tokens[0]
    data_type = tokens[1]
    data = '_'.join(tokens[2:])
    if data_type == 'family':
        family = data
        index_family_dict[index] = family
    if data_type == 'class':
        motif_class = data
        index_class_dict[index] = motif_class

### Read Uniprot Gene Mapping

In [9]:
uniprot_geneName_dict = {}
with open('./uniprot_gene_mapping.txt') as f:
    data = f.readlines()
for line in data:
    tokens = line.strip().split()
    uniprot = tokens[0]
    geneName = tokens[1]
    uniprot_geneName_dict[uniprot] = geneName

### Create DataFrame

In [10]:
name_list = []
gene_list = []
class_list = []
family_list = []

for motifName in all_motif_names:
    gene = 'unknown'
    family = 'unknown'
    motif_class = 'unknown'
    
    index = motifName_index_dict[motifName]
    if index in index_uniprot_dict:
        uniprot = index_uniprot_dict[index]
        if uniprot in uniprot_geneName_dict:
            gene = uniprot_geneName_dict[uniprot]
    if index in index_family_dict:
        family = index_family_dict[index]
    if index in index_class_dict:
        motif_class = index_class_dict[index]
    
    name_list.append(motifName)
    gene_list.append(gene)
    class_list.append(motif_class)
    family_list.append(family)
    
jaspar_metadata_frame = pd.DataFrame({'Name':name_list,
              'Family':family_list,
              'Class':class_list,
              'Gene':gene_list})
jaspar_metadata_frame = jaspar_metadata_frame[['Name', 'Gene', 'Family', 'Class', ]]
# jaspar_metadata_frame.to_csv('./metadata.tsv', sep='\t', index=False)

## Convert CISBP Motifs

In [11]:
cisbp_metadata_frame = pd.read_csv('./cisbp/TF_Information.txt', sep='\t')

# cisbp_metadata_frame= cisbp_metadata_frame[cisbp_metadata_frame['TF_Status'].astype(str)=='D']

cisbp_metadata_frame= cisbp_metadata_frame[cisbp_metadata_frame['TF_Status'].astype(str)!='N']

cisbp_motif_genes = [x.lower() for x in cisbp_metadata_frame['TF_Name'].unique()]

## Check Human TFome

In [12]:
with open ('./Human-TFome.txt') as f:
    data = f.readlines()
    
human_tfs = [x.strip().lower() for x in data]

In [13]:
jaspar_motif_genes = [x.lower() for x in  jaspar_metadata_frame['Gene'].values]

### Identify Motifs to Grab from CISBP

In [14]:
count = 0
count2=0
count3=0
cisbp_rescued_genes = []
for x in human_tfs:
    if not x in jaspar_motif_genes:
        count+=1
        if x in cisbp_motif_genes:
            cisbp_rescued_genes.append(x)
            count2+=1
        if 'znf' in x:
            count3+=1
print('num missing', count)
print('rescued by cisbp', count2)
print('zinc fingers', count3)

num missing 1105
rescued by cisbp 348
zinc fingers 503


In [15]:
rescued_indices = cisbp_metadata_frame[cisbp_metadata_frame['TF_Name'].str.lower().isin(cisbp_rescued_genes)].index.values

In [16]:
cisbp_metadata_frame.head()

Unnamed: 0,TF_ID,Family_ID,TSource_ID,Motif_ID,MSource_ID,DBID,TF_Name,TF_Species,TF_Status,Family_Name,...,MSource_Type,MSource_Author,MSource_Year,PMID,MSource_Version,TfSource_Name,TfSource_URL,TfSource_Year,TfSource_Month,TfSource_Day
0,T004843_1.02,F035_1.02,TS19_1.02,M2938_1.02,MS26_1.02,ENSG00000008196,TFAP2B,Homo_sapiens,D,AP-2,...,Transfac,Matys,2006,16381825,2014.2,Ensembl,http://www.ensembl.org/,2011,Oct,26
1,T004843_1.02,F035_1.02,TS19_1.02,M5917_1.02,MS20_1.02,ENSG00000008196,TFAP2B,Homo_sapiens,D,AP-2,...,SELEX,Jolma,2013,23332764,January 2013,Ensembl,http://www.ensembl.org/,2011,Oct,26
2,T004843_1.02,F035_1.02,TS19_1.02,M5918_1.02,MS20_1.02,ENSG00000008196,TFAP2B,Homo_sapiens,D,AP-2,...,SELEX,Jolma,2013,23332764,January 2013,Ensembl,http://www.ensembl.org/,2011,Oct,26
3,T004843_1.02,F035_1.02,TS19_1.02,M5919_1.02,MS20_1.02,ENSG00000008196,TFAP2B,Homo_sapiens,D,AP-2,...,SELEX,Jolma,2013,23332764,January 2013,Ensembl,http://www.ensembl.org/,2011,Oct,26
4,T004843_1.02,F035_1.02,TS19_1.02,M6144_1.02,MS18_1.02,ENSG00000008196,TFAP2B,Homo_sapiens,D,AP-2,...,HocoMoco,Kulakovskiy,2013,23175603,July 2014,Ensembl,http://www.ensembl.org/,2011,Oct,26


In [38]:
output_path = './individual_cisbp_motifs/'
if not os.path.isdir(output_path):
    os.mkdir(output_path)
else:
    for f in os.listdir(output_path):
        os.remove(output_path + '/'+f)
names = cisbp_metadata_frame.loc[rescued_indices, 'TF_Name'].values
motif_ids = cisbp_metadata_frame.loc[rescued_indices, 'Motif_ID'].values
name_count_dict = {}
for motif_id, motif_name in zip(motif_ids, names):
    with open('./cisbp/pwms_all_motifs/'+motif_id+'.txt') as f:
        data = f.readlines()
    if len(data) > 1:
        if motif_name in name_count_dict:
            name_count_dict[motif_name] += 1
            new_name = motif_name +'_ver'+str(name_count_dict[motif_name])
            cisbp_id_name_dict[motif_id] = new_name
        else:
            name_count_dict[motif_name] = 1
            new_name=motif_name
            
        A_freqs = []
        C_freqs = []
        G_freqs = []
        T_freqs = []    
        new_name = cisbp_id_name_dict[motif_id]
        converted_file = open(output_path + '/' + new_name + '.jaspar', 'w')
        converted_file.write('>' + motif_id + '\t' + new_name + '\n')
        for line in data[1:]:
            freqs = line.strip().split()[1:]
            A_freqs.append(freqs[0])
            C_freqs.append(freqs[1])
            G_freqs.append(freqs[2])
            T_freqs.append(freqs[3])
        converted_file.write('A [' + '\t'.join(A_freqs) + ' ]\n')
        converted_file.write('C [' + '\t'.join(C_freqs) + ' ]\n')
        converted_file.write('G [' + '\t'.join(G_freqs) + ' ]\n')
        converted_file.write('T [' + '\t'.join(T_freqs) + ' ]\n')
        converted_file.close()


## Merge Metadata

### Score Motifs

In [66]:
%%bash
rm ./motif_similarity_scores/ -r
/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/score_motifs.py ./motif_similarity_scores -num_procs 48 ./individual_motifs/*jaspar ./individual_cisbp_motifs/*jaspar

Process is terminated.


### Cluster

In [67]:
%%bash
rm ./clustered_motifs/ -r

/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/threshold_cluster_motifs.py ./motif_similarity_scores/correlation.tsv ./clustered_motifs 0.9 ./individual_motifs/*jaspar ./individual_cisbp_motifs/*jaspar

Traceback (most recent call last):
  File "/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/threshold_cluster_motifs.py", line 523, in <module>
    score_frame = pd.read_csv(scorePath, sep='\t',index_col=0)
  File "/gpfs/data01/glasslab/home/jtao/software/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py", line 655, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/gpfs/data01/glasslab/home/jtao/software/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py", line 405, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/gpfs/data01/glasslab/home/jtao/software/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py", line 764, in __init__
    self._make_engine(self.engine)
  File "/gpfs/data01/glasslab/home/jtao/software/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py", line 985, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/gpfs/data01/glasslab/home/jtao/software/anaconda3/lib/pytho