# Jaspar Analysis

In [2]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
sys.setrecursionlimit(3000)
import pickle
import time
import scipy
import Bio
sns.set_context('notebook')

In [3]:
workingDirectory = '/home/jtao/analysis/jaspar_analysis/'
os.chdir(workingDirectory)

## Separate JASPAR motifs into individual files

In [4]:
with open('./pfm_vertebrates_jaspar_2016.txt') as f:
    data = f.readlines()

In [5]:
nonempty_lines = []
for line in data:
    if len(line) > 1:
        nonempty_lines.append(line.strip())

In [6]:
output_path = './individual_motifs/'
if not os.path.isdir(output_path):
    os.mkdir(output_path)
else:
    for f in os.listdir(output_path):
        os.remove(output_path + '/' + f)
all_motif_names = []
for i in range(0, len(nonempty_lines), 5):
    name_line = nonempty_lines[i + 0]
    motif_name = name_line.split()[1]
    all_motif_names.append(motif_name)
    A_line = nonempty_lines[i + 1]
    C_line = nonempty_lines[i + 2]
    G_line = nonempty_lines[i + 3]
    T_line = nonempty_lines[i + 4]
  
    out_file = open(output_path + '/' + motif_name + '.jaspar', 'w')
    out_file.write(name_line + '\n')
    out_file.write(A_line + '\n')
    out_file.write(C_line + '\n')
    out_file.write(G_line + '\n')
    out_file.write(T_line + '\n')

    out_file.close()

## Read Jaspar Metadata

### Read in Motif ID and Index mapping

In [7]:
with open('./MATRIX_2016.txt') as f:
    data = f.readlines()

motifID_index_dict = {}
motifName_index_dict = {}
for line in data:
    tokens = line.strip().split()
    index = tokens[0]
    motif_id = tokens[2]
    name = tokens[4]
    
    motifID_index_dict[motif_id] = index
    motifName_index_dict[name] = index

### Read in Protein ID

In [8]:
with open('./MATRIX_PROTEIN_2016.txt') as f:
    data = f.readlines()

index_uniprot_dict = {}
for line in data:
    tokens = line.strip().split()
    
    index = tokens[0]
    uniprot = tokens[1]
    
    index_uniprot_dict[index] = uniprot
#     print(uniprot)

### Read in Annotations (Family)

In [9]:
with open('./MATRIX_ANNOTATION_2016.txt') as f:
    data = f.readlines()

index_family_dict = {}
index_class_dict = {}
for line in data:
    tokens = line.strip().split()
    index = tokens[0]
    data_type = tokens[1]
    data = '_'.join(tokens[2:])
    if data_type == 'family':
        family = data
        index_family_dict[index] = family
    if data_type == 'class':
        motif_class = data
        index_class_dict[index] = motif_class

### Read Uniprot Gene Mapping

In [10]:
uniprot_geneName_dict = {}
with open('./uniprot_gene_mapping.txt') as f:
    data = f.readlines()
for line in data:
    tokens = line.strip().split()
    uniprot = tokens[0]
    geneName = tokens[1]
    uniprot_geneName_dict[uniprot] = geneName

### Create DataFrame

In [11]:
name_list = []
gene_list = []
class_list = []
family_list = []

for motifName in all_motif_names:
    gene = 'unknown'
    family = 'unknown'
    motif_class = 'unknown'
    
    index = motifName_index_dict[motifName]
    if index in index_uniprot_dict:
        uniprot = index_uniprot_dict[index]
        if uniprot in uniprot_geneName_dict:
            gene = uniprot_geneName_dict[uniprot]
    if index in index_family_dict:
        family = index_family_dict[index]
    if index in index_class_dict:
        motif_class = index_class_dict[index]
    
    name_list.append(motifName)
    gene_list.append(gene)
    class_list.append(motif_class)
    family_list.append(family)
    
jaspar_metadata_frame = pd.DataFrame({'Name':name_list,
              'Family':family_list,
              'Class':class_list,
              'Gene':gene_list})
jaspar_metadata_frame = jaspar_metadata_frame[['Name', 'Gene', 'Family', 'Class', ]]

# edit rel data
jaspar_metadata_frame.loc[jaspar_metadata_frame['Name'] == 'REL','Gene'] = 'REL'

jaspar_metadata_frame.loc[jaspar_metadata_frame['Name'] == 'REL','Family'] = 'Rel'


## Convert CISBP Motifs

In [12]:
cisbp_metadata_frame = pd.read_csv('./cisbp/TF_Information.txt', sep='\t')

# cisbp_metadata_frame= cisbp_metadata_frame[cisbp_metadata_frame['TF_Status'].astype(str)=='D']

cisbp_metadata_frame= cisbp_metadata_frame[cisbp_metadata_frame['TF_Status'].astype(str)!='N']

cisbp_motif_genes = [x.lower() for x in cisbp_metadata_frame['TF_Name'].unique()]

## Check Human TFome

In [13]:
with open ('./Human-TFome.txt') as f:
    data = f.readlines()
    
human_tfs = [x.strip().lower() for x in data]

In [14]:
jaspar_motif_genes = [x.lower() for x in  jaspar_metadata_frame['Gene'].values]

### Identify Motifs to Grab from CISBP

In [15]:
count = 0
count2=0
count3=0
cisbp_rescued_genes = []
for x in human_tfs:
    if not x in jaspar_motif_genes:
        count+=1
        if x in cisbp_motif_genes:
            cisbp_rescued_genes.append(x)
            count2+=1
        if 'znf' in x:
            count3+=1
print('num missing', count)
print('rescued by cisbp', count2)
print('zinc fingers', count3)

num missing 1104
rescued by cisbp 347
zinc fingers 503


In [16]:
rescued_indices = cisbp_metadata_frame[cisbp_metadata_frame['TF_Name'].str.lower().isin(cisbp_rescued_genes)].index.values

In [17]:
output_path = './individual_cisbp_motifs/'
if not os.path.isdir(output_path):
    os.mkdir(output_path)
else:
    for f in os.listdir(output_path):
        os.remove(output_path + '/'+f)
names = cisbp_metadata_frame.loc[rescued_indices, 'TF_Name'].values
motif_ids = cisbp_metadata_frame.loc[rescued_indices, 'Motif_ID'].values
name_count_dict = {}
cisbp_id_newName_dict = {}
cisbp_id_seenName_dict = {}
for motif_id, motif_name in zip(motif_ids, names):
    with open('./cisbp/pwms_all_motifs/'+motif_id+'.txt') as f:
        data = f.readlines()
    if len(data) > 1:
        if motif_id in cisbp_id_newName_dict:
            motif_name = cisbp_id_seenName_dict[motif_id]
            name_count_dict[seen_name] += 1
            new_name = seen_name +'_var'+str(name_count_dict[seen_name])
            print(motif_id, new_name)
        else:
            if motif_name in name_count_dict:
                name_count_dict[motif_name] += 1
                new_name = motif_name +'_var'+str(name_count_dict[motif_name])
            else:
                cisbp_id_seenName_dict[motif_id] = motif_name
                name_count_dict[motif_name] = 1
                new_name=motif_name
        
            
        cisbp_id_newName_dict[motif_id + '|' + motif_name] = new_name    
        
        A_freqs = []
        C_freqs = []
        G_freqs = []
        T_freqs = []    

        converted_file = open(output_path + '/' + new_name + '.jaspar', 'w')
        converted_file.write('>' + motif_id + '\t' + new_name + '\n')
        for line in data[1:]:
            freqs = line.strip().split()[1:]
            A_freqs.append(freqs[0])
            C_freqs.append(freqs[1])
            G_freqs.append(freqs[2])
            T_freqs.append(freqs[3])
        converted_file.write('A [' + '\t'.join(A_freqs) + ' ]\n')
        converted_file.write('C [' + '\t'.join(C_freqs) + ' ]\n')
        converted_file.write('G [' + '\t'.join(G_freqs) + ' ]\n')
        converted_file.write('T [' + '\t'.join(T_freqs) + ' ]\n')
        converted_file.close()


## Merge Metadata

In [18]:
jaspar_merge_data = jaspar_metadata_frame[['Name', 'Gene', 'Family']]
jaspar_merge_data['Source'] = 'JASPAR'
rescued_ids = [x.split('|')[0] for x in cisbp_id_newName_dict.keys()]
rescued_names = [x.split('|')[1] for x in cisbp_id_newName_dict.keys()]
cisbp_merge_data = cisbp_metadata_frame[(cisbp_metadata_frame['Motif_ID'].isin(rescued_ids)) &
                                        (cisbp_metadata_frame['TF_Name'].isin(rescued_names))][['Motif_ID', 'TF_Name', 'Family_Name']]


   
cisbp_merge_data['Motif_ID'] = [cisbp_id_newName_dict[x] for x in cisbp_merge_data['Motif_ID'] + '|' + cisbp_merge_data['TF_Name']]
cisbp_merge_data.columns = ['Name', 'Gene', 'Family']
cisbp_merge_data['Source'] = 'CISBP'
merged_metadata_frame = pd.concat([jaspar_merge_data, cisbp_merge_data])

merged_metadata_frame.to_csv('./metadata.tsv', sep='\t', index=False)

## Clean Up

In [19]:
os.remove('./individual_motifs/Nobox.jaspar')

In [20]:
len(merged_metadata_frame['Name'])

848

In [21]:
len(merged_metadata_frame['Name'].unique())

848

## Round 1

### Score Motifs

In [33]:
%%bash
rm ./motif_similarity_scores/ -r
/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/score_motifs.py ./motif_similarity_scores -num_procs 48 ./individual_motifs/*jaspar ./individual_cisbp_motifs/*jaspar

Reading motif files...
Calculating alignments between motifs and scoring motifs
Creating visualizations...


rm: cannot remove ‘./motif_similarity_scores/’: No such file or directory


### Cluster

In [34]:
%%bash
rm ./clustered_motifs/ -r

/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/threshold_cluster_motifs.py ./motif_similarity_scores/correlation.tsv ./clustered_motifs 0.8 ./individual_motifs/*jaspar ./individual_cisbp_motifs/*jaspar 

### Merge metadata

In [35]:
%%bash
cat metadata.tsv > metadata_round1.tsv
tail -n +2 clustered_motifs/motif_metadata.tsv >> metadata_round1.tsv 

## Round 2

### Score Motifs

In [None]:
%%bash
rm ./motif_similarity_scores_round/ -r
/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/score_motifs.py ./motif_similarity_scores_round2/ -num_procs 48 ./clustered_motifs/clustered_motifs/*motif

### Cluster

In [37]:
%%bash
rm ./clustered_motifs_round2/ -r

/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/threshold_cluster_motifs.py ./motif_similarity_scores_round2/correlation.tsv ./clustered_motifs_round2 0.8 ./clustered_motifs/clustered_motifs/*motif -metadata_path /home/jtao/analysis/jaspar_analysis/metadata_round1.tsv

  r = r_num / r_den


### Merge metadata

In [38]:
%%bash
cat metadata.tsv > final_metadata.tsv
tail -n +2 clustered_motifs_round2/motif_metadata.tsv >> final_metadata.tsv 

## Create HTML

In [None]:
%%bash
rm ./html_clustering/ -r

/gpfs/data01/glasslab/home/jtao/code/tba/motif_tools/threshold_cluster_motifs.py ./motif_similarity_scores_round2/correlation.tsv ./html_clustering 0.8 ./clustered_motifs/clustered_motifs/*motif -metadata_path /home/jtao/analysis/jaspar_analysis/metadata_round1.tsv -createHTML