# Jaspar Analysis

In [1]:
### header ###
__author__ = "Jenhan Tao"
__license__ = "BSD"
__email__ = "jenhantao@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
# Use matplotlib's canonical rcParams object. `matplotlib.pylab` is only
# reachable as an attribute when that submodule has already been imported,
# so the old spelling could fail with AttributeError on a fresh kernel.
matplotlib.rcParams['savefig.dpi'] = 200
sys.setrecursionlimit(3000)
import pickle
import time
import scipy
import Bio
sns.set_context('notebook')

In [2]:
# Every relative path in this notebook resolves against this directory.
# Set the JASPAR_ANALYSIS_DIR environment variable to run the notebook from
# another location; the default is the original analysis directory.
workingDirectory = os.environ.get('JASPAR_ANALYSIS_DIR',
                                  '/home/jtao/analysis/jaspar_analysis/')
os.chdir(workingDirectory)

### Read in Motif ID and Index mapping

In [76]:
# Build {motif ID (third column) -> matrix index (first column)} from the
# whitespace-delimited matrix table.
with open('./MATRIX_2016.txt') as f:
    data = f.readlines()

motifID_index_dict = {}

for line in data:
    tokens = line.strip().split()
    # Only the first three columns are needed; skip blank or truncated rows
    # instead of failing with IndexError.
    if len(tokens) < 3:
        continue
    index = tokens[0]
    motif_id = tokens[2]

    # If a motif ID occurs on several rows, the last row wins.
    motifID_index_dict[motif_id] = index

### Read in Protein ID

In [74]:
# Map each matrix index (first column) to a UniProt accession (second column).
with open('./MATRIX_PROTEIN_2016.txt') as f:
    data = f.readlines()

# Rows are whitespace-delimited; when an index is repeated the last row wins.
protein_rows = (row.strip().split() for row in data)
index_uniprot_dict = {fields[0]: fields[1] for fields in protein_rows}

### Read in Annotations (Family)

In [110]:
# Build {matrix index -> family} from the annotation table, whose rows are
# read as (index, tag, value); only rows tagged 'family' are kept.
with open('./MATRIX_ANNOTATION_2016.txt') as f:
    data = f.readlines()

index_family_dict = {}
for line in data:
    # Split off only the first two fields so a family name that contains
    # spaces stays intact instead of being cut down to its first word.
    tokens = line.strip().split(None, 2)
    # Skip blank rows and rows without a value instead of raising IndexError.
    if len(tokens) < 3:
        continue
    if tokens[1] == 'family':
        index = tokens[0]
        family = tokens[2]

        # If an index has several 'family' rows, the last one wins.
        index_family_dict[index] = family

### Read in Uniprot ID Mappings

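Documentation (the numbered list below gives the column order of the `idmapping_selected.tab` file; the next cell reads columns 1 and 4):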

ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README

1. UniProtKB-AC
2. UniProtKB-ID
3. GeneID (EntrezGene)
4. RefSeq
5. GI
6. PDB
7. GO
8. UniRef100
9. UniRef90
10. UniRef50
11. UniParc
12. PIR
13. NCBI-taxon
14. MIM
15. UniGene
16. PubMed
17. EMBL
18. EMBL-CDS
19. Ensembl
20. Ensembl_TRS
21. Ensembl_PRO
22. Additional PubMed

In [None]:
# Map the first tab-separated field of every row (UniProtKB-AC in the column
# list above) to the fourth field (RefSeq).
with open('./MOUSE_10090_idmapping_selected.tab') as f:
    data = f.readlines()

uniprot_refseq_dict = {}
for record in data:
    fields = record.strip().split('\t')
    # The RefSeq field is stored verbatim, exactly as it appears in the file.
    uniprot_refseq_dict[fields[0]] = fields[3]

### Read in RPKM Data

In [20]:
# Load the RPKM table and give the first and eighth columns stable names.
rpkm_frame = pd.read_csv('/gpfs/data01/glasslab/home/jtao/analysis/signals_analysis/rpkm.tsv', sep='\t')
# Work on a list copy: assigning into `rpkm_frame.columns.values` would
# mutate the Index's underlying buffer in place.
columns = list(rpkm_frame.columns)
columns[0] = 'Refseq'
columns[7] = 'Gene'
rpkm_frame.columns = columns
# Index rows by the full annotation string, then keep only the text before
# the first '|' as the gene name.
rpkm_frame.index = rpkm_frame['Gene'].values
rpkm_frame['Gene'] = [x.split('|')[0] for x in rpkm_frame['Gene'].values]
# Remove the columns that are not needed downstream in a single call.
rpkm_frame = rpkm_frame.drop(['chr', 'start', 'end', 'strand', 'Length', 'Copies'], axis=1)

# .copy() makes the summary frame independent of rpkm_frame, so adding
# columns below no longer raises SettingWithCopyWarning.
rpkm_mean_frame = rpkm_frame[['Refseq', 'Gene']].copy()
for condition in ['Veh-1h', 'KLA-1h', 'IFNg-1h']:
    # Row-wise mean over every column whose name contains the condition label.
    current_cols = [x for x in rpkm_frame.columns if condition in x]
    mean_vals = rpkm_frame[current_cols].mean(axis=1)
    rpkm_mean_frame[condition] = mean_vals
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [51]:
# RefSeq ID -> gene name lookup; when a RefSeq ID repeats, the last row wins.
refseq_gene_dict = {
    refseq_id: gene_name
    for refseq_id, gene_name in zip(rpkm_mean_frame['Refseq'].values,
                                    rpkm_mean_frame['Gene'].values)
}

In [None]:
# 'Refseq' column of the mean-RPKM table, as a NumPy array.
homer_rpkms = rpkm_mean_frame.loc[:, 'Refseq'].values

### Read In Motifs to get Uniprot IDs

In [24]:
# Keep the raw lines of the PFM file (newlines included); the following
# cells filter and parse them.
with open('./pfm_vertebrates_jaspar_2016.txt') as pfm_file:
    data = list(pfm_file)

In [27]:
# Keep only lines with visible content, stripped of surrounding whitespace.
# Testing the stripped text (rather than len(line) > 1) also drops
# whitespace-only lines, which would otherwise be stored as '' and shift the
# fixed 5-line record grouping used by the parsing loops.
nonempty_lines = [stripped
                  for stripped in (line.strip() for line in data)
                  if stripped]

In [71]:
# Print the UniProt accession linked to every motif record.
# Records are read in groups of 5 lines (header, then the A, C, G and T rows).
# Only the header is needed for the lookup, so the count rows are not parsed
# here.
for i in range(0, len(nonempty_lines), 5):
    name_line = nonempty_lines[i]

    name_tokens = name_line.strip().split()
    motif_name = name_tokens[-1]
    # Second-to-last header token without its first character, cut at the
    # first '.'; this is the key used by motifID_index_dict.
    motif_id = name_tokens[-2][1:].split('.')[0]

    # get uniprot
    index = motifID_index_dict[motif_id]
    uniprot = index_uniprot_dict[index]

    print(uniprot)

Q01196
P05549
P53762
P53762
P19091
O15178
Q02548
P10589
P16220
P05554
Q01094
Q16649
Q05925
P19419
A4QPC8
Q12947
Q16676
Q12948
Q12952
P17679
P23769
P23771
Q07120
Q60793
Q63244
Q63245
Q12951
Q16534
P20823
P35583
Q02575
P10914
P14316
Q02078
P28698
P28698
P61244
P01106
P23511
Q91YY8
P42582
P28700
P37231
P32114
O43316
P26367
P40424
P35398
P35398
Q92766
P11473
Q06348
P28324
P48436
Q61473
P08047
P17947
Q01892
P11831
Q05066
P35710
P52747
Q5ZL67
P28347
P15923
P15806
P22415
P25490
P14921
P05412
P06876
Q04864
P49715
P37275
P03966
P19838
P04637
Q04206
P20226
Q95216
Q99MY0
P03372
P04150
P49698
P19793
O08961
P54841
P31314
P78367
P97436
Q8VIH1
Q86Y25
Q9BQA5
P52945
P50481
Q9UKW6
P42224
Q13127
P49711
P17542
O95718
P48432
P48432
P40763
Q12800
P17012
P01108
P55317
F1JVV8
Q60795
Q62431
Q13469
P35680
Q9UH73
Q01101
Q99581
O43524
P20719
P19793
P43354
P08651
P18146
Q6DJT9
Q9QXZ7
Q92731
Q16665
P56693
P48985
P05412
P41183
O14503
Q99626
P17676
O54751
Q9UBX2
O00716
Q16254
O75461
P11161
P32519
P11308
Q01543
P01100

The UniProt accession → gene name mappings (`uniprot_gene_mapping.txt`, read in the next cell) were retrieved via:

http://www.uniprot.org/uploadlists/

In [81]:
# UniProt accession (first field) -> gene name (second field), one
# whitespace-delimited row per mapping.
uniprot_geneName_dict = {}
with open('./uniprot_gene_mapping.txt') as f:
    data = f.readlines()
uniprot_geneName_dict.update(
    (fields[0], fields[1])
    for fields in (row.strip().split() for row in data)
)

### Match Gene Name with Mouse Refseq

#### Using RPKM file

In [104]:
# Build {UPPER-CASED gene name or alias -> first-column ID} from the RPKM table.
frame = pd.read_csv('./rpkm.tsv', sep='\t')

# `.ix` has been removed from pandas; `.iloc` selects the first column by
# position, which is what the original call did.
refseqs = frame.iloc[:, 0]
gene_names = frame['Annotation/Divergence'].values
gene_refseq_dict = {}

for ref, gene in zip(refseqs.values, gene_names):
    # Rows without an annotation are read as NaN (a float); skip them.
    if not isinstance(gene, str):
        continue
    # The annotation field is '|'-delimited; register every piece as a key.
    # When a piece repeats across rows, the last row wins.
    for name in gene.split('|'):
        gene_refseq_dict[name.upper()] = ref

In [161]:
# Spot check: list, in sorted order, every key whose first character is 'R'.
r_gene_keys = sorted(key for key in gene_refseq_dict.keys() if key[0] == 'R')
for g in r_gene_keys:
    print(g)

R
R-CAD
R-CADH
R-PK
R-PTP-DELTA
R-PTP-O
R-PTP-PSI
R-PTP-S
R-PTP-T
R-PTP-U
R-PTP-ZETA
R-SPONDIN
R00504
R124
R16
R2
R21
R26
R26-EGFP
R27090_2
R29144/1
R32184_3
R35
R3F
R3HCC1
R3HCC1L
R3HDM
R3HDM1
R3HDM2
R3HDM3
R3HDM4
R3HDML
R4B2
R51H2
R51H3
R74613
R74621
R74626
R74628
R74630
R74640
R74645
R74651
R74653
R74677
R74690
R74720
R74724
R74726
R74732
R74740
R74756
R74766
R74783
R74805
R74807
R74815
R74819
R74825
R74830
R74833
R74844
R74849
R74856
R74860
R74862
R74866
R74877
R74903
R74911
R74921
R74924
R74941
R74955
R74975
R74981
R74983
R74989
R74996
R75000
R75019
R75022
R75030
R75047
R75054
R75064
R75066
R75070
R75078
R75094
R75096
R75106
R75121
R75137
R75140
R75142
R75148
R75150
R75156
R75157
R75174
R75178
R75185
R75201
R75218
R75223
R75228
R75232
R75240
R75241
R75243
R75250
R75254
R75280
R75284
R75289
R75297
R75304
R75323
R75334
R75336
R75353
R75359
R75364
R75368
R75370
R75373
R75378
R75380
R75390
R75394
R75400
R75405
R75422
R75424
R75430
R75447
R75468
R75484
R75501
R75505
R75514
R75516
R7552

In [164]:
# Convert every PFM record into one tab-delimited '<motif_name>.motif' file of
# column-normalised frequencies under ./converted_motifs/.
output_path = './converted_motifs/'
if not os.path.isdir(output_path):
    os.mkdir(output_path)


def _parse_count_row(row_line):
    """Return the numbers of one PFM row as a float array.

    The first 4 and last 2 characters of the line are discarded before
    splitting on whitespace (presumably the base label and the surrounding
    brackets -- confirm against the input file).
    """
    return np.array([float(x) for x in row_line[4:-2].split()])


# Records are read in groups of 5 lines: header, then the A, C, G and T rows.
for record_start in range(0, len(nonempty_lines), 5):
    name_line = nonempty_lines[record_start]
    freqs = np.array([_parse_count_row(nonempty_lines[record_start + offset])
                      for offset in range(1, 5)])
    # Normalise each position to sum to 1, then transpose so that each
    # row is one position with its A, C, G, T values.
    normed_freqs = (freqs / freqs.sum(axis=0)).T

    name_tokens = name_line.strip().split()
    motif_name = name_tokens[-1]
    motif_id = name_tokens[-2][1:].split('.')[0]

    # get uniprot
    index = motifID_index_dict[motif_id]
    uniprot = index_uniprot_dict[index]
    geneName = uniprot_geneName_dict[uniprot]
    geneNameTokens = geneName.upper().split('/')
    # Use the first alias found in the expression table. Previously the last
    # alias always overwrote the result, so an earlier match could be lost
    # and "not expressed" printed even though one alias was present.
    matched_aliases = [gn for gn in geneNameTokens if gn in gene_refseq_dict]
    if matched_aliases:
        refseq = gene_refseq_dict[matched_aliases[0]]
    else:
        refseq = 'None'
        print(geneName,'not expressed')

    # Fall back to the motif name when no family annotation exists.
    family = index_family_dict.get(index, motif_name)

    # The context manager closes the file even if a write fails.
    with open(output_path + '/' + motif_name + '.motif', 'w') as out_file:
        # NOTE(review): '\n' is joined like a field, so the header ends with
        # '\t\n'. Left unchanged because the downstream scripts may expect
        # this layout -- confirm before removing the trailing tab.
        out_file.write('\t'.join([motif_id, motif_name, family, geneName, '\n']))
        for position_row in normed_freqs:
            out_file.write('\t'.join([str(x) for x in position_row]) + '\n')

ZNF263 not expressed
mix-a not expressed
SHOX not expressed
RAX2 not expressed
RHOXF1 not expressed
VENTX not expressed
ZBED1 not expressed
POU5F1B not expressed
DUXA not expressed


### Score Motifs

In [122]:
# Shell out to the external score_motifs.py script, passing
# ./motif_similarity_scores (presumably the output directory -- confirm),
# -num_procs 48, and every file matching ./converted_motifs/*motif.
!/gpfs/data01/glasslab/home/jtao/code/tba/score_motifs.py ./motif_similarity_scores -num_procs 48 ./converted_motifs/*motif

Reading motif files...
Calculating alignments between motifs and scoring motifs
Creating visualizations...
Serializing scores... 
correlation should be used for clustering.


In [173]:
# Shell out to the external threshold_cluster_motifs.py script with the
# correlation scores written by the scoring step, ./clustered_motifs, 0.9 and
# the converted motif files.
# NOTE(review): 0.9 looks like the similarity threshold and ./clustered_motifs
# the output directory -- confirm against the script's usage text.
! /gpfs/data01/glasslab/home/jtao/code/tba/threshold_cluster_motifs.py ./motif_similarity_scores/correlation.npz ./clustered_motifs 0.9 ./converted_motifs/*motif