-
Notifications
You must be signed in to change notification settings - Fork 1
/
hmmer2go.py
executable file
·177 lines (137 loc) · 5.04 KB
/
hmmer2go.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python2
#-*- coding:utf-8 -*-
"""
HMMER2GO
keith hughitt <khughitt@umd.edu>
2012/10/07
Parses HMMER3 output tables and maps matched Pfam domains to GO terms.
Usage:
hmmer2go.py gene_ontology.1_2.obo hmmer_results.csv
"""
import os
import re
import sys
import csv
from scripts import hmmer
from goatools import obo_parser
def analyze_hmmer_table(go_terms, pfam2go, hmmer_table, go_level=1):
"""Analyzes a given HMMER3 search table for Pfam/GO term matches.
Arguments
---------
go_terms : goatools.obo_parser.GODag
A dictionary of GO terms
pfam2go : dict
Pfam/GO mapping
hmmer_table : str
filepath to a HMMER3 search output table
go_level : int
(Optional) the GO level to summarize
"""
summary = {
"unknown": 0
}
# parse HMMER output and store in data dict
contigs = hmmer.parse_csv(hmmer_table)
# iterate through HMMER output
for contig in contigs:
pfam_domain = contig[2]
# check to see if domain is in Pfam2go
if pfam_domain not in pfam2go:
summary['unknown'] += 1
continue
# otherwise get a list of the associated GO terms
terms = pfam2go[pfam_domain]
# iterate through GO terms for the contig
for t in terms:
go_term = go_terms[t[0]]
# find GO category at the desired level
node = go_term
for i in range(go_term.level - go_level):
node = node.parents[0]
# add to summary table
category = node.name
if not category in summary:
summary[category] = 0
summary[category] += 1
return summary
def load_pfam2go(source=None):
"""Map HMMER3 results to Gene Ontology (GO) terms"""
# download pfam2go mapping if none specified
if source is None:
import urllib2
url = "http://www.geneontology.org/external2go/pfam2go"
raw = urllib2.urlopen(url).read().split("\n")
else:
# otherwise read in local file
fp = open(source, 'r')
raw = fp.readlines()
fp.close()
# parse and store in a python dict
mapping = {}
# pfam2go entry regex
regex = r"Pfam:(\w+) ([\w\-]+) > (GO:[\w\- \(\)\[\]\/\+>,':]+) ; (GO:\d+)"
for line in raw:
# skip comments
if line.startswith("!") or line == '':
continue
# parse a single entry in pfam2go
# ex: Pfam:PF00060 Lig_chan > GO:membrane ; GO:0016020
match = re.match(regex, line)
pfam_id, pfam_name, go_name, go_id = match.groups()
if pfam_name not in mapping:
mapping[pfam_name] = []
mapping[pfam_name].append((go_id, go_name))
return mapping
def write_csv(results, filepath, norm=None):
"""Write GO analysis results to CSV"""
# get a list of all GO categories used
categories = set([])
for data in results.values():
categories = categories.union(data.keys())
# if no normalization requested do not scale results
if norm is None:
scale_factors = {k: 1 for k in results.keys()}
else:
# otherwise divide norm value by the number of Pfam domains used
scale_factors = {}
for species, data in results.items():
# get total number of domains
total = 0
for term, value in data.items():
total += value
# scale factor = NORM / NUM DOMAINS
scale_factors[species] = norm / float(total)
# write results to CSV
writer = csv.writer(open(filepath, 'wt'))
writer.writerow(["category"] + results.keys())
for category in categories:
row = [category]
for species, data in results.items():
# add category count if it exists for the species
if category in data:
row.append(int(round(data[category] * scale_factors[species])))
else:
# otherwise add 0
row.append(0)
# add row to CSV
writer.writerow(row)
def parse_hmmer_table(filepath):
"""Parses a hmmer table and outputs a summary of the Pfam GO associations"""
if __name__ == "__main__":
go_input = "../input/gene_ontology.1_2.obo"
pfam2go_file = "../input/pfam2go.txt"
# load Gene ontology
print("Loading Gene Ontology...")
go_terms = obo_parser.GODag(go_input)
#go_terms = obo_parser.GODag(sys.argv[1])
# load pfam2go
print("Loading Pfam2go...")
pfam2go = load_pfam2go(pfam2go_file)
# map GO terms
results = {}
for filepath in sys.argv[1:]:
name = os.path.splitext(os.path.basename(filepath))[0]
results[name] = analyze_hmmer_table(go_terms, pfam2go, filepath)
# output results to CSV
write_csv(results, "../csv/go_analysis.csv")
write_csv(results, "../csv/go_analysis_normed.csv", norm=5e4)