forked from biopython/biopython
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_ncbi_codon_table.py
120 lines (106 loc) · 4.46 KB
/
update_ncbi_codon_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Helper script to update Codon tables from the NCBI.
These tables are based on parsing the NCBI file:
ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
More detailed information about the tables is available here:
https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
This script is used to update Bio/Data/CodonTable.py
Note that the NCBI sometimes revises the older tables,
so don't just add new tables - replace all of them
and check for any differences in the old tables.
"""
import re
# Number of spaces put in front of the continuation lines of a generated
# register_ncbi_table(...) call, i.e. the width of the call prefix.
INDENT = len("register_ncbi_table(")
# Number of spaces put in front of the continuation lines of the generated
# table={...} dictionary literal.
INDENT2 = len("table={") + INDENT
def line_wrap(text, indent=0, max_len=78, string=False):
    """Return text wrapped at spaces so that lines fit within max_len.

    Text that is at most max_len characters long is returned unchanged.
    Longer text is broken at the last space found in its leading chunk; the
    remainder is prefixed with indent spaces and wrapped again in the same
    way until it fits.

    Arguments:
     - text - the single line of text to wrap.
     - indent - number of spaces put in front of every continuation line.
     - max_len - maximum line length, indentation included.
     - string - set to True when the break falls inside text delimited by
       single quotes. Two columns are then kept spare, " '" is added to the
       end of the line and "'" to the start of the next line.

    Returns the wrapped lines joined with newline characters.

    Raises ValueError when a line has no space at which breaking it would
    shorten the remaining text (for example one very long word).
    """
    # Leave room for the closing " '" when splitting quoted text.
    split_len = max_len - 2 if string else max_len
    wrapped = []
    while len(text) > max_len:
        head, tail = text[:split_len], text[split_len:]
        cut = head.rfind(" ")
        if cut < 0:
            raise ValueError("No space to wrap at in %r" % head)
        line, rest = head[:cut], head[cut + 1:]
        if string:
            line += " '"
            rest = "'" + rest
        rest = " " * indent + rest + tail
        # A break inside the indentation just rebuilds the same (or a
        # longer) remainder, so wrapping would never terminate.
        if len(rest) >= len(text):
            raise ValueError("Cannot wrap %r within %d characters"
                             % (text, max_len))
        wrapped.append(line)
        text = rest
    wrapped.append(text)
    return "\n".join(wrapped)
# Banner marking the start of the generated code. The [1:] slice drops the
# newline that directly follows the opening triple quote.
print("""
##########################################################################
# Start of auto-generated output from Scripts/update_ncbi_codon_table.py #
##########################################################################
"""[1:])

# Read the whole NCBI file up front; the context manager closes the handle.
with open("gc.prt") as handle:
    gc_lines = handle.readlines()

version = ''
# Fields of the genetic code table currently being collected. They are
# reset by every " {" line and turned into output by the matching " }" line.
names = []
table_id = None
aa = None
start = None
bases = []

for line in gc_lines:
    # Report the first "-- ... Version" comment only.
    if (not version and line.startswith("--")
            and line[2:].lstrip().startswith("Version")):
        version = line.split('Version', 1)[1].strip()
        print("# Data from NCBI genetic code table version %s\n" % version)

    # Table fields are recognised by the first whitespace-separated token of
    # an indented line, so the match does not depend on how many spaces the
    # file uses for indentation or column alignment.
    fields = line.split() if line[:1] == " " else []
    keyword = fields[0] if fields else None

    if line.startswith(" {"):
        names = []
        table_id = None
        aa = None
        start = None
        bases = []
    elif keyword == "name":
        closed = re.search('"([^"]*)"', line)
        if closed:
            names.append(closed.group(1))
        else:
            # No closing quote: the name continues on the following line.
            names.append(re.search('"(.*)$', line).group(1))
    elif line.strip() == 'Mitochondrial; Mycoplasma; Spiroplasma" ,':
        # Second half of a name that was split over two lines.
        names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
    elif keyword == "id":
        table_id = int(re.search(r"(\d+)", line).group(1))
    elif keyword == "ncbieaa":
        aa = re.search('"([^"]*)"', line).group(1)
    elif keyword == "sncbieaa":
        start = re.search('"([^"]*)"', line).group(1)
    elif (keyword == "--" and len(fields) >= 3
            and fields[1].startswith("Base")):
        # "-- BaseN  <sequence>": the sequence is the last token.
        bases.append(fields[2])
    elif line.startswith(" }"):
        # End of one table: make sure it is complete before emitting it.
        if (not names or table_id is None or aa is None
                or start is None or len(bases) < 3):
            raise ValueError("Incomplete genetic code table (id %r)"
                             % table_id)
        if min(len(seq) for seq in [aa, start] + bases[:3]) < 64:
            raise ValueError("Genetic code table %d does not list 64 codons"
                             % table_id)
        alt_name = names[1] if len(names) > 1 else None
        # Codon i is spelled by the i-th letter of each of the Base lines.
        codons = [bases[0][i] + bases[1][i] + bases[2][i]
                  for i in range(64)]

        # Use %r instead of %s to include the quotes of the string!
        print(line_wrap("register_ncbi_table(name=%r," % names[0],
                        indent=INDENT, string=True))
        print(" " * INDENT + "alt_name=%r, id=%d," % (alt_name, table_id))

        # Emit the codon -> amino acid mapping, starting a new output row
        # whenever the next entry would push the row past 75 characters.
        row = " " * INDENT + "table={"
        for codon, amino in zip(codons, aa):
            if amino == "*":
                continue
            entry = "'%s': '%s', " % (codon, amino)
            if len(row) + len(entry) > 75:
                print(row.rstrip())
                row = " " * INDENT2 + entry
            else:
                row += entry
        print("%s}," % row[:-2])  # remove ', ' from last entry before '}'

        # Stop and start codons are both read from the sncbieaa line,
        # where they are flagged with "*" and "M" respectively.
        stops = [codon for codon, flag in zip(codons, start) if flag == "*"]
        print(line_wrap(" " * INDENT + "stop_codons=%r," % (stops,),
                        indent=INDENT + 13))
        starts = [codon for codon, flag in zip(codons, start) if flag == "M"]
        print(line_wrap(" " * INDENT + "start_codons=%r)" % (starts,),
                        indent=INDENT + 14))
        print("")
    elif line[:2] == "--" or line in ("\n", "}\n",
                                      'Genetic-code-table ::= {\n'):
        # File-level comments, blank lines and the outer braces carry no data.
        pass
    else:
        raise Exception("Unparsed: " + repr(line))

# Banner marking the end of the generated code.
print("""
########################################################################
# End of auto-generated output from Scripts/update_ncbi_codon_table.py #
########################################################################""")