forked from biopython/biopython
/
test_SeqUtils.py
154 lines (130 loc) · 6.87 KB
/
test_SeqUtils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Copyright 2007-2009 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
import os
import unittest
from Bio import SeqIO
from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq, MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import GC, seq1, seq3
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
def u_crc32(seq):
# NOTE - On Python 2 crc32 could return a signed int, but on Python 3 it is
# always unsigned
# Docs suggest should use crc32(x) & 0xffffffff for consistency.
return crc32(seq) & 0xffffffff
def simple_LCC(s):
# Avoid cross platforms with printing floats by doing conversion explicitly
return "%0.2f" % lcc_simp(s)
def windowed_LCC(s):
return ", ".join("%0.2f" % v for v in lcc_mult(s, 20))
class SeqUtilsTests(unittest.TestCase):
def setUp(self):
# Example of crc64 collision from Sebastian Bassi using the
# immunoglobulin lambda light chain variable region from Homo sapiens
# Both sequences share the same CRC64 checksum: 44CAAD88706CC153
self.str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+ "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+ "YCSSYAGSSTLVFGGGTKLTVL"
self.str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+ "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+ "YCCSYAGSSTWVFGGGTKLTVL"
def test_codon_usage_ecoli(self):
"""Test Codon Adaptation Index (CAI) using default E. coli data."""
CAI = CodonAdaptationIndex()
self.assertEqual("%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
"0.09978")
def test_codon_usage_custom(self):
"""Test Codon Adaptation Index (CAI) using FASTA file for background."""
# We need a FASTA file of CDS sequences to count the codon usage...
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
record = SeqIO.read(dna_genbank_filename, "genbank")
records = []
for feature in record.features:
if feature.type == "CDS" and not feature.sub_features:
start = feature.location.start.position
end = feature.location.end.position
table = int(feature.qualifiers["transl_table"][0])
if feature.strand == -1:
seq = record.seq[start:end].reverse_complement()
else:
seq = record.seq[start:end]
# Double check we have the CDS sequence expected
# TODO - Use any cds_start option if/when added to deal with the met
a = "M" + str(seq[3:].translate(table))
b = feature.qualifiers["translation"][0] + "*"
self.assertEqual(a, b, "%r vs %r" % (a, b))
records.append(SeqRecord(seq, id=feature.qualifiers["protein_id"][0],
description=feature.qualifiers["product"][0]))
with open(dna_fasta_filename, "w") as handle:
SeqIO.write(records, handle, "fasta")
CAI = CodonAdaptationIndex()
# Note - this needs a FASTA file which containing non-ambiguous DNA coding
# sequences - which should each be a whole number of codons.
CAI.generate_index(dna_fasta_filename)
# Now check codon usage index (CAI) using this species
self.assertEqual(record.annotations["source"],
"Yersinia pestis biovar Microtus str. 91001")
self.assertEqual("%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
"0.67213")
os.remove(dna_fasta_filename)
def test_crc_checksum_collision(self):
# Explicit testing of crc64 collision:
self.assertNotEqual(self.str_light_chain_one, self.str_light_chain_two)
self.assertNotEqual(crc32(self.str_light_chain_one), crc32(self.str_light_chain_two))
self.assertEqual(crc64(self.str_light_chain_one), crc64(self.str_light_chain_two))
self.assertNotEqual(gcg(self.str_light_chain_one), gcg(self.str_light_chain_two))
self.assertNotEqual(seguid(self.str_light_chain_one), seguid(self.str_light_chain_two))
def seq_checksums(self, seq_str, exp_crc32, exp_crc64, exp_gcg, exp_seguid,
exp_simple_LCC, exp_window_LCC):
for s in [seq_str,
Seq(seq_str, single_letter_alphabet),
MutableSeq(seq_str, single_letter_alphabet)]:
self.assertEqual(exp_crc32, u_crc32(s))
self.assertEqual(exp_crc64, crc64(s))
self.assertEqual(exp_gcg, gcg(s))
self.assertEqual(exp_seguid, seguid(s))
self.assertEqual(exp_simple_LCC, simple_LCC(s))
self.assertEqual(exp_window_LCC, windowed_LCC(s))
def test_checksum1(self):
self.seq_checksums(self.str_light_chain_one,
2994980265,
"CRC-44CAAD88706CC153",
9729,
"BpBeDdcNUYNsdk46JoJdw7Pd3BI",
"1.03",
"0.00, 1.00, 0.96, 0.96, 0.96, 0.65, 0.43, 0.35, 0.35, 0.35, 0.35, 0.53, 0.59, 0.26")
def test_checksum2(self):
self.seq_checksums(self.str_light_chain_two,
802105214,
"CRC-44CAAD88706CC153",
9647,
"X5XEaayob1nZLOc7eVT9qyczarY",
"1.07",
"0.00, 1.00, 0.96, 0.96, 0.96, 0.65, 0.43, 0.35, 0.35, 0.35, 0.35, 0.53, 0.59, 0.26")
def test_checksum3(self):
self.seq_checksums("ATGCGTATCGATCGCGATACGATTAGGCGGAT",
817679856,
"CRC-6234FF451DC6DFC6",
7959,
"8WCUbVjBgiRmM10gfR7XJNjbwnE",
"1.98",
"0.00, 2.00, 1.99, 1.99, 2.00, 1.99, 1.97, 1.99, 1.99, 1.99, 1.96, 1.96, 1.96, 1.96")
def test_GC(self):
seq = "ACGGGCTACCGTATAGGCAAGAGATGATGCCC"
self.assertEqual(GC(seq), 56.25)
def test_seq1_seq3(self):
s3 = "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr"
s1 = "MAYWCTKLIGPQNAPYKSHWKT"
self.assertEqual(seq1(s3), s1)
self.assertEqual(seq3(s1).upper(), s3.upper())
self.assertEqual(seq1(seq3(s1)), s1)
self.assertEqual(seq3(seq1(s3)).upper(), s3.upper())
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)