-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
113 lines (90 loc) · 3.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import collections, csv, json, os.path, subprocess
from pprint import pprint
import numpy as np
from fasta_parser import FastaParser
from sequence_analyzer import DNAAnalyzer
from classifiers import GeneNameClassifier, RTEClassifier
from gene_grouper import GeneGrouper
from utils import parse_filters
def group_genes(Classifier, genes, fname_out):
""" Group genes given in filename and save results elsewhere
"""
gegro = GeneGrouper(Classifier)
genes = Classifier.preprocess(genes)
groups = gegro.group(genes)
foo = []
filter_stats = collections.defaultdict(int)
dnana = DNAAnalyzer(strict=False)
for group_name, group_genes in groups.items():
# apply post-annotation filters
filters = parse_filters(post_annotation=True)
genes = []
for gene in group_genes:
skip = False
for f in filters:
if not f.skip and not f().apply(gene):
filter_stats[f.__name__] += 1
skip = True
if skip: continue
genes.append(gene)
if len(genes) == 0: continue
# compute codon usage
cum_codu = dnana.get_cum_codon_usage(genes)
foo.append({
'group': group_name,
'cumulative_codon_usage': cum_codu
})
if len(filter_stats) > 0: print('Post-Annotation filters:')
for k, v in filter_stats.items(): print(' ', k, '->', v)
json.dump(foo, open(os.path.join(Classifier.RESULTS_DIR, fname_out), 'w'))
#pprint(foo)
def plot_grouped_genes():
print('Plotting...')
subprocess.check_call(['Rscript', 'plotting/plotter.R'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
def save_to_csv():
aa_dict = {
'V': ['GTT', 'GTA', 'GTC', 'GTG'],
'A': ['GCT', 'GCC', 'GCG', 'GCA'],
'C': ['TGT', 'TGC'],
'Y': ['TAT', 'TAC'],
'W': ['TGG'],
'N': ['AAT', 'AAC'],
'T': ['ACT', 'ACC', 'ACG', 'ACA'],
'Q': ['CAG', 'CAA'],
'L': ['CTC', 'CTA', 'CTT', 'CTG', 'TTA', 'TTG'],
'S': ['AGC', 'TCT', 'TCC', 'TCG', 'AGT', 'TCA'],
'E': ['GAA', 'GAG'],
'R': ['CGC', 'CGA', 'CGT', 'AGG', 'CGG', 'AGA'],
'H': ['CAC', 'CAT'],
'F': ['TTC', 'TTT'],
'D': ['GAT', 'GAC'],
'M': ['ATG'],
'K': ['AAA', 'AAG'],
'*': ['TAG', 'TAA', 'TGA'],
'P': ['CCA', 'CCC', 'CCT', 'CCG'],
'I': ['ATT', 'ATC', 'ATA'],
'G': ['GGT', 'GGG', 'GGA', 'GGC'],
}
with open('results/grouped_genes.json', 'r') as fd:
content = json.load(fd)
with open('out.csv', 'w') as fd:
writer = csv.writer(fd)
writer.writerow(['group', 'amino_acid', 'codon', 'codon_usage'])
for entry in content:
group = entry['group']
cuco = entry['cumulative_codon_usage']
for aa, cods in aa_dict.items():
for c in cods:
codu = np.mean(cuco[c])
writer.writerow([group, aa, c, codu])
def apply_procedure(Classifier):
farser = FastaParser(Classifier.data_file)
genes = farser.parse()
group_genes(Classifier, genes, 'grouped_genes.json')
plot_grouped_genes()
#save_to_csv()
def main():
apply_procedure(GeneNameClassifier)
#apply_procedure(RTEClassifier)
if __name__ == '__main__':
main()