In [3]:
from pyteomics import fasta
from os import path, listdir
import os
import pickle
from collections import Counter

In [2]:
import operator

from ete3 import NCBITaxa

# Shared NCBI taxonomy handle; later cells use it for rank, lineage,
# descendant and name lookups.
ncbi = NCBITaxa()

In [None]:
# Combined UniProt FASTA that is read with fasta.read() and split per taxonomy id.
# NOTE(review): this path lives under /home/fasta while the two output
# directories live under /home/kae-13-1/fasta — confirm this is intentional.
path_to_uniprot = '/home/fasta/uniprot_bacteria.fasta'
# Output directory for the per-taxid FASTA files built from all entries
# (Swiss-Prot + TrEMBL); files are named '<taxid>.fasta'.
# Presumably the directory has to exist before the splitting cell runs — TODO confirm.
path_to_uniprot_dbs = '/home/kae-13-1/fasta/bacts_bases_uniprot/'
# Output directory for the per-taxid FASTA files built from Swiss-Prot ('sp') entries only.
path_to_swissprot_dbs = '/home/kae-13-1/fasta/bacts_bases_sprot/'

# I. Creating organism-specific databases

## Swiss-Prot + TrEMBL

In [None]:
%%time
uniprot_taxid_set = set()
for p in fasta.read(path_to_uniprot):
    spec_i = p[0].split('OX=')[-1].split(' ')[0]
    fasta.write([(p[0], p[1])], output = path.join(path_to_uniprot_dbs, '{}.fasta'.format(spec_i)),
                    file_mode = 'a')
    if spec_i not in taxid_set:
        uniprot_taxid_set.update([int(spec_i)])

## Swiss-Prot

In [None]:
%%time
swissprot_taxid_set = set()
for p in fasta.read(path_to_uniprot):
    if p[0].startswith('sp'):
        spec_i = p[0].split('OX=')[-1].split(' ')[0]
        fasta.write([(p[0], p[1])], output = path.join(path_to_swissprot_dbs, '{}.fasta'.format(spec_i)),
                        file_mode = 'a')
        if spec_i not in swissprot_taxid_set:
            swissprot_taxid_set.update([int(spec_i)])

CPU times: user 6min 6s, sys: 15.4 s, total: 6min 22s

Wall time: 6min 17s

## Calculating number of proteins in databases

In [None]:
%%time
len_fasta_uniprot = {}
for i in uniprot_taxid_set:
    file = path.join(path_to_uniprot_dbs, '{}.fasta'.format())
    # For Windows
    if os.name == 'nt':
        n = sum(1 for _ in fasta.read(file))
    # For Linux
    else:
        n = !grep -o 'OX=' $file | wc -l
    len_fasta_uniprot[i] = int(n[0])

In [None]:
%%time
len_fasta_sprot = {}
for i in swissprot_taxid_set:
    file = path.join(path_to_swissprot_dbs, '{}.fasta'.format(i))
    # For Windows
    if os.name == 'nt':
        n = sum(1 for _ in fasta.read(file))
    # For Linux
    else:
        n = !grep -o 'OX=' $file | wc -l
    len_fasta_sprot[i] = int(n[0])

In [None]:
# Destination files for the per-taxid protein counts.
# Both are placeholders and must be set to real file paths before running.
path_to_len_uniprot = ''
path_to_len_swissprot = ''

# Use context managers so the files are flushed and closed even if
# pickling fails part-way through.
with open(path_to_len_swissprot, 'wb') as out_file:
    pickle.dump(len_fasta_sprot, out_file, protocol=pickle.HIGHEST_PROTOCOL)
with open(path_to_len_uniprot, 'wb') as out_file:
    pickle.dump(len_fasta_uniprot, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# II. Creating species:descendants dictionary

In [None]:
%%time
allowed_ranks = ('strain', 'subspecies', 'forma specialis', 'isolate', 'serotype', 'serogroup')

species_descendants = {}
used = set()

for i in taxid_set:
    if i not in used:
        rank = ncbi.get_rank([i])
        if rank:
            if rank[i] == 'species':
                descendants = ncbi.get_descendant_taxa(i) + [i]
                descendants = [j for j in descendants if j in taxid_set]
                species_descendants[i] = set(descendants)
                used.update(descendants)
            elif rank[int(i)] in allowed_ranks:
                lineage = ncbi.get_lineage(i)
                ranks = ncbi.get_rank(lineage)
                species = [k for k in ranks.keys() if ranks[k] == 'species'][0]
                
                descendants = ncbi.get_descendant_taxa(species) + [species]
                descendants = [j for j in descendants if j in taxid_set]
                species_descendants[species] = set(descendants)
                used.update(descendants)

CPU times: user 10h 11s, sys: 1h 56min 50s, total: 11h 57min 2s

Wall time: 11h 58min 20s

In [None]:
# Destination file for the species -> descendants mapping.
# Placeholder: must be set to a real file path before running.
path_to_species_descendants = ''

# Context manager guarantees the file is flushed and closed.
with open(path_to_species_descendants, 'wb') as out_file:
    pickle.dump(species_descendants, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# III. Choosing group leaders

## Swiss-Prot + TrEMBL

In [None]:
%%time
species_leader_uniprot = {}

for i in species_descendants.keys():
    strains = species_des_dict[i]
    lens = {j:len_fasta_uniprot[j] for j in strains} 
    if len(lens) == 0:
        continue
    else:
        lead = max(lens.items(), key=operator.itemgetter(1))[0]
        species_leader_uniprot[i] = lead

## Swiss-Prot

In [94]:
%%time
species_leader_sprot = {}
for i in species_descendants.keys():
    strains = species_des_dict[i]
    strains = [i for i in strains if i in swissprot_taxid_set]
    lens = {j:len_fasta_sprot[j] for j in strains} 
    if len(lens) == 0:
        continue
    else:
        lead = max(lens.items(), key=operator.itemgetter(1))[0]
        species_leader_sprot[i] = lead

CPU times: user 28.4 s, sys: 1min 14s, total: 1min 42s
Wall time: 1min 42s


In [None]:
# Destination files for the species -> leader mappings.
# Both are placeholders and must be set to real file paths before running.
path_to_leaders_uniprot = ''
path_to_leaders_swissprot = ''

# Context managers guarantee the files are flushed and closed.
with open(path_to_leaders_uniprot, 'wb') as out_file:
    pickle.dump(species_leader_uniprot, out_file, protocol=pickle.HIGHEST_PROTOCOL)
with open(path_to_leaders_swissprot, 'wb') as out_file:
    pickle.dump(species_leader_sprot, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# IV. Excluding 'sp.' and 'uncultured' organisms

In [99]:
%%time
i = 0
exclude_names = set()
leaders = set(species_leader_sprot.values()).union(set(species_leader_uniprot.values()))

for k in leaders:
    name = list(ncbi.get_taxid_translator([k]).values())[0]
    if 'sp.' in name:
        if name.split(' ')[1] == 'sp.':
            exclude_names.update([k])
            i+=1
    if name.startswith('uncultured'):
        exclude_names.update([k])

CPU times: user 2.02 s, sys: 695 ms, total: 2.71 s
Wall time: 3 s


In [None]:
# Destination file for the set of excluded leader taxids.
# Placeholder: must be set to a real file path before running.
path_to_exclude_names = ''

# Context manager guarantees the file is flushed and closed.
with open(path_to_exclude_names, 'wb') as out_file:
    pickle.dump(exclude_names, out_file, protocol=pickle.HIGHEST_PROTOCOL)