# Hypothetical proteins

In [1]:
from pathlib import Path
from collections import defaultdict
import pickle

In [2]:
from Bio import AlignIO, SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser 

In [7]:
# Preview the first protein FASTA files in the input directory.
# NOTE(review): `faa_dir` is not defined in any cell shown here; it must be set in an earlier cell before this runs.
ls $faa_dir | head

[0m[01;32mbin3c.174.contigs.faa[0m*
[01;32mbin3c.184.contigs.faa[0m*
[01;32mbin3c.225.contigs.faa[0m*
[01;32mDGYMR06203__metabat2_low_PE.047.contigs.faa[0m*
[01;32mE4_54_1-contigs.faa[0m*
[01;32mF157a_European_Toad__metabat2_high_PE.005.contigs.faa[0m*
[01;32mGCA_000436395.1_MGS154_genomic.faa[0m*
[01;32mGCA_000437075.1_MGS344_genomic.faa[0m*
[01;32mGCA_001917295.1_ASM191729v1_genomic.faa[0m*
[01;32mGCA_002362435.1_ASM236243v1_genomic.faa[0m*


## Create an index for all the proteins of the *Akkermansia* dataset

Indexing is recommended when working with large datasets (this one is not that large). An index is much faster to query than re-parsing hundreds of plain-text FASTA files every time a record is needed.

In [11]:
# Build (or reopen, if the index file already exists) one SQLite-backed index
# covering every protein FASTA file, so records can be fetched by id without
# re-parsing the individual files.
files = list(faa_dir.glob('*.faa'))
index_path = Path("output_dir/faas.idx")
# SQLite cannot create missing parent directories, so make sure it is there.
index_path.parent.mkdir(parents=True, exist_ok=True)
# index_db documents `filenames` as a list of strings, so hand it plain str
# paths; `files` itself stays a list of Path objects for any later cell.
faas_idx = SeqIO.index_db(str(index_path), [str(faa) for faa in files], "fasta")
# The index behaves like a read-only dict: len() gives the number of records
# directly, without going through values().
len(faas_idx)

472482

## How many *hypothetical proteins* according to `prokka`?

In [183]:
# Lazily fetch one record at a time from the index so only the matching
# records end up held in memory.
records = (faas_idx[name] for name in faas_idx)
rec_hypoth = []
# Pre-set the loop counter so the summary print below still works (showing -1)
# if the index happens to be empty, instead of failing on an unbound name.
i = -1
for i, rec in enumerate(records):
    # Drop the first space-delimited token of the description and compare the
    # remainder with the annotation text.
    if rec.description.partition(' ')[2] == 'hypothetical protein':
        rec_hypoth.append(rec)
# First number: zero-based index of the last record visited (total records - 1).
# Second number: how many records are annotated as hypothetical proteins.
print(i, len(rec_hypoth))

472481 263307


## Write all the sequences annotated as *hypothetical proteins* to a multi-fasta file

In [60]:
# Dump every collected record as a two-line FASTA entry: the full description
# as the header, then the whole sequence on a single unwrapped line.
with open('hypothetical_proteins.fa', 'w') as fasta_out:
    for record in rec_hypoth:
        entry = f'>{record.description}\n{record.seq}'
        fasta_out.write(entry + '\n')

Check that the number of records in the written file matches the number of records in `rec_hypoth`

In [69]:
# Number of hypothetical-protein records held in memory.
print(len(rec_hypoth))
# Count the lines of the written file that start with '>' and contain
# 'hypothetical' (rg -c prints the number of matching lines). The two numbers
# should be equal. Requires ripgrep (`rg`) to be installed on the system.
! rg -c '^>.+hypothetical' hypothetical_proteins.fa

263307
263307
