In [19]:
import os
import gzip


In [33]:
# extracted from 
# https://www.nature.com/articles/s41587-018-0008-8#MOESM4
#
# https://static-content.springer.com/esm/art%3A10caevow9Voomu.1038%2Fs41587-018-0008-8/MediaObjects/41587_2018_8_MOESM7_ESM.xlsx
valid_species = [
    "Acidaminococcus sp. BV3L6",
    "Alistipes indistinctus",
    "Alistipes sp. CAG:29",
    "Alistipes sp. CAG:53",
    "Anaerofustis stercorihominis",
    "Anaerostipes sp. CAG:276",
    "Bacillus cereus",
    "Bacillus coagulans",
    "Bacillus licheniformis",
    "Bacillus sonorensis",
    "Bacteroides caccae",
    "Bacteroides cellulosilyticus",
    "Bacteroides clarus",
    "Bacteroides coprocola",
    "Bacteroides coprophilus",
    "Bacteroides dorei",
    "Bacteroides eggerthii",
    "Bacteroides faecis",
    "Bacteroides fragilis",
    "Bacteroides intestinalis",
    "Bacteroides nordii",
    "Bacteroides oleiciplenus",
    "Bacteroides ovatus",
    "Bacteroides plebeius",
    "Bacteroides salyersiae",
    "Bacteroides sp. 1_1_14",
    "Bacteroides sp. 1_1_6",
    "Bacteroides sp. 2_1_7",
    "Bacteroides sp. 3_1_23",
    "Bacteroides sp. 3_1_33FAA",
    "Bacteroides sp. 4_1_36",
    "Bacteroides sp. 4_3_47FAA",
    "Bacteroides sp. CAG:20",
    "Bacteroides sp. CAG:98",
    "Bacteroides sp. D1",
    "Bacteroides sp. D20",
    "Bacteroides stercorirosoris",
    "Bacteroides stercoris",
    "Bacteroides thetaiotaomicron",
    "Bacteroides uniformis",
    "Bacteroides vulgatus",
    "Bacteroides xylanisolvens",
    "Bifidobacterium adolescentis",
    "Bifidobacterium animalis",
    "Bifidobacterium bifidum",
    "Bifidobacterium longum",
    "Bifidobacterium pseudocatenulatum",
    "Bifidobacterium pseudolongum",
    "Bifidobacterium stercoris",
    "Blautia sp. CAG:237",
    "Blautia sp. CAG:37",
    "Blautia sp. CAG:52",
    "Burkholderiales bacterium 1 1 47",
    "butyrate-producing bacterium SS3/4",
    "Butyricimonas virosa",
    "Catenibacterium sp. CAG:290",
    "Citrobacter sp. 30_2",
    "Clostridiales bacterium 1_7_47FAA",
    "Clostridiales bacterium VE202-04",
    "Clostridium asparagiforme",
    "Clostridium bolteae",
    "Clostridium butyricum",
    "Clostridium citroniae",
    "Clostridium clostridioforme",
    "Clostridium hathewayi",
    "Clostridium innocuum",
    "Clostridium leptum",
    "Clostridium nexile",
    "Clostridium perfringens",
    "Clostridium sordellii",
    "Clostridium sp. CAG:122",
    "Clostridium sp. CAG:127",
    "Clostridium sp. CAG:149",
    "Clostridium sp. CAG:264",
    "Clostridium sp. CAG:277",
    "Clostridium sp. CAG:352",
    "Clostridium sp. CAG:43",
    "Clostridium sp. CAG:62",
    "Clostridium sp. CAG:7",
    "Clostridium sp. CAG:75",
    "Clostridium sp. CAG:91",
    "Clostridium sp. L2-50",
    "Clostridium spiroforme",
    "Clostridium symbiosum",
    "Collinsella intestinalis",
    "Collinsella tanakaei",
    "Coprobacillus sp. 29_1",
    "Coprobacillus sp. CAG:183",
    "Coprobacillus sp. CAG:235",
    "Coprobacillus sp. D6",
    "Coprococcus catus",
    "Coprococcus comes",
    "Coprococcus eutactus",
    "Coprococcus sp. ART55/1",
    "Coprococcus sp. CAG:131",
    "Coprococcus sp. CAG:782",
    "Coprococcus sp. HPP0048",
    "Coprococcus sp. HPP0074",
    "Dielma fastidiosa",
    "Dorea formicigenerans",
    "Dorea longicatena",
    "Dorea sp. CAG:105",
    "Eggerthella lenta",
    "Enterobacter cloacae",
    "Enterobacter sp. MGH 22",
    "Enterococcus asini",
    "Enterococcus avium",
    "Enterococcus casseliflavus",
    "Enterococcus durans",
    "Enterococcus faecalis",
    "Enterococcus saccharolyticus",
    "Erysipelatoclostridium ramosum",
    "Erysipelotrichaceae bacterium 3_1_53",
    "Escherichia coli",
    "Eubacterium biforme",
    "Eubacterium dolichum",
    "Eubacterium eligens",
    "Eubacterium hallii",
    "Eubacterium rectale",
    "Eubacterium sp. 3_1_31",
    "Eubacterium sp. CAG:156",
    "Eubacterium sp. CAG:161",
    "Eubacterium sp. CAG:192",
    "Eubacterium ventriosum",
    "Faecalibacterium cf. prausnitzii KLE1255",
    "Faecalibacterium prausnitzii",
    "Faecalibacterium sp. CAG:82",
    "Firmicutes bacterium CAG:102",
    "Firmicutes bacterium CAG:212",
    "Firmicutes bacterium CAG:227",
    "Firmicutes bacterium CAG:24",
    "Firmicutes bacterium CAG:270",
    "Firmicutes bacterium CAG:41",
    "Firmicutes bacterium CAG:536",
    "Firmicutes bacterium CAG:65",
    "Fusobacterium mortiferum",
    "Fusobacterium ulcerans",
    "Fusobacterium varium",
    "Holdemania filiformis",
    "Klebsiella pneumoniae",
    "Lachnospiraceae bacterium 1_4_56FAA",
    "Lachnospiraceae bacterium 2_1_58FAA",
    "Lachnospiraceae bacterium 3_1_57FAA_CT1",
    "Lachnospiraceae bacterium 6_1_63FAA",
    "Lachnospiraceae bacterium 9_1_43BFAA",
    "Lachnospiraceae bacterium CAG:25",
    "Lactobacillus amylovorus",
    "Lactobacillus casei",
    "Lactobacillus fermentum",
    "Lactobacillus gasseri",
    "Lactobacillus plantarum",
    "Lactobacillus ruminis",
    "Lactobacillus salivarius",
    "Lactococcus garvieae",
    "Lactococcus lactis",
    "Megamonas funiformis",
    "Megamonas rupellensis",
    "Mitsuokella multacida",
    "Odoribacter sp. CAG:788",
    "Odoribacter splanchnicus",
    "Paenibacillus polymyxa",
    "Parabacteroides distasonis",
    "Parabacteroides gordonii",
    "Parabacteroides merdae",
    "Parabacteroides sp. 20_3",
    "Parabacteroides sp. ASF519",
    "Paraprevotella clara",
    "Prevotella copri",
    "Prevotella disiens",
    "Prevotella sp. CAG:604",
    "Prevotella stercorea",
    "Propionibacterium sp. KPL2009",
    "Roseburia hominis",
    "Roseburia intestinalis",
    "Roseburia inulinivorans",
    "Roseburia sp. CAG:18",
    "Roseburia sp. CAG:380",
    "Roseburia sp. CAG:45",
    "Ruminococcus bromii",
    "Ruminococcus gnavus",
    "Ruminococcus lactaris",
    "Ruminococcus obeum",
    "Ruminococcus sp. 5_1_39BFAA",
    "Ruminococcus sp. CAG:108",
    "Ruminococcus sp. CAG:17",
    "Ruminococcus sp. CAG:55",
    "Ruminococcus sp. CAG:57",
    "Ruminococcus sp. CAG:60",
    "Ruminococcus sp. CAG:9",
    "Ruminococcus sp. CAG:90",
    "Ruminococcus torques",
    "Solobacterium moorei",
    "Staphylococcus warneri",
    "Streptococcus anginosus",
    "Streptococcus equinus",
    "Streptococcus gordonii",
    "Streptococcus lutetiensis",
    "Streptococcus lutetiensis",
    "Streptococcus mutans",
    "Streptococcus parasanguinis",
    "Streptococcus pasteurianus",
    "Streptococcus salivarius",
    "Streptococcus sp. I-P16",
    "Streptococcus vestibularis",
    "Subdoligranulum sp. 4_3_54A2FAA",
    "Tannerella sp. CAG:51",
    "unclassified",
    "Veillonella atypica",
    "Veillonella parvula",
    "Veillonella sp. 3_1_44",
    "Veillonella sp. 6_1_27",
    "Weissella cibaria",
    "Weissella confusa"
]

In [42]:
for fname in os.listdir("refseq/release/bacteria/"):
    print(fname)
    with gzip.open("refseq/release/bacteria/"+fname, "r") as f:
        lines = [l for l in f.read().decode("utf-8").split("\n")]
        print_seq = False
        
        with open("gut_bacteria.fa", "a") as gf:
            for line in lines:
                if len(line) > 0 and line[0] == ">":
                    if any(specie in line for specie in valid_species):
                        gf.write(line+"\n")
                        print_seq = True
                    else:
                        print_seq = False
                else:
                    if print_seq:
                        gf.write(line+"\n")


bacteria.590.1.genomic.fna.gz
bacteria.582.1.genomic.fna.gz
bacteria.860.1.genomic.fna.gz
bacteria.684.1.genomic.fna.gz
bacteria.1321.1.genomic.fna.gz
bacteria.1274.1.genomic.fna.gz
bacteria.212.1.genomic.fna.gz
bacteria.1316.1.genomic.fna.gz
bacteria.153.1.genomic.fna.gz
bacteria.606.1.genomic.fna.gz
bacteria.426.1.genomic.fna.gz
bacteria.1513.1.genomic.fna.gz
bacteria.496.1.genomic.fna.gz
bacteria.1620.1.genomic.fna.gz
bacteria.603.1.genomic.fna.gz
bacteria.1305.1.genomic.fna.gz
bacteria.952.1.genomic.fna.gz
bacteria.1304.1.genomic.fna.gz
bacteria.1719.1.genomic.fna.gz
bacteria.1732.1.genomic.fna.gz
bacteria.446.1.genomic.fna.gz
bacteria.1524.1.genomic.fna.gz
bacteria.751.1.genomic.fna.gz
bacteria.1956.1.genomic.fna.gz
bacteria.32.1.genomic.fna.gz
bacteria.935.1.genomic.fna.gz
bacteria.1552.1.genomic.fna.gz
bacteria.261.1.genomic.fna.gz
bacteria.642.1.genomic.fna.gz
bacteria.1143.1.genomic.fna.gz
bacteria.1953.1.genomic.fna.gz
bacteria.1161.1.genomic.fna.gz
bacteria.1865.1.genomic.fn

KeyboardInterrupt: 

In [38]:
print(len(os.listdir("refseq/release/bacteria/")))

961
