In [1]:
import requests
import pandas as pd
import numpy as np
import fileinput

* Options:
    * We want to be able to either save the results to a file or return them to the caller.
    * We want streaming to be optional; when streaming, the results should only be written to a file, because streaming them and then loading everything back into memory would defeat the purpose.
    * That said, the pangenome builder probably loads the whole thing into RAM anyway.

In [5]:
def genome_id_features(genome_id_files, genome_ids=None, limit=2500000, stream=False, id_field="accession"):
    """Yield feature rows from the PATRIC ``genome_feature`` API as TSV lines.

    The identifiers to query come from ``genome_ids`` when it is non-empty.
    Otherwise they are read from ``genome_id_files``, or from
    ``fileinput.input()`` when ``genome_id_files`` is empty or starts with
    ``"-"``.  Each input line may hold several identifiers separated by commas
    or tabs.

    Args:
        genome_id_files: Sequence of paths to read identifiers from; only
            consulted when ``genome_ids`` is empty.
        genome_ids: Optional iterable of identifier strings.  It is copied and
            never mutated.
        limit: Value placed in the RQL ``limit(...)`` clause.
        stream: Forwarded to ``requests.post(stream=...)``.
        id_field: Name of the field the identifiers are matched against in the
            ``in(...)`` selector.  Defaults to ``"accession"``, which is what
            this function has always filtered on; pass ``"genome_id"`` to
            filter by genome ID instead.

    Yields:
        str: One line of the TSV response (header line first) with every
        double-quote character removed.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # Work on a private copy: the previous mutable default ([]) was appended
    # to, so identifiers read on one call leaked into every later call.
    ids = list(genome_ids) if genome_ids else []
    if not ids:
        if len(genome_id_files) == 0 or genome_id_files[0] == "-":
            # NOTE(review): fileinput.input() reads the files named in
            # sys.argv[1:] (stdin if none), not genome_id_files - confirm this
            # is the intended source.
            ids.extend(_iter_ids(fileinput.input()))
        else:
            for path in genome_id_files:
                with open(path) as src:
                    ids.extend(_iter_ids(src))

    if not ids:
        # Nothing to look up; an empty in(...) clause is not a meaningful query.
        return

#     selectors = ["ne(feature_type)","eq(annotation,PATRIC)",f"in(genome_id,({','.join(genome_ids)}))"]
    selectors = ["ne(feature_type)", "eq(annotation,PATRIC)", f"in({id_field},({','.join(ids)}))"]
    genomes = f"and({','.join(selectors)})"
    limit_clause = f"limit({limit})"
    select = "select(genome_id,genome_name,accession,annotation,feature_type,patric_id,refseq_locus_tag,alt_locus_tag,uniprotkb_accession,start,end,strand,na_length,gene,product,figfam_id,plfam_id,pgfam_id,go,ec,pathway,aa_sequence_md5)&sort(+genome_id,+sequence_id,+start)"
    base = "https://www.patricbrc.org/api/genome_feature/"
    query = "&".join([genomes, limit_clause, select])
    headers = {"accept":"text/tsv", "content-type": "application/rqlquery+x-www-form-urlencoded"}

    r = requests.post(url=base, data=query, headers=headers, stream=stream)
    try:
        # Fail loudly instead of yielding an error body as if it were data.
        r.raise_for_status()
        if r.encoding is None:
            r.encoding = "utf-8"
        for line in r.iter_lines(decode_unicode=True):
            yield line.replace('"', '')
    finally:
        # Runs even when the consumer stops iterating early, so a streamed
        # connection is always released.
        r.close()


def _iter_ids(lines):
    """Yield the non-empty, stripped identifiers found in comma- or tab-separated lines."""
    for line in lines:
        delim = "," if "," in line else "\t"
        for token in line.strip().split(delim):
            token = token.strip()
            if token:
                yield token
    

In [26]:
def genome_ids_from_accessions(accessions=None, limit=2500000, stream=False):
    """Return the distinct genome IDs whose features match the given accessions.

    Queries the PATRIC ``genome_feature`` API for every feature whose
    ``accession`` is in ``accessions`` and collects the first TSV column
    (``genome_id``) of each returned row.

    Args:
        accessions: Iterable of accession strings.  ``None`` or an empty
            iterable returns ``[]`` without contacting the API.
        limit: Value placed in the RQL ``limit(...)`` clause.
        stream: Forwarded to ``requests.post(stream=...)``.

    Returns:
        list[str]: Unique genome IDs, sorted so the result is deterministic.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    wanted = list(accessions) if accessions else []
    if not wanted:
        # An empty in(...) clause is not a meaningful query.
        return []

#     selectors = ["ne(feature_type)","eq(annotation,PATRIC)",f"in(genome_id,({','.join(genome_ids)}))"]
    selectors = ["ne(feature_type)", "eq(annotation,PATRIC)", f"in(accession,({','.join(wanted)}))"]
    genomes = f"and({','.join(selectors)})"
    limit_clause = f"limit({limit})"
    select = "select(genome_id,genome_name,accession,annotation,feature_type,patric_id,refseq_locus_tag,alt_locus_tag,uniprotkb_accession,start,end,strand,na_length,gene,product,figfam_id,plfam_id,pgfam_id,go,ec,pathway,aa_sequence_md5)&sort(+genome_id,+sequence_id,+start)"
    base = "https://www.patricbrc.org/api/genome_feature/"
    query = "&".join([genomes, limit_clause, select])
    headers = {"accept":"text/tsv", "content-type": "application/rqlquery+x-www-form-urlencoded"}

    found = set()

    r = requests.post(url=base, data=query, headers=headers, stream=stream)
    try:
        # Fail loudly instead of parsing an error body as if it were data.
        r.raise_for_status()
        if r.encoding is None:
            r.encoding = "utf-8"
        for ix, line in enumerate(r.iter_lines(decode_unicode=True)):
            if ix == 0:
                continue  # first line is the TSV header
            # Split on the tab delimiter only: whitespace splitting raised
            # IndexError on blank lines and picked up the wrong column when
            # the first one was empty.
            genome_id = line.replace('"', '').split("\t", 1)[0].strip()
            if genome_id:
                found.add(genome_id)
    finally:
        r.close()
    return sorted(found)

In [27]:
# Example: look up the distinct genome IDs for three accessions.
genome_ids_from_accessions(['NC_002944', 'NC_008595', 'NC_008705'])

['189918.11', '243243.7', '262316.17']

In [6]:
# Preview the first lines yielded by genome_id_features for one identifier.
# NOTE(review): "test" is opened in write mode (created/truncated) but nothing is
# written to it while the dest.write line below stays commented out.
# NOTE(review): '189918.11' looks like a genome ID, while genome_id_features as
# defined in this notebook filters on the accession field - confirm.
with open("test", "w") as dest: 
#     for ix, i in enumerate(genome_id_features("", genome_ids="1041522.3 1078013.3 1078020.3".split())):
    for ix, i in enumerate(genome_id_features("", genome_ids=['189918.11'])):

#         print(type(i))
#         dest.write(i+"\n")
        if ix > 10: break  # stop once 11 lines (ix 0..10) have been printed
        print(i + "\n")  # print() adds its own newline, so the output is double-spaced
    

genome_id	genome_name	accession	annotation	feature_type	patric_id	refseq_locus_tag	alt_locus_tag	uniprotkb_accession	start	end	strand	na_length	gene	product	figfam_id	plfam_id	pgfam_id	go	ec	pathway	aa_sequence_md5

189918.11	Mycobacterium sp. KMS	NC_008703	PATRIC	source					1	302089	+	302089									

189918.11	Mycobacterium sp. KMS	NC_008703	PATRIC	CDS	fig|189918.11.peg.1	Mkms_5495	VBIMycSp70743_0001		7	480	-	474		hypothetical protein	FIG00638284	PLF_1763_00375893	PGF_10456034				f8538fe1e376a62c1ef3ff7f936ad581

189918.11	Mycobacterium sp. KMS	NC_008703	PATRIC	CDS	fig|189918.11.peg.2	Mkms_5496	VBIMycSp70743_0002		603	959	+	357		hypothetical protein		PLF_1763_00152303	PGF_12744835				199507e07a9d192ae7c1c1782aa1e8ce

189918.11	Mycobacterium sp. KMS	NC_008703	PATRIC	CDS	fig|189918.11.peg.3	Mkms_5497	VBIMycSp70743_0003		952	2067	+	1116		hypothetical protein	FIG00638284	PLF_1763_00297414	PGF_08225224				1bd30ab959c5eac8a4b9d3f54e3e9b24

189918.11	Mycobacterium sp. KMS	NC_008703	PATRIC	C

In [4]:
# Preview the first lines yielded by genome_id_features for three accessions.
# NOTE(review): "test" is opened in write mode (created/truncated) but nothing is
# written to it while the dest.write line below stays commented out.
# with open("top_3.features.tsv", "w") as dest: 
with open("test", "w") as dest: 
#     for ix, i in enumerate(genome_id_features("", genome_ids="1041522.3 1078013.3 1078020.3".split())):
    for ix, i in enumerate(genome_id_features("", genome_ids=['NC_002944', 'NC_008595', 'NC_008705'])):

#         print(type(i))
#         dest.write(i+"\n")
        if ix > 10: break  # stop once 11 lines (ix 0..10) have been printed
        print(i + "\n")  # print() adds its own newline, so the output is double-spaced
    

genome_id	genome_name	accession	annotation	feature_type	patric_id	refseq_locus_tag	alt_locus_tag	uniprotkb_accession	start	end	strand	na_length	gene	product	figfam_id	plfam_id	pgfam_id	go	ec	pathway	aa_sequence_md5

189918.11	Mycobacterium sp. KMS	NC_008705	PATRIC	source					1	5737227	+	5737227									

189918.11	Mycobacterium sp. KMS	NC_008705	PATRIC	CDS	fig|189918.11.peg.518	Mkms_0001	VBIMycSp70743_0518		98	1111	-	1014		Chromosome (plasmid) partitioning protein ParB	FIG00021843	PLF_1763_00034581	PGF_03475877				471e0626a75941ca3aed5fbe341c0172

189918.11	Mycobacterium sp. KMS	NC_008705	PATRIC	CDS	fig|189918.11.peg.519	Mkms_0002	VBIMycSp70743_0519		1118	2050	-	933		Chromosome (plasmid) partitioning protein ParA	FIG00006461	PLF_1763_00199838	PGF_04370656				1af6c7a7b5c726216d6e95e61794e22c

189918.11	Mycobacterium sp. KMS	NC_008705	PATRIC	CDS	fig|189918.11.peg.520	Mkms_0003	VBIMycSp70743_0520		2122	2799	-	678	gidB	16S rRNA (guanine(527)-N(7))-methyltransferase (EC 2.1.1.170)	FIG010896

In [10]:
# Load the saved feature table, keep only the rows whose pgfam_id is present,
# and write the result back over the same file (the unfiltered file is replaced).
x = pd.read_csv("top_3.features.tsv", sep="\t")
x[x["pgfam_id"].notna()].to_csv("top_3.features.tsv", sep="\t", index=False)

In [10]:
# Preview the first lines yielded by genome_id_features for one identifier.
# NOTE(review): the output file is opened in write mode (created/truncated) but
# nothing is written to it while the dest.write line below stays commented out.
# NOTE(review): the file name mentions 1224150.8 while the query uses 310037.4 -
# confirm which one is intended.
# with open("test.224914.11.features", "w") as dest: 
with open("test.1224150.8.features", "w") as dest: 
#     for ix, i in enumerate(genome_id_features("", genome_ids=["224914.11"])):
    for ix, i in enumerate(genome_id_features("", genome_ids=["310037.4"])):

#         print(type(i))
#         dest.write(i+"\n")
        if ix > 3: break  # stop once 4 lines (ix 0..3) have been printed
        print(i + "\n")  # print() adds its own newline, so the output is double-spaced
    

genome_id	genome_name	accession	annotation	feature_type	patric_id	refseq_locus_tag	alt_locus_tag	uniprotkb_accession	start	end	strand	na_length	gene	product	figfam_id	plfam_id	pgfam_id	go	ec	pathway	aa_sequence_md5	sequence_id

310037.4	Acaryochloris sp. CCMEE 5410	AFEJ01000001	PATRIC	source					1	59354	+	59354										AFEJ01000001

310037.4	Acaryochloris sp. CCMEE 5410	AFEJ01000001	PATRIC	CDS	fig|310037.4.peg.1		VBIAcaSp8704_0001		419	2422	-	2004		hypothetical protein	FIG00638284	PLF_155977_00006203	PGF_08225224				33d7887aa295f9b43ebaa3363b5baf63	AFEJ01000001

310037.4	Acaryochloris sp. CCMEE 5410	AFEJ01000001	PATRIC	CDS	fig|310037.4.peg.2		VBIAcaSp8704_0002		2820	4340	-	1521		Neurosporene C-3',4' desaturase	FIG00006429	PLF_155977_00002547	PGF_00025441				eee62520fafa90f923deb84030d7382f	AFEJ01000001



In [56]:
# Display the current value of the global `i` (the loop variable used by the
# feature-preview cells); what it shows depends on which cell ran last.
i

'224914.11\tBrucella melitensis bv. 1 str. 16M\tNC_003317\tPATRIC\tCDS\tfig|224914.11.peg.4\tBMEI0004\tVBIBruMel92729_0004\t\t3482\t3856\t-\t375\t\tUncharacterized protein YyaL\tFIG00000175\tPLF_234_00000161\tPGF_05864899\t\t\t'

In [39]:
def test(one=1, two=2):
    """Print ``one`` and then ``two``, each on its own line."""
    for value in (one, two):
        print(value)
    
def testg():
    """Generator yielding the integers 0 through 9 in order."""
    yield from range(10)
    
def test2(three=3):
    """Print ``three`` (defaults to 3)."""
    print(three)

# Choose a callable together with its keyword arguments in one step. The
# condition is the literal True, so the first pair is always the one selected.
# Neither func nor kwargs is used again in this cell.
func, kwargs = (test, {"one": 5}) if True else (test2, {"three": 20})

# Drain the generator, printing every value it yields. This rebinds the
# notebook-level names `i` and `x`.
i = testg()
for x in i:
    print(x)

0
1
2
3
4
5
6
7
8
9
