In [1]:
import entrezpy.esearch.esearcher
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import requests
import io

In [2]:
def extract_taxonomy(refseq):
    """Look up the NCBI taxonomy id that UniParc records for an accession.

    Queries the EBI Proteins API ``uniparc/dbreference`` endpoint for
    ``refseq`` and scans the ``property`` list of the first cross-reference
    of the first returned entry for an item of type ``NCBI_taxonomy_id``.

    Parameters
    ----------
    refseq : str
        Accession placed verbatim in the request URL.

    Returns
    -------
    int or None
        The taxonomy id, or ``None`` when the response holds no entry, the
        first entry has no cross-reference / property data, a property item
        lacks the expected keys, or no ``NCBI_taxonomy_id`` item exists.

    Raises
    ------
    requests.HTTPError
        When the service answers with an error status.
    """
    requestURL = f"https://www.ebi.ac.uk/proteins/api/uniparc/dbreference/{refseq:s}?offset=0&size=100&rfDdtype=RefSeq"

    r = requests.get(requestURL, headers={"Accept": "application/json"})

    # No-op for successful responses, raises for 4xx/5xx ones.
    r.raise_for_status()

    try:
        # Any missing level (no entry, empty cross-reference list, absent
        # key) means "no taxonomy information" rather than an error.
        properties = r.json()[0]['dbReference'][0]['property']
        for item in properties:
            if item["type"] == "NCBI_taxonomy_id":
                return int(item['value'])
    except (IndexError, KeyError):
        return None

    # Properties were present but none carried a taxonomy id.
    return None
    
def extract_taxid_from_name(refseq):
    """Fetch the Ensembl taxonomy records that match a name.

    The argument is interpolated unchanged into the ``taxonomy/name/`` path
    of the Ensembl REST service (the notebook passes organism names here,
    despite the parameter being called ``refseq``).

    Returns the decoded JSON body of a successful response, or ``None``
    when the response status is not OK.
    """
    response = requests.get(
        f"https://rest.ensembl.org/taxonomy/name/{refseq}",
        headers={"Accept": "application/json"},
    )
    return response.json() if response.ok else None

def get_taxID(taxa_info):
    """Return the ``'id'`` of the first taxonomy record.

    Parameters
    ----------
    taxa_info : sequence of dict or None
        Records as returned by the name lookup; only the first one is used.

    Returns
    -------
    The ``'id'`` value of ``taxa_info[0]`` exactly as stored in the record,
    or ``None`` when ``taxa_info`` is ``None`` or empty (no match).
    """
    # An empty result set means "no match", same as a failed lookup.
    if not taxa_info:
        return None
    return taxa_info[0]['id']
    
def get_taxonomic_lineage(idx):
    """Fetch the Ensembl taxonomy classification for a taxon id.

    Parameters
    ----------
    idx : int, str or None
        Taxon id placed verbatim in the ``taxonomy/classification/`` URL.

    Returns
    -------
    The decoded JSON body of a successful response, or ``None`` when
    ``idx`` is ``None`` or the response status is not OK.  The notebook's
    rank helpers read the result as a list of records from its end (the
    last record is used as the superkingdom).
    """
    if idx is None:
        # No id was resolved upstream; don't spend a request on ".../None?".
        return None

    url = f"https://rest.ensembl.org//taxonomy/classification/{idx}?"

    r = requests.get(url, headers={"Accept": "application/json"})
    if not r.ok:
        return None
    return r.json()

def get_scientific_name(idx):
    """Return the ``'name'`` field of the Ensembl taxonomy record for an id.

    ``idx`` is interpolated unchanged into the ``taxonomy/id/`` URL.  The
    result is ``None`` when the response status is not OK.
    """
    response = requests.get(
        f"https://rest.ensembl.org/taxonomy/id/{idx}?",
        headers={"Accept": "application/json"},
    )
    if response.ok:
        record = response.json()
        return record['name']
    return None

def get_superkingdom(lineage):
    """Return the ``'scientific_name'`` of the last lineage record.

    Parameters
    ----------
    lineage : sequence of dict or None
        Classification records; the last one is taken as the superkingdom.

    Returns
    -------
    The name, or ``None`` when ``lineage`` is ``None``, empty, or its last
    record has no ``'scientific_name'`` (same fallback as the other rank
    helpers in this notebook).
    """
    if lineage is None:
        return None
    try:
        return lineage[-1]['scientific_name']
    except (IndexError, KeyError, TypeError):
        # Empty or malformed lineage: report "unknown" instead of raising.
        return None

def get_phylum(lineage):
    """Return the ``'scientific_name'`` of the second-to-last lineage record.

    Parameters
    ----------
    lineage : sequence of dict or None
        Classification records, read from the end.

    Returns
    -------
    The name, or ``None`` when ``lineage`` is ``None``, has fewer than two
    records, or the record lacks ``'scientific_name'``.

    Notes
    -----
    NOTE(review): the rank is chosen purely by position; this assumes every
    lineage has the same depth below the superkingdom — confirm.
    """
    if lineage is None:
        return None
    try:
        return lineage[-2]['scientific_name']
    except (IndexError, KeyError, TypeError):
        # Too-short or malformed lineage; other errors are not swallowed.
        return None
    
def get_class(lineage):
    """Return the ``'scientific_name'`` of the third-from-last lineage record.

    Parameters
    ----------
    lineage : sequence of dict or None
        Classification records, read from the end.

    Returns
    -------
    The name, or ``None`` when ``lineage`` is ``None``, has fewer than
    three records, or the record lacks ``'scientific_name'``.

    Notes
    -----
    NOTE(review): the rank is chosen purely by position.  The notebook's
    displayed output for Aquifex_aeolicus_VF5 shows 'Aquificales' in the
    class column, which looks like an order-level name — verify that
    position -3 really is the class for every lineage.
    """
    if lineage is None:
        return None
    try:
        return lineage[-3]['scientific_name']
    except (IndexError, KeyError, TypeError):
        # Too-short or malformed lineage; other errors are not swallowed.
        return None

def get_order(lineage):
    """Return the ``'scientific_name'`` of the fourth-from-last lineage record.

    Parameters
    ----------
    lineage : sequence of dict or None
        Classification records, read from the end.

    Returns
    -------
    The name, or ``None`` when ``lineage`` is ``None``, has fewer than four
    records, or the record lacks ``'scientific_name'``.

    Notes
    -----
    NOTE(review): the rank is chosen purely by position; this assumes every
    lineage has the same depth below the superkingdom — confirm.
    """
    if lineage is None:
        return None
    try:
        return lineage[-4]['scientific_name']
    except (IndexError, KeyError, TypeError):
        # Too-short or malformed lineage; other errors are not swallowed.
        return None
        
def get_family(lineage):
    """Return the ``'scientific_name'`` of the fifth-from-last lineage record.

    Parameters
    ----------
    lineage : sequence of dict or None
        Classification records, read from the end.

    Returns
    -------
    The name, or ``None`` when ``lineage`` is ``None``, has fewer than five
    records, or the record lacks ``'scientific_name'``.

    Notes
    -----
    NOTE(review): the rank is chosen purely by position.  The notebook's
    displayed output for Aquifex_aeolicus_VF5 shows 'Aquifex' in the family
    column, which looks like a genus name — verify that position -5 really
    is the family for every lineage.
    """
    if lineage is None:
        return None
    try:
        return lineage[-5]['scientific_name']
    except (IndexError, KeyError, TypeError):
        # Too-short or malformed lineage; other errors are not swallowed.
        return None

In [16]:
# Read the genome-accession / organism-name mapping. The file is parsed with
# header=None, so the two column labels are assigned explicitly afterwards.
HFSP_phy_genome_org_df = pd.read_csv(
    filepath_or_buffer="/mnt/researchdrive/Kaustubh/HFSP-Flagellar_proteins/data/sequences/uniq_genome_accession_organism_name_mapping.csv",
    header=None,
)
HFSP_phy_genome_org_df.columns = ["Genome_acc_id", "Organism_Name"]
HFSP_phy_genome_org_df

Unnamed: 0,Genome_acc_id,Organism_Name
0,GCF_000005845.2,Escherichia_coli_str._K-12_substr._MG1655
1,GCF_000006765.1,Pseudomonas_aeruginosa_PAO1
2,GCF_000007985.2,Geobacter_sulfurreducens_PCA
3,GCF_000008625.1,Aquifex_aeolicus_VF5
4,GCF_000009045.1,Bacillus_subtilis_subsp._subtilis_str._168
...,...,...
131,GCF_023016405.1,Treponema_pallidum_subsp._pallidum
132,GCF_900093645.1,Chlamydiales_bacterium_SCGC_AB-751-O23
133,GCF_900097105.1,Akkermansia_glycaniphila
134,GCF_900169565.1,Nitrospira_japonica


In [17]:
# Each step derives one new column from an earlier one, in this order:
# name -> taxonomy records -> taxon id -> lineage -> one column per rank helper.
# The first and third steps issue one HTTP request per row.
for new_column, source_column, transform in (
    ("Taxa_info", "Organism_Name", extract_taxid_from_name),
    ("Taxa_id", "Taxa_info", get_taxID),
    ("lineage", "Taxa_id", get_taxonomic_lineage),
    ("superkingdom", "lineage", get_superkingdom),
    ("phylum", "lineage", get_phylum),
    ("class", "lineage", get_class),
    ("order", "lineage", get_order),
    ("family", "lineage", get_family),
):
    HFSP_phy_genome_org_df[new_column] = HFSP_phy_genome_org_df[source_column].apply(transform)

In [18]:
# Show the mapping table with the taxonomy columns added in the previous cell.
HFSP_phy_genome_org_df

Unnamed: 0,Genome_acc_id,Organism_Name,Taxa_info,Taxa_id,lineage,superkingdom,phylum,class,order,family
0,GCF_000005845.2,Escherichia_coli_str._K-12_substr._MG1655,[{'tags': {'synonym': ['Escherichia coli MG165...,511145,"[{'tags': {'name': ['Escherichia'], 'scientifi...",Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae
1,GCF_000006765.1,Pseudomonas_aeruginosa_PAO1,[{'tags': {'name': ['Pseudomonas aeruginosa PA...,208964,"[{'id': '286', 'tags': {'includes': ['Chryseom...",Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae
2,GCF_000007985.2,Geobacter_sulfurreducens_PCA,"[{'parent': {'id': '35554', 'tags': {'name': [...",243231,"[{'parent': {'leaf': 0, 'tags': {'name': ['Geo...",Bacteria,Proteobacteria,Deltaproteobacteria,Desulfuromonadales,Geobacteraceae
3,GCF_000008625.1,Aquifex_aeolicus_VF5,"[{'leaf': 1, 'id': '224324', 'scientific_name'...",224324,"[{'parent': {'name': 'Aquificaceae', 'leaf': 0...",Bacteria,Aquificae,Aquificales,Aquificaceae,Aquifex
4,GCF_000009045.1,Bacillus_subtilis_subsp._subtilis_str._168,"[{'id': '224308', 'children': [{'tags': {'equi...",224308,"[{'parent': {'scientific_name': 'Bacillaceae',...",Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae
...,...,...,...,...,...,...,...,...,...,...
131,GCF_023016405.1,Treponema_pallidum_subsp._pallidum,[{'name': 'Treponema pallidum subsp. pallidum'...,161,"[{'scientific_name': 'Treponema', 'tags': {'na...",Bacteria,Spirochaetes,Spirochaetales,Treponemataceae,Treponema
132,GCF_900093645.1,Chlamydiales_bacterium_SCGC_AB-751-O23,[{'name': 'Chlamydiales bacterium SCGC AB-751-...,1871322,"[{'scientific_name': 'Chlamydiales', 'tags': {...",Bacteria,Chlamydiae,Chlamydiales,,
133,GCF_900097105.1,Akkermansia_glycaniphila,[{'tags': {'includes': ['Akkermansia sp. Pyt']...,1679444,"[{'leaf': 0, 'children': [{'tags': {'authority...",Bacteria,Verrucomicrobia,Verrucomicrobiae,Verrucomicrobiales,Akkermansiaceae
134,GCF_900169565.1,Nitrospira_japonica,"[{'leaf': 1, 'id': '1325564', 'tags': {'type_m...",1325564,"[{'leaf': 0, 'name': 'Nitrospira', 'children':...",Bacteria,Nitrospirae,Nitrospirales,Nitrospiraceae,Nitrospira


In [19]:
# Save the complete annotated table (every column, to_csv defaults) next to
# the input mapping file.
HFSP_phy_genome_org_df.to_csv(
    path_or_buf="/mnt/researchdrive/Kaustubh/HFSP-Flagellar_proteins/data/sequences/uniq_genome_accession_organism_name_taxa_info.csv",
)

In [20]:
# Export only the organism name and the five rank columns as a tab-separated
# file without the index.
HFSP_phy_genome_org_df.loc[
    :, ["Organism_Name", "superkingdom", "phylum", "class", "order", "family"]
].to_csv(
    "/mnt/researchdrive/Kaustubh/HFSP-Flagellar_proteins/analysis/OGT_prediction/prediction_HFSP/species_taxonomic.txt",
    sep='\t',
    index=False,
)

## Taxonomic annotation for the species-tree genomes

In [6]:
# Read the species-tree genome table; its displayed columns include a
# ready-made `taxid` column.
HFSP_genome_info = pd.read_csv(
    filepath_or_buffer="../data/species_tree/genomes_taxa_info.csv",
)
HFSP_genome_info

Unnamed: 0,assembly,organismName,taxid
0,GCA_000005845.2_ASM584v2,Escherichia_coli_str._K-12_substr._MG1655,511145
1,GCA_000006765.1_ASM676v1,Pseudomonas_aeruginosa_PAO1,208964
2,GCA_000007025.1_ASM702v1,Rickettsia_conorii_str._Malish_7,272944
3,GCA_000007205.1_ASM720v1,Chlamydia_pneumoniae_TW-183,182082
4,GCA_000007365.1_ASM736v1,Buchnera_aphidicola_str._Sg,198804
...,...,...,...
192,GCF_003812925.1_ASM381292v1,Klebsiella_oxytoca,571
193,GCF_006874605.1_ASM687460v1,Bacteriovorax_stolpii,960
194,GCF_006874645.1_ASM687464v1,Bdellovibrio_sp._ZAP7,2231053
195,GCF_011300455.1_ASM1130045v1,Sphingomonas_piscis,2714943


In [8]:
# Fetch the lineage for every taxid (one HTTP request per row), then derive
# one column per rank helper from it, in the order listed.
for new_column, source_column, transform in (
    ("lineage", "taxid", get_taxonomic_lineage),
    ("superkingdom", "lineage", get_superkingdom),
    ("phylum", "lineage", get_phylum),
    ("class", "lineage", get_class),
    ("order", "lineage", get_order),
    ("family", "lineage", get_family),
):
    HFSP_genome_info[new_column] = HFSP_genome_info[source_column].apply(transform)

In [9]:
# Show the species-tree genome table with the lineage and rank columns added above.
HFSP_genome_info

Unnamed: 0,assembly,organismName,taxid,lineage,superkingdom,phylum,class,order,family
0,GCA_000005845.2_ASM584v2,Escherichia_coli_str._K-12_substr._MG1655,511145,"[{'tags': {'name': ['Escherichia'], 'scientifi...",Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae
1,GCA_000006765.1_ASM676v1,Pseudomonas_aeruginosa_PAO1,208964,[{'children': [{'scientific_name': 'Pseudomona...,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae
2,GCA_000007025.1_ASM702v1,Rickettsia_conorii_str._Malish_7,272944,[{'children': [{'scientific_name': 'Rickettsia...,Bacteria,Proteobacteria,Alphaproteobacteria,Rickettsiales,Rickettsiaceae
3,GCA_000007205.1_ASM720v1,Chlamydia_pneumoniae_TW-183,182082,"[{'leaf': 0, 'children': [{'scientific_name': ...",Bacteria,Chlamydiae,Chlamydiales,Chlamydiaceae,Chlamydia/Chlamydophila group
4,GCA_000007365.1_ASM736v1,Buchnera_aphidicola_str._Sg,198804,"[{'id': '32199', 'tags': {'scientific_name': [...",Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Erwiniaceae
...,...,...,...,...,...,...,...,...,...
192,GCF_003812925.1_ASM381292v1,Klebsiella_oxytoca,571,"[{'leaf': 0, 'children': [{'scientific_name': ...",Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae
193,GCF_006874605.1_ASM687460v1,Bacteriovorax_stolpii,960,"[{'scientific_name': 'Bacteriovorax', 'leaf': ...",Bacteria,Proteobacteria,Oligoflexia,Bacteriovoracales,Bacteriovoracaceae
194,GCF_006874645.1_ASM687464v1,Bdellovibrio_sp._ZAP7,2231053,[{'parent': {'tags': {'authority': ['Bdellovib...,Bacteria,Proteobacteria,Oligoflexia,Bdellovibrionales,Bdellovibrionaceae
195,GCF_011300455.1_ASM1130045v1,Sphingomonas_piscis,2714943,"[{'id': '2714943', 'tags': {'scientific_name':...",Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae


In [10]:
# Export only the organism name and the five rank columns as a tab-separated
# file without the index.
HFSP_genome_info.loc[
    :, ["organismName", "superkingdom", "phylum", "class", "order", "family"]
].to_csv(
    "/mnt/researchdrive/Kaustubh/HFSP-Flagellar_proteins/analysis/OGT_prediction/prediction_HFSP_species_tree/species_taxonomic.txt",
    sep='\t',
    index=False,
)