**Note:** This notebook requires the `ete3` package, which is not part of the SSN workflow's requirements and must be installed separately.

In [1]:
import warnings

import pandas as pd
from ete3 import NCBITaxa

In [4]:
# Read the tab-separated UniProt table into a DataFrame and preview the first rows
uniprot_tsv = 'IPR001761.tsv'
df = pd.read_table(uniprot_tsv)
df.head()

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Organism (ID),PDB,Pfam,InterPro
0,G3XD97,reviewed,PTXS_PSEAE,HTH-type transcriptional regulator PtxS,ptxS PA2259,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,340,208964,,PF00356;PF00532;,IPR000843;IPR010982;IPR001761;IPR028082;
1,P02924,reviewed,ARAF_ECOLI,L-arabinose-binding periplasmic protein (ABP),araF b1901 JW1889,Escherichia coli (strain K12),329,83333,1ABE;1ABF;1APB;1BAP;2WRZ;5ABP;6ABP;7ABP;8ABP;9...,PF00532;,IPR026266;IPR001761;IPR028082;
2,P0ACP1,reviewed,CRA_ECOLI,Catabolite repressor/activator (Fructose repre...,cra fruC fruR shl b0080 JW0078,Escherichia coli (strain K12),334,83333,1UXC;1UXD;2IKS;,PF00356;PF00532;,IPR012781;IPR000843;IPR010982;IPR001761;IPR028...
3,Q88HH7,reviewed,PTXS_PSEPK,HTH-type transcriptional regulator PtxS,ptxS PP_3380,Pseudomonas putida (strain ATCC 47054 / DSM 61...,339,160488,,PF00356;PF00532;,IPR000843;IPR010982;IPR001761;IPR028082;
4,Q9KM69,reviewed,FRUR_VIBCH,Fructose operon regulatory protein (Catabolite...,fruR cra VC_A0519,Vibrio cholerae serotype O1 (strain ATCC 39315...,326,243277,,PF00356;PF00532;,IPR012781;IPR000843;IPR010982;IPR001761;IPR028...


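Keep only the columns needed downstream: entry accession, review status, sequence length, organism taxonomy ID, and PDB cross-references.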

In [7]:
# Keep only the columns used downstream. The explicit .copy() makes the subset an
# independent frame, so assigning new columns to it later cannot trigger pandas'
# SettingWithCopyWarning or be ambiguous about view-vs-copy semantics.
df = df[['Entry', 'Reviewed', 'Length', 'Organism (ID)', 'PDB']].copy()
df.head()

Unnamed: 0,Entry,Reviewed,Length,Organism (ID),PDB
0,G3XD97,reviewed,340,208964,
1,P02924,reviewed,329,83333,1ABE;1ABF;1APB;1BAP;2WRZ;5ABP;6ABP;7ABP;8ABP;9...
2,P0ACP1,reviewed,334,83333,1UXC;1UXD;2IKS;
3,Q88HH7,reviewed,339,160488,
4,Q9KM69,reviewed,326,243277,


In [21]:
# Shared handle to the NCBI taxonomy; GetTaxaLevel queries it to turn a taxid
# into the name of its superkingdom / phylum.
# NOTE(review): ete3 presumably downloads and builds its local taxonomy database
# the first time NCBITaxa() is instantiated — confirm before running offline.
ncbi = NCBITaxa()

def GetTaxaLevel(taxid, rank):
    """Return the name of the member of ``taxid``'s lineage that has rank ``rank``.

    Parameters
    ----------
    taxid : int
        NCBI taxonomy identifier to look up.
    rank : str
        Rank label to extract from the lineage, e.g. 'superkingdom' or 'phylum'.

    Returns
    -------
    str or None
        Name of the lineage member whose rank equals ``rank``. ``None`` when the
        lineage contains no such rank, or when the taxid cannot be resolved
        (in which case a warning is emitted instead of raising).
    """
    try:
        lineage = ncbi.get_lineage(taxid)
    except ValueError as err:
        # A ValueError here means the id could not be resolved (e.g. it is not in
        # the local taxonomy database, or it is NaN). Report it and fall back to
        # None so one bad id does not abort a whole DataFrame.apply.
        warnings.warn(f"could not resolve taxid {taxid!r}: {err}")
        return None
    if not lineage:
        # no lineage came back (e.g. empty/zero id): nothing to look up
        return None

    names = ncbi.get_taxid_translator(lineage)
    ranks = ncbi.get_rank(lineage)
    # Map rank label -> name; if several lineage members share a rank label
    # the last one in the lineage wins.
    name_by_rank = {ranks[node]: names[node] for node in lineage}
    return name_by_rank.get(rank)
    

In [23]:
# Derive one new column per taxonomic rank from each row's organism taxid
for column, rank_label in (('Superkingdom', 'superkingdom'), ('Phylum', 'phylum')):
    # Series.apply forwards extra keyword arguments to the applied function
    df[column] = df['Organism (ID)'].apply(GetTaxaLevel, rank=rank_label)



In [24]:
# Preview the first rows, now including the Superkingdom and Phylum columns
df.head(n=5)

Unnamed: 0,Entry,Reviewed,Length,Organism (ID),PDB,Superkingdom,Phylum
0,G3XD97,reviewed,340,208964,,Bacteria,Pseudomonadota
1,P02924,reviewed,329,83333,1ABE;1ABF;1APB;1BAP;2WRZ;5ABP;6ABP;7ABP;8ABP;9...,Bacteria,Pseudomonadota
2,P0ACP1,reviewed,334,83333,1UXC;1UXD;2IKS;,Bacteria,Pseudomonadota
3,Q88HH7,reviewed,339,160488,,Bacteria,Pseudomonadota
4,Q9KM69,reviewed,326,243277,,Bacteria,Pseudomonadota


In [25]:
# Write the taxonomy-annotated table to a tab-separated file, omitting the row index
output_tsv = 'IPR001761_with_taxonomy.tsv'
df.to_csv(output_tsv, index=False, sep='\t')