In [None]:
import plotly

import plotly.express as px
import pandas as pd
import numpy as np
import plotly.io as pio

# enables the %%R magic to run python and R in different cells
%load_ext rpy2.ipython

In [None]:
def get_cofact_names(df, rename_dict, nestedlist=False):
    '''
    extracts the cofactor names from the 'Cofactor' column of df.

    every ';'-separated piece of an entry that contains 'COFACTOR' contributes
    the text after its last '='. a name that is a key of rename_dict is replaced
    by the mapped value; any other name longer than 35 characters becomes 'Other'.

    nestedlist=True  -> one item per row: a list of names, or the original
                        missing value for rows without a cofactor entry
    nestedlist=False -> one flat list of names (missing values are still
                        appended to it as they are)
    '''
    def shorten(raw_name):
        #a rename takes priority over the length cut-off
        if raw_name in rename_dict:
            return rename_dict[raw_name]
        return 'Other' if len(raw_name) > 35 else raw_name

    collected = []
    cofactor_column = df['Cofactor']
    for entry, is_missing in zip(cofactor_column, cofactor_column.isna()):
        if is_missing:
            #keep the missing value itself as placeholder for this row
            collected.append(entry)
            continue
        labels = [shorten(piece.split('=')[-1])
                  for piece in entry.split(';') if 'COFACTOR' in piece]
        if nestedlist:
            collected.append(labels)
        else:
            collected.extend(labels)
    return collected


def integrate_cosubstrate(df, df_cosub, cosub_name):
    '''
    given a dataframe of enzymes (with uniprot Entry as a column)
    and a dataframe of enzymes with a certain cosubstrate,
    integrates the cosubstrate into the cofactor column of the first dataframe.

    df          : dataframe with an 'Entry' column and a 'Cofactor' column whose
                  values are lists of cofactor names or missing (NaN)
    df_cosub    : dataframe with an 'Entry' column listing the proteins that use
                  the cosubstrate
    cosub_name  : label that is added to the cofactor list of those proteins

    returns a copy of df; df itself is not changed. for every protein of df that
    is also in df_cosub:
      - an existing cofactor list gets cosub_name appended, unless the list
        already contains it (otherwise a protein that has e.g. SAM annotated
        as cofactor and is also found by the SAM search would be counted twice)
      - a missing cofactor entry becomes [cosub_name]
    '''
    df_out = df.copy()
    in_cosub = df['Entry'].isin(df_cosub['Entry'])
    has_cofactor = df['Cofactor'].notna()

    def add_once(cofactors):
        #already listed: leave the entry as it is instead of duplicating the name
        if cosub_name in cofactors:
            return cofactors
        #builds a new list, so the list stored in df is never modified
        return cofactors + [cosub_name]

    extend_rows = in_cosub & has_cofactor
    create_rows = in_cosub & ~has_cofactor
    df_out.loc[extend_rows, 'Cofactor'] = df.loc[extend_rows, 'Cofactor'].apply(add_once)
    df_out.loc[create_rows, 'Cofactor'] = df.loc[create_rows, 'Cofactor'].apply(lambda _: [cosub_name])
    return df_out

def prepare_sunburst_df(df):
    '''
    gives a dataframe fit for making a sunburst plot or other functions in plotly.

    df needs the columns
      - 'EC number' : missing (NaN) for proteins that are not enzymes
      - 'Cofactor'  : a list of cofactor names per protein, or missing (NaN);
                      the length of the list is used as the cofactor count
      - either 'Gene Ontology (molecular function)' (the uniprot text field), or
        the boolean columns 'DNA' and 'RNA' that replace it in the processed
        dataframe. a KeyError is raised if neither is available.

    returns a dataframe with the columns 'Enzyme', 'Cofactor present',
    'Cofactor count' and 'size', where 'size' is the number of proteins that
    share that combination of the first three columns.
    '''
    #check whether the protein binds DNA or RNA
    go_column = 'Gene Ontology (molecular function)'
    if go_column in df.columns:
        go_terms = df[go_column].fillna('no')
        nucl_binding = go_terms.apply(
            lambda x: ('DNA binding' in x) or ('RNA binding' in x)).astype(bool)
    elif 'DNA' in df.columns and 'RNA' in df.columns:
        #the gene ontology text was already converted into boolean columns
        nucl_binding = df['DNA'].astype(bool) | df['RNA'].astype(bool)
    else:
        raise KeyError("need either the '%s' column or the 'DNA' and 'RNA' columns" % go_column)

    #column for whether or not entry is an enzyme depending on if the EC number is NaN,
    # column for whether cofactor is present based on if Cofactor is NaN
    Enzyme = df['EC number'].isna().map({True: 'Not Enzyme', False: 'Enzyme'})
    no_cofactor = df['Cofactor'].isna()
    #the leading spaces in the labels are on purpose: they influence the
    #alphabetical order of the sectors in the plot
    Cofactor_present = no_cofactor.map({True: 'No Cofactor', False: ' Cofactor'})
    Cofactor_present.loc[nucl_binding] = Cofactor_present.loc[nucl_binding].map(
        {'No Cofactor': ' Nucl. acid', ' Cofactor': ' Cofactor+Nucl'})

    #number of cofactors as text; stays missing for proteins without a cofactor
    cofact_nr = [entry if missing else str(len(entry))
                 for entry, missing in zip(df['Cofactor'], no_cofactor)]

    #prepare dataframe for sunburst plot and group by all columns to get count of each occurence
    df_sunburst = pd.DataFrame({'Enzyme': Enzyme, 'Cofactor present': Cofactor_present,
                                'Cofactor count': cofact_nr})
    df_sunburst = df_sunburst.groupby(df_sunburst.columns.tolist(), dropna=False, as_index=False).size()
    return df_sunburst

#dictionary of cofactor names that should be renamed in order to group or shorten for clarity.
#written down per display label: every uniprot name in a group is mapped onto that label
_cofactor_aliases = [
    ("PLP", ["pyridoxal 5'-phosphate"]),
    ("SAM", ["S-adenosyl-L-methionine"]),
    ("TPP", ["thiamine diphosphate"]),
    ("Metal(2+)", ["a divalent metal cation"]),
    ("Cu(2+)", ["Cu cation"]),
    ("Fe(2+)", ["Fe cation"]),
    ("Ni(2+)", ["Ni cation"]),
    ("heme", ["siroheme",
              "ferriheme a",
              "heme b",
              "heme c",
              "heme d cis-diol",
              "Fe(II)-heme o",
              "Heme A3.",
              "Heme A3. {ECO:0000250}"]),
    ("NADP(+)", ["NADPH"]),
    ("FMN", ["FMNH2",
             "prenyl-FMN"]),
    ("Fe-S cluster", ["iron-sulfur cluster",
                      "[4Fe-4S] cluster",
                      "[2Fe-2S] cluster",
                      "[3Fe-4S] cluster",
                      "[8Fe-7S] cluster",
                      "[7Fe-Mo-9S-C-homocitryl] cluster"]),
    ("Ni-Fe-S cluster", ["[Ni-4Fe-4S] cluster",
                         "[Ni-4Fe-5S] cluster",
                         "[Ni-Fe-S] cluster"]),
    ("Fe-S cluster", ["hybrid [4Fe-2O-2S] cluster"]),
    ("MPT", ["Mo-molybdopterin",
             "Mo-bis(molybdopterin guanine dinucleotide)",
             "Mo-molybdopterin cytosine dinucleotide",
             "W-bis(molybdopterin guanine dinucleotide)"]),
    #a free-text note that uniprot gives instead of a cofactor name
    ("NAD(+)", ["Has a very strong preference for NAD(+) over NADP(+). {ECO:0000250|UniProtKB:P41793}"]),
]
rename_cofactors = {uniprot_name: label
                    for label, uniprot_names in _cofactor_aliases
                    for uniprot_name in uniprot_names}

In [None]:
#Load in all data to create dataframe
#============================================================================================================
#load in uniprot data for all (e. coli) proteins, including Entry name, EC number, and info on their cofactor
ecoli_prot = pd.read_csv('uniprot-ecoli-2023.02.04-13.43.09.27.tsv', sep='\t')
#for all prot: (reviewed:true)
all_prot = pd.read_csv('uniprot-all-GO-2023.02.15-13.12.03.40.tsv', sep='\t')

#replace the raw cofactor text by a list of cofactor names per protein (missing stays missing)
all_cofact = get_cofact_names(all_prot, rename_cofactors, nestedlist=True)
all_prot['Cofactor'] = all_cofact

#uniprot sometimes considers coenzymes to be cosubstrates and therefore does not include them as cofactor.
#the lists for these can be downloaded separately and integrated into the data

#for NAD/NADH: ((chebi:16908) OR (chebi:13389)) AND (reviewed:true)
all_nad = pd.read_csv('uniprot-NAD-all-2023.02.10-10.13.50.46.tsv', sep='\t')
#for NADP/NADPH: ((chebi:44409) OR (chebi:16474)) AND (reviewed:true)
all_nadp = pd.read_csv('uniprot-NADP-all-2023.02.10-13.28.21.55.tsv', sep='\t')
#for SAM: (chebi:67040) AND (reviewed:true)
all_sam = pd.read_csv('uniprot-SAM-all-2023.02.10-13.49.09.81.tsv', sep='\t')
#for ATP: (chebi:15422) AND (reviewed:true)
all_atp = pd.read_csv('uniprot-ATP-all-2023.02.10-13.56.10.12.tsv', sep='\t')
#for GTP: (chebi:15996) AND (reviewed:true)
all_gtp = pd.read_csv('uniprot-GTP-all-2023.02.10-14.00.39.61.tsv', sep='\t')

#integrate all the other cofactors into the list, one cosubstrate table at a time
for cosub_table, cosub_label in ((all_nad, 'NAD(+)'),
                                 (all_nadp, 'NADP(+)'),
                                 (all_sam, 'SAM'),
                                 (all_atp, 'ATP'),
                                 (all_gtp, 'GTP')):
    all_prot = integrate_cosubstrate(all_prot, cosub_table, cosub_label)

#use gene ontology to make columns on DNA/RNA binding ('no' stands in for a missing annotation)
gene_o = all_prot['Gene Ontology (molecular function)'].fillna('no')
dna_binding = gene_o.apply(lambda terms: 'DNA binding' in terms)
rna_binding = gene_o.apply(lambda terms: 'RNA binding' in terms)

#add the two binding columns and drop the gene ontology column
all_prot = (all_prot
            .assign(DNA=dna_binding, RNA=rna_binding)
            .drop(columns='Gene Ontology (molecular function)'))

#create column with enzyme/not enzyme, and cofactor present/not present
all_prot['Enzyme'] = all_prot['EC number'].notna()
all_prot['Cofactor present'] = all_prot['Cofactor'].notna()

In [None]:
#output processed dataframe
#============================================================================================================
all_prot_out = all_prot.copy()
#write every cofactor list as one space-separated string; proteins without cofactor keep their missing value
has_cofactor = all_prot['Cofactor'].notna()
all_prot_out.loc[has_cofactor, 'Cofactor'] = all_prot_out.loc[has_cofactor, 'Cofactor'].apply(' '.join)
all_prot_out.to_csv('uniprot-all-prot-processed.tsv', sep="\t")

In [None]:
#Create a treemap
#============================================================================================================

#cofactor labels that count as inorganic; every other label is treated as organic
list_inorganic = [
    'Fe-S cluster', 'Ni-Fe-S cluster',
    'Mg(2+)', 'Zn(2+)', 'Mn(2+)', 'Fe(2+)', 'Metal(2+)',
    'Ca(2+)', 'K(+)', 'Cu(2+)', 'Ni(2+)', 'Co(2+)', 'Co(3+)',
    'Cu(+)', 'Fe(3+)',
    'a metal cation', 'a monovalent cation', 'vanadium cation',
    'phosphate', 'chloride', 'NH4(+)', 'Na(+)',
]

#one long list with the cofactors of all proteins (entries that are not a list, i.e. no cofactor, are skipped)
flat_cofactors = [name
                  for cofactors in all_prot.Cofactor if type(cofactors) == list
                  for name in cofactors]

#classify every cofactor in that list as organic or inorganic
org_inorg = ['Inorganic' if name in list_inorganic else 'Organic'
             for name in flat_cofactors]

#one row per cofactor occurrence, which is the layout the treemap works with
all_cofact = pd.DataFrame({'Class': org_inorg, 'Cofactor': flat_cofactors})
fig = px.treemap(all_cofact, path=['Class', 'Cofactor'])
fig.data[0].textinfo = 'label+text+value'
fig.show()
#the larger font is set after showing the figure and is used for the exported image
fig.update_layout(font={'size': 24})
#optional colours: fig.update_layout(treemapcolorway = ["pink", "green"])
pio.write_image(fig, 'treemap.png', scale=6, width=1080, height=1080)

In [None]:
#Create a sunburst plot
#============================================================================================================
#count the proteins for every combination of 'Enzyme', 'Cofactor present' and 'Cofactor count'
all_sunburst = prepare_sunburst_df(all_prot)
#path gives the rings from the inside out; the 'size' column (number of proteins) sets the sector sizes
fig = px.sunburst(all_sunburst, path=['Enzyme', 'Cofactor present', 'Cofactor count'], values='size')
#makes it sort alphabetically instead of on value size. the order can be manipulated by adding
#leading spaces to a label (as done for the ' Cofactor' labels)
fig.update_traces(sort=False) 
fig.show()
#NOTE: the font size is changed after fig.show(), so it applies to the exported image below,
#not to the figure that was already shown
fig.update_layout(font=dict(size=18))
#static export to sunburst.png with the given width, height and scale
pio.write_image(fig, 'sunburst.png',scale=6, width=1080, height=1080)