## Topt Dataset
- The current Topt dataset is from 2018
- Re-download and parse the brenda data to see if there's more data

In [25]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict

In [4]:
brenda_file = "/projects/robustmicrob/jlaw/inputs/brenda/brenda_2022_2.json"
brenda_dict = json.load(open(brenda_file))

In [13]:
list(brenda_dict['data'].keys())[-10:]

['7.6.2.15',
 '7.6.2.16',
 '7.6.2.2',
 '7.6.2.3',
 '7.6.2.4',
 '7.6.2.5',
 '7.6.2.6',
 '7.6.2.7',
 '7.6.2.8',
 '7.6.2.9']

In [10]:
brenda_dict['data']['spontaneous'].keys()

dict_keys(['id', 'name', 'systematic_name', 'generic_reaction'])

In [14]:
brenda_dict['data']['7.6.2.8'].keys()

dict_keys(['id', 'name', 'systematic_name', 'synonyms', 'localization', 'cofactor', 'activating_compound', 'inhibitor', 'metals_ions', 'subunits', 'engineering', 'cloned', 'general_information', 'crystallization', 'purification', 'renaturation', 'general_stability', 'generic_reaction', 'natural_reaction', 'reaction', 'turnover_number', 'km_value', 'ph_optimum', 'specific_activity', 'temperature_optimum', 'molecular_weight', 'organisms', 'references', 'proteins'])

In [21]:
brenda_dict['data']['7.6.2.8']['references']['29']

{'id': '29',
 'title': 'Mechanistic basis of vitamin B12 and cobinamide salvaging by the Vibrio species. Biochim. Biophys',
 'authors': ['Agarwal, S.',
  'Dey, S.',
  'Ghosh, B.',
  'Biswas, M.',
  'Dasgupta, J.'],
 'journal': 'Acta',
 'year': 2019,
 'pages': '140-151',
 'vol': '1867',
 'pmid': 30463026}

In [17]:
brenda_dict['data']['7.6.2.8']['ph_optimum']

[{'comment': '#12# BtuF binding assay at <29>',
  'proteins': ['12'],
  'organisms': ['12'],
  'references': ['29'],
  'num_value': 7.0},
 {'comment': '#5,10,11# assay at <15,31,37,38>',
  'proteins': ['11', '5', '10'],
  'organisms': ['11', '5', '10'],
  'references': ['15', '31', '37', '38'],
  'num_value': 7.5},
 {'comment': '#8# assay at <34>',
  'proteins': ['8'],
  'organisms': ['8'],
  'references': ['34'],
  'num_value': 8.0},
 {'comment': '#9# assay at <28>',
  'proteins': ['9'],
  'organisms': ['9'],
  'references': ['28'],
  'num_value': 7.6}]

In [18]:
brenda_dict['data']['7.6.2.8']['temperature_optimum']

[{'comment': '#10# assay at <37>',
  'proteins': ['10'],
  'organisms': ['10'],
  'references': ['37'],
  'num_value': 23.0},
 {'comment': '#11# assay at <31>',
  'proteins': ['11'],
  'organisms': ['11'],
  'references': ['31'],
  'num_value': 30.0},
 {'comment': '#12# BtuF binding assay at <29>',
  'proteins': ['12'],
  'organisms': ['12'],
  'references': ['29'],
  'num_value': 20.0},
 {'comment': '#5,10# assay at <15,38>; #8# assay at room temperature <34>',
  'proteins': ['5', '8', '10'],
  'organisms': ['5', '8', '10'],
  'references': ['15', '34', '38'],
  'num_value': 22.0},
 {'comment': '#9# assay at <28>',
  'proteins': ['9'],
  'organisms': ['9'],
  'references': ['28'],
  'num_value': 37.0}]

In [16]:
brenda_dict['data']['7.6.2.8']['proteins']

{'5': [{'accessions': ['P06611'],
   'source': 'uniprot',
   'comment': '#5# alpha-subunit of Fdh3 <12>'}],
 '6': [{'accessions': ['P37028'], 'source': 'uniprot'}],
 '7': [{'accessions': ['P06609'], 'source': 'uniprot'}],
 '8': [{'accessions': ['Q3SFD8'], 'source': 'uniprot'}],
 '9': [{'accessions': ['O14678'], 'source': 'uniprot'}],
 '10': [{'accessions': ['P06609', 'P06611', 'P37028'], 'source': 'uniprot'}],
 '11': [{'accessions': ['Q1GBI8'], 'source': 'uniprot'}],
 '12': [{'accessions': ['A0A0H3AMA6', 'A5F1V0', 'A5F5P5'],
   'source': 'uniprot'}]}

In [22]:
example = brenda_dict['data']['7.6.2.8']

In [34]:
def get_proteins(ec_dict):
    uniprot_ids = defaultdict(list)
    for prot_id, sources in ec_dict['proteins'].items():
        for source in sources:
            if source['source'] == 'uniprot':
                uniprot_ids[prot_id] += source['accessions']
    if len(uniprot_ids) == 0:
        print("Warning: no uniprot ids")
    return uniprot_ids
get_proteins(example)

defaultdict(list,
            {'5': ['P06611'],
             '6': ['P37028'],
             '7': ['P06609'],
             '8': ['Q3SFD8'],
             '9': ['O14678'],
             '10': ['P06609', 'P06611', 'P37028'],
             '11': ['Q1GBI8'],
             '12': ['A0A0H3AMA6', 'A5F1V0', 'A5F5P5']})

In [30]:
# not all entries have pmid
# def get_references(ec_dict):
#     references = {}
#     for ref_id, data in ec_dict['references'].items():
#         print(data)
#         references['ref_id'] = data['pmid']
#     return references
# get_references(example)

In [68]:
def get_topt(ec_dict, proteins_dict):
    uniprot_topt = defaultdict(list)
    # extra_data = {}
    for data in ec_dict['temperature_optimum']:
        if 'proteins' not in data:
            continue
        for prot in data['proteins']:
            for u_id in proteins_dict[prot]:
                if 'num_value' in data:
                    val = data['num_value']
                elif 'min_value' in data:
                    val = f"{data['min_value']}-{data['max_value']}"
                # if u_id in uniprot_topt and uniprot_topt[u_id][0] != val:
                #     print(f"WARNING: {u_id} present twice. "
                #           f"Orig topt: {uniprot_topt[u_id][0]}; New topt: {val}")
                uniprot_topt[u_id].append((val, data['references'], data.get('comment')))
    return uniprot_topt
get_topt(example, get_proteins(example))

defaultdict(list,
            {'P06609': [(23.0, ['37'], '#10# assay at <37>'),
              (22.0,
               ['15', '34', '38'],
               '#5,10# assay at <15,38>; #8# assay at room temperature <34>')],
             'P06611': [(23.0, ['37'], '#10# assay at <37>'),
              (22.0,
               ['15', '34', '38'],
               '#5,10# assay at <15,38>; #8# assay at room temperature <34>'),
              (22.0,
               ['15', '34', '38'],
               '#5,10# assay at <15,38>; #8# assay at room temperature <34>')],
             'P37028': [(23.0, ['37'], '#10# assay at <37>'),
              (22.0,
               ['15', '34', '38'],
               '#5,10# assay at <15,38>; #8# assay at room temperature <34>')],
             'Q1GBI8': [(30.0, ['31'], '#11# assay at <31>')],
             'A0A0H3AMA6': [(20.0, ['29'], '#12# BtuF binding assay at <29>')],
             'A5F1V0': [(20.0, ['29'], '#12# BtuF binding assay at <29>')],
             'A5F5P5': [(20.0, ['2

In [None]:
len(brenda_dict['data'])

7754

In [62]:
len(uniprot_topts)

9485

In [65]:
len(set(ec_num for ec_nums in uniprot_ec_num.values() for ec_num in ec_nums))

3754

In [72]:
uniprot_topts = defaultdict(list)
uniprot_ec_num = defaultdict(list)
for ec_num, ec_data in brenda_dict['data'].items():
    if ec_num == "spontaneous" or 'proteins' not in ec_data:
        continue
    # print(ec_num)
    if 'temperature_optimum' in ec_data:
        for u_id, data in get_topt(ec_data, get_proteins(ec_data)).items():
            if u_id in uniprot_topts:
                print(f"u_id: {u_id} already in uniprot_topts")
            uniprot_topts[u_id] += data
        uniprot_ec_num[u_id] += [ec_num]

u_id: A4IP64 already in uniprot_topts
u_id: A4ISB9 already in uniprot_topts
u_id: P15121 already in uniprot_topts
u_id: P16116 already in uniprot_topts
u_id: H8XA83 already in uniprot_topts
u_id: A0A097ZMY7 already in uniprot_topts
u_id: J7M9D0 already in uniprot_topts
u_id: P28625 already in uniprot_topts
u_id: Q9X5C9 already in uniprot_topts
u_id: A4QB65 already in uniprot_topts
u_id: A0A0E4AY21 already in uniprot_topts
u_id: A0A0E4AX59 already in uniprot_topts
u_id: Q308C1 already in uniprot_topts
u_id: A0A0M4HL56 already in uniprot_topts
u_id: Q7SD67 already in uniprot_topts
u_id: A0A166G7A0 already in uniprot_topts
u_id: A0A172EJV6 already in uniprot_topts
u_id: B2NI94 already in uniprot_topts
u_id: P42330 already in uniprot_topts
u_id: P23457 already in uniprot_topts
u_id: Q04828 already in uniprot_topts
u_id: B9WJQ5 already in uniprot_topts
u_id: O08349 already in uniprot_topts
u_id: A0A223LRZ7 already in uniprot_topts
u_id: Q5UY95 already in uniprot_topts
u_id: D4GP33 already i

In [76]:
uniprot_topts['M9UYB0']

[(40.0, ['22'], '#16# recombinant enzyme <22>'),
 (40.0, ['17', '25'], '#20# recombinant enzyme <25>')]

In [71]:
df = pd.DataFrame(uniprot_topts)
df

ValueError: All arrays must be of the same length

In [55]:
uniprot_topts

defaultdict(list,
            {'Q6L0S1': [83.0, ['173'], None],
             'A3MVR8': [80.0, ['234', '236', '285'], '#109# assay at <234>'],
             'D4GSN2': [80.0, ['234', '236', '285'], '#109# assay at <234>'],
             'P39462': [80.0, ['234', '236', '285'], '#109# assay at <234>'],
             'P28625': ['25.0-30.0', ['120', '307'], '#11# assay at <120>'],
             'B2KJ46': [35.0,
              ['122', '168', '196', '231', '284'],
              '#11# immobilized enzyme <196>; #11# Zn-ADH and Co-ADH <122>; #46# reductive reaction, recombinant enzymes expressed from Saccharomyces cerevisiae and Hansenula polymorpha <231>; #46# recombinant enzyme expressed in Hansenula polymorpha <231>; #46# recombinant enzyme expressed in Saccharomyces cerevisiae <231>; #11# reduction of glycolaldehyde, furfural, butyraldehyde, and propionaldehyde <284>'],
             'G2I689': [30.0,
              ['84',
               '144',
               '182',
               '185',
            

In [38]:
brenda_dict['data']['1.1.1.1']['temperature_optimum']

[{'proteins': ['103'],
  'organisms': ['103'],
  'references': ['173'],
  'num_value': 83.0},
 {'comment': '#109# assay at <234>',
  'proteins': ['163', '136', '109'],
  'organisms': ['163', '136', '109'],
  'references': ['234', '236', '285'],
  'num_value': 80.0},
 {'organisms': ['11'],
  'references': ['3'],
  'min_value': 45.0,
  'max_value': 50.0},
 {'comment': '#11# assay at <120>',
  'proteins': ['178'],
  'organisms': ['11', '178'],
  'references': ['120', '307'],
  'min_value': 25.0,
  'max_value': 30.0},
 {'comment': '#29# assay at <129>; #11# Cu-ADH <122>; #11# enzyme covalently immobilized to magnetic Fe3O4 nanoparticles via glutaraldehyde <182>',
  'organisms': ['11', '53', '57', '29'],
  'references': ['99', '122', '129', '182', '278'],
  'num_value': 40.0},
 {'comment': '#11# immobilized enzyme <196>; #11# Zn-ADH and Co-ADH <122>; #46# reductive reaction, recombinant enzymes expressed from Saccharomyces cerevisiae and Hansenula polymorpha <231>; #46# recombinant enzyme e