## BRENDA pH activity data
- BRENDA has "pH optimum" and "pH range" fields
- We need to match up the data from these two sheets

### BRENDA pH range
- First we will extract the pH range data

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import os
import re
import regex
from tqdm.auto import tqdm

In [2]:
# These are 
inputs_dir = Path("/projects/robustmicrob/jlaw/inputs/brenda/202301_tables")
data_dir = "/projects/robustmicrob/jlaw/projects/prot_stability_engineering/inputs/brenda"

ph_range_file = inputs_dir / "202301_ph_range.tsv.gz"
header = ["ec_num", "name", "ph_min", "ph_max", "comments", "organism", "uniprot_id", "ref", "empty"]
data = pd.read_csv(ph_range_file, sep='\t', names=header)
data = data.drop(columns=['empty'])
print(len(data))
data = data[data.ph_min != "additional information"]
# data = data[data.ph_min != "additional information"]
print(len(data))
data.head(2)

12881
12404


Unnamed: 0,ec_num,name,ph_min,ph_max,comments,organism,uniprot_id,ref
0,1.1.1.1,alcohol dehydrogenase,2,8.0,-,Thermoplasma acidophilum,Q9HIM3,700124
1,1.1.1.1,alcohol dehydrogenase,4,7.5,"pH 4.0: about 80% of maximal activity, pH 7.5:...",Candida parapsilosis,B2KJ46,684556


In [None]:
# remove "mutant comments
# TODO apply mutations listed(?)

In [3]:
data.organism.value_counts()

Homo sapiens                    587
Rattus norvegicus               450
Escherichia coli                423
Bos taurus                      241
Saccharomyces cerevisiae        229
                               ... 
Anisodus acutangulus              1
Saitozyma flava                   1
Bradyrhizobium elkanii            1
Advenella mimigardefordensis      1
Dicentrarchus labrax              1
Name: organism, Length: 2926, dtype: int64

In [10]:
data[data.uniprot_id.apply(len) > 15].head(5)

Unnamed: 0,ec_num,name,ph_min,ph_max,comments,organism,uniprot_id,ref
181,1.1.1.195,cinnamyl-alcohol dehydrogenase,2.5,9.0,profile overview,Populus tomentosa,"A0A023RBJ1, KJ159967, T1WUT6, T1WUU2, T1WVG5, ...",741217
182,1.1.1.195,cinnamyl-alcohol dehydrogenase,4.0,8.5,profile overview,Populus tomentosa,"A0A023RBJ1, KJ159967, T1WUT6, T1WUU2, T1WVG5, ...",741217
183,1.1.1.195,cinnamyl-alcohol dehydrogenase,4.5,7.5,"activity range, profile overview",Populus tomentosa,"A0A023RBJ1, KJ159967, T1WUT6, T1WUU2, T1WVG5, ...",741217
185,1.1.1.195,cinnamyl-alcohol dehydrogenase,5.5,8.5,"activity range, the enzyme is completely inact...",Hibiscus cannabinus,"A0A0U1ZF25, D9ZKR7",740176
233,1.1.1.206,tropinone reductase I,5.8,8.4,-,Przewalskia tangutica,"A0A6B7HCZ2, A0A6B7HCZ2",760748


#### Find a uniprot ID for rows where it's missing
See if we can use the mapping from the uniprot ID's ec numbers to get a protein

In [4]:
uniprot_ec_file = f"{data_dir}/uniprot/uniprotkb_ec_AND_reviewed_true_2023_07_06.tsv.gz"
uniprot_ec_df = pd.read_table(uniprot_ec_file)
print(len(uniprot_ec_df))
uniprot_ec_df.head(2)

274744


Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,EC number
0,A0A009IHW8,reviewed,ABTIR_ACIB9,2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...,J512_3302,Acinetobacter baumannii (strain 1295743),269,3.2.2.-; 3.2.2.6
1,A0A023I7E1,reviewed,ENG1_RHIMI,"Glucan endo-1,3-beta-D-glucosidase 1 (Endo-1,3...",ENG1 LAM81A,Rhizomucor miehei,796,3.2.1.39


In [5]:
data_no_uniprot = data[data.uniprot_id == "-"]
len(data_no_uniprot)

9183

In [6]:
data_matched_uniprot = pd.DataFrame()
for ec_num, df in tqdm(data_no_uniprot.groupby('ec_num')):
    uniprot_ec_rows = uniprot_ec_df[uniprot_ec_df['EC number'].apply(lambda x: ec_num in x.split('; '))]
    if len(uniprot_ec_rows) == 0:
        continue

    org_count = df.organism.value_counts().to_dict()
    
    def match_uniprot_ec_organism(org, 
                                  num_brenda_org_cutoff=2,
                                  num_matching_rows_cutoff=2):
        # only try and automatically match for brenda entries that have fewer than 3 entries for the same enzyme
        if org_count[org] > num_brenda_org_cutoff:
            return np.nan

        matching_rows = uniprot_ec_rows[uniprot_ec_rows.Organism.apply(lambda x: org in x)]
        if len(matching_rows) > num_matching_rows_cutoff:
            return np.nan

        return ",".join(matching_rows.Entry.values)

    df['uniprot_id'] = df.organism.apply(match_uniprot_ec_organism)
    data_matched_uniprot = pd.concat([data_matched_uniprot, df])

print(len(data_matched_uniprot))

  0%|          | 0/2653 [00:00<?, ?it/s]

8325


In [7]:
data_matched_uniprot.uniprot_id = data_matched_uniprot.uniprot_id.replace("",np.nan)
data_matched_uniprot = data_matched_uniprot.dropna(subset='uniprot_id')

In [8]:
print(len(data_matched_uniprot), data_matched_uniprot.uniprot_id.nunique())

2156 1802


In [47]:
data_uniprot = data[data.uniprot_id != '-'].dropna(subset='uniprot_id')
print(len(data_uniprot), data_uniprot.uniprot_id.nunique())
data_uniprot = pd.concat([data_uniprot, data_matched_uniprot])
print(len(data_uniprot), data_uniprot.uniprot_id.nunique())
data_uniprot = data_uniprot.drop_duplicates()
print(len(data_uniprot), data_uniprot.uniprot_id.nunique())

3221 2642
5377 4375
5377 4375


In [48]:
data_uniprot.uniprot_id.value_counts()

Q7X2H8            8
P9WJ61            7
Q9S467            6
G0S9G3            6
I3ZTN9            6
                 ..
Q9WXN1            1
D2DRB6            1
Q97VS7, Q97X08    1
D3GDK4            1
P9WQM1,P9WQM0     1
Name: uniprot_id, Length: 4375, dtype: int64

In [49]:
# split rows with multiple uniprot IDs
data_uniprot["uniprot_id"] = data_uniprot.uniprot_id.apply(lambda x: x.replace(' AND ',',').replace(' ','').split(','))
# data_uniprot["uniprot_id"] = data_uniprot.uniprot_id.apply(lambda x: x.replace(' AND ',',').replace(' and',',').replace(' ','').split(','))
# don't fix the "ID_AandID_B" issue here because that will mess up the exp_index downstream.
# instead, add those rows to the end
data_uniprot = data_uniprot.explode(column='uniprot_id', ignore_index=True)
print(len(data_uniprot), data_uniprot.uniprot_id.nunique())

6399 4993


In [50]:
data_uniprot.uniprot_id.value_counts()

Q6LXQ3    12
P9WJ61     8
Q97YG7     8
Q7X2H8     8
P00784     6
          ..
D2B1P1     1
Q43998     1
Q65GW7     1
J9QQ32     1
P9WQM0     1
Name: uniprot_id, Length: 4993, dtype: int64

In [51]:
data_uniprot

Unnamed: 0,ec_num,name,ph_min,ph_max,comments,organism,uniprot_id,ref
0,1.1.1.1,alcohol dehydrogenase,2,8,-,Thermoplasma acidophilum,Q9HIM3,700124
1,1.1.1.1,alcohol dehydrogenase,4,7.5,"pH 4.0: about 80% of maximal activity, pH 7.5:...",Candida parapsilosis,B2KJ46,684556
2,1.1.1.1,alcohol dehydrogenase,4.5,9,high catalytic activity within the range of pH...,Saccharomyces cerevisiae,P28625,763285
3,1.1.1.1,alcohol dehydrogenase,4.9,5.8,"pH 4.9: about 25% of maximal activity, pH 5.8:...",Sulfolobus acidocaldarius,Q4J702,719428
4,1.1.1.1,alcohol dehydrogenase,4.9,5.8,"pH 4.9: about 25% of maximal activity, pH 5.8:...",Sulfolobus acidocaldarius,Q4J9F2,719428
...,...,...,...,...,...,...,...,...
6394,7.2.2.19,H+/K+-exchanging ATPase,6.5,8.5,at pH 6.5 30% and at pH 8.5 less than 1% of al...,Sus scrofa,P19156,672636
6395,7.2.2.19,H+/K+-exchanging ATPase,6.8,7.8,"pH 6.8: about 60% of maximal activity, pH 7.8:...",Sus scrofa,P19156,246853
6396,7.3.2.1,ABC-type phosphate transporter,3.5,10,-,Pseudomonas aeruginosa,Q51546,698134
6397,7.3.2.3,ABC-type sulfate transporter,5,8,no activity below pH 5.0 and above pH 8.0,Mycobacterium tuberculosis,P9WQM1,697446


In [60]:
data_uniprot.index.max()

6398

In [65]:
def fix_ids_with_and(df):
    """ We want to "explode" the rows with multiple uniprot IDs I missed
    e.g., P16027andP14775, but don't change the original index since I used that downstream.
    """
    df = df.copy()
    df2 = df[df.uniprot_id.apply(lambda x: 'and' in x)]
    
    # for each of the original rows where this happens, keep only the first
    df['uniprot_id'] = df.uniprot_id.apply(lambda x: x.split('and')[0])
    
    df2['uniprot_id'] = df2.uniprot_id.apply(lambda x: x.split('and')[1:])
    df2 = df2.explode(column='uniprot_id', ignore_index=True)
    print(len(df2), df2.index)
    
    # append these rows to the end of the original df so they get their own exp_idx
    df2.index += df.index.max()
    df = pd.concat([df, df2], ignore_index=True)
    return df
data_uniprot_test = fix_ids_with_and(data_uniprot)
print(len(data_uniprot), len(data_uniprot_test))
print(data_uniprot_test[data_uniprot_test.uniprot_id.isin(['P16027', 'P14775'])])
# data_uniprot_test.tail()

41 RangeIndex(start=0, stop=41, step=1)
6399 6440
       ec_num                                   name ph_min ph_max  \
250   1.1.2.7  methanol dehydrogenase (cytochrome c)      7   10.5   
6401  1.1.2.7  methanol dehydrogenase (cytochrome c)      7   10.5   

                  comments                  organism uniprot_id     ref  
250   pH profile, overview  Methylorubrum extorquens     P16027  684950  
6401  pH profile, overview  Methylorubrum extorquens     P14775  684950  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['uniprot_id'] = df2.uniprot_id.apply(lambda x: x.split('and')[1:])


In [66]:
data_uniprot = data_uniprot_test

In [120]:
data_uniprot[data_uniprot.uniprot_id.isin(["A0A8D3X5C3", "A0A8D3X5J3"])]

Unnamed: 0,ec_num,name,ph_min,ph_max,comments,organism,uniprot_id,ref
853,1.7.1.3,nitrate reductase (NADPH),6.5,-,more than 99% of maximum activity,Priestia megaterium,A0A8D3X5C3,764856
6423,1.7.1.3,nitrate reductase (NADPH),6.5,-,more than 99% of maximum activity,Priestia megaterium,A0A8D3X5J3,764856


### Extract the pH range values from the comments

In [67]:
def extract_ph_range_from_comment(comment, pattern, captures, error_limit=2):
    """
    *captures*: list of what is captured in each position
        e.g., ('act', 'ph', 'act', 'ph', 'ph') for 
        "about 40% activity at pH 6.5, more than 70% activity between pH 7.0 and 8.0"
    half-maximal and half-maximum will be replaced by 50%
    """
    comment = comment.replace("half-maximal", "50%") \
                     .replace("half-maximum", "50%") \
                     .replace("half-optima", "50%")
    # Rather than always allow errors, I found sometimes 
    # the regex would include an error even though it didn't have to
    # first check the plain regular expression
    real_pattern = re.compile(pattern)
    extracted_vals = real_pattern.findall(comment)
    texts = [x.group() for x in real_pattern.finditer(comment)]
    matches = [[t] + (list(e) if isinstance(e, tuple) else [e])  for t, e in zip(texts, extracted_vals)]

    # now allow errors and see if more matches are found
    fuzzy_pattern = f'({pattern}){{e<={error_limit}}}'
    fuzzy_matches = regex.findall(fuzzy_pattern, comment)
    if len(fuzzy_matches) > len(matches):
        matches = fuzzy_matches
    
    def clean_number(x):
        x = x.replace(':', '').replace(' ','')
        if '%' in x:
            x = x.replace('%','')
            x = float(x) / 100
        return float(x)
    
    cleaned_matches = []
    # keep track of how much of the comment isn't captured by the regex
    leftover_comment = comment
    for m in matches:
        # find which indices are ph and which are activity
        opt_ph_idx = [i for i, c in enumerate(captures) if c == 'opt_ph']
        no_act_ph_idx = [i for i, c in enumerate(captures) if c == 'no_act_ph']
        ph_idx = [i for i, c in enumerate(captures) if c == 'ph']
        act_idx = [i for i, c in enumerate(captures) if c == 'act']
        # print(opt_ph_idx, ph_idx, act_idx)
        
        match = m[0]
        # make sure the semicolon is not skipped by the error matching
        if '):' in pattern and ':' not in match:
            continue
        # print(m)
        try:
            if len(opt_ph_idx) != 0:
                # just use 95% as the 'optimal' activity for now
                act = 0.95
                # if the pH is two values joined by '-'
                for ph in m[opt_ph_idx[0]+1].split('-'):
                    ph = clean_number(ph)
                    cleaned_matches.append((match, ph, act))
            if len(no_act_ph_idx) != 0:
                # this covers "no activity at pH X.X"
                act = 0
                ph = clean_number(m[no_act_ph_idx[0]+1])
                cleaned_matches.append((match, ph, act))
            if len(ph_idx) == 0 and len(act_idx) == 1:
                # "more than 95% of maximum activity"
                act = clean_number(m[1])
                cleaned_matches.append((match, None, act))
            elif len(ph_idx) == 1 and len(act_idx) == 1:
                ph = clean_number(m[ph_idx[0]+1])
                act = clean_number(m[act_idx[0]+1])
                cleaned_matches.append((match, ph, act))
            elif len(ph_idx) > len(act_idx):
                # "about 40% activity at pH 6.5, more than 70% activity between pH 7.0 and 8.0"
                curr_ph_idx = 0
                curr_act_idx = 0
                while curr_ph_idx < len(ph_idx):
                    # check if there are two phs next to each other
                    curr_idx = curr_ph_idx + curr_act_idx + len(opt_ph_idx)
                    # if the current capture index is at 'act',
                    # then the next two must be pHs
                    curr_idx += 1 if captures[curr_idx] != 'pH' else curr_idx
                    if (curr_idx + 1 < len(captures) and (
                        captures[curr_idx + 1] == 'ph' and 
                        captures[curr_idx] == 'ph')):
                        ph1 = clean_number(m[ph_idx[curr_ph_idx]+1])
                        curr_ph_idx += 1
                        ph2 = clean_number(m[ph_idx[curr_ph_idx]+1])
                        act = clean_number(m[act_idx[curr_act_idx]+1])
                        cleaned_matches.append((match, ph1, act))
                        cleaned_matches.append((match, ph2, act))
                    else:
                        ph = clean_number(m[ph_idx[curr_ph_idx]+1])
                        act = clean_number(m[act_idx[curr_act_idx]+1])
                        cleaned_matches.append((match, ph, act)) 
                    curr_ph_idx += 1
                    curr_act_idx += 1

            # if match_ph:
            #     match, ph, act = m
            #     ph = clean_number(ph)
            # else:
            #     match, act = m
            #     ph = None
            # act = clean_number(act)
            # cleaned_matches.append((match, ph, act))
            leftover_comment = leftover_comment.replace(match, "")
        except ValueError as e:
            # if we aren't able to cast the ph or activity level as a float,
            # then skip this entry
            print("Unable to extract float for one of the values:")
            print(m, comment)
    leftover_comment = leftover_comment.strip()
    # if there is a lot of comment still left after the matches, 
    # then this regex is not a good match for this comment
    if len(leftover_comment) > 60 and 'pH' in leftover_comment:
        good_match = False
        return good_match, matches, leftover_comment
            
    good_match = True if len(cleaned_matches) > 0 else False
    return good_match, cleaned_matches, leftover_comment

comment = "pH 7: about 41% of maximal activity, pH 8: 59% of maximal activity, oxidase activity"
# pattern = r"pH (\d+\.?\d*): [\s\w]*(\d\d%) [\s\w]*maximal activity,?"
pattern = r"pH (\d+\.?\d*): [about ]*(\d\d%) [\s\w]*maximal activity,?"

extract_ph_range_from_comment(comment, pattern, captures=['ph', 'act'])

(True,
 [('pH 7: about 41% of maximal activity,', 7.0, 0.41),
  ('pH 8: 59% of maximal activity,', 8.0, 0.59)],
 'oxidase activity')

In [43]:
# # testing patterns
# comments = ["more than 70% of maximum activity within",
#             # "more than 95% of maximum activity",
#             # "more than 60% activity between pH 5.0 and 8.0",
#             # "over 80% of amximal activity",
#             # "50% of maximum activity",
#             # "about 40% activity at pH 6.5, more than 70% activity between pH 7.0 and 8.0",
#             # "more than 60% of maximal activity between pH 7.5 and 8.5",
#             # "approx. a 20% of maximal activity at pH 6.5, approx. 90% of maximal activity at pH 9.0",
#             "about 55% activity at pH 7.0, 100% activity at pH around 8.0, about 80% activity at pH 9.0"
#            ]
# # pattern = r"[more than]*[over]*(\d\d%)[of ]*[maximumal]* activity,? ?"
# pattern = (r"[about ]*(\d+%)[of ]*[maximumal ]* activity[ is observed]*"
#            r" at pH[ around]* (\d+\.?\d*),? ?")
# fuzzy_pattern = f'({pattern}){{e<={1}}}'
# real_pattern = re.compile(pattern)
# for comment in comments:
#     print(regex.findall(fuzzy_pattern, comment))
#     extracted_vals = real_pattern.findall(comment)
#     texts = [x.group() for x in real_pattern.finditer(comment)]
#     matches = [[t] + (list(e) if isinstance(e, tuple) else [e])  for t, e in zip(texts, extracted_vals)]
#     print(matches)
#     # print(real_pattern.findall(comment))

In [68]:
def get_ph_act_from_comments(patterns, data, error_limit=1):
    """ Function to extract the pH activity from the Brenda comments fields 
    *patterns*: list of tuples with 1) regex pattern, 2) capture labels (e.g., ('ph', 'act'))
    *data*: dataframe with a 'comments' column
    """
    data_remaining = data.copy()
    all_results = []
    for pattern, captures in patterns:
        print('\n' + pattern)
        results = data_remaining.comments.apply(lambda x: extract_ph_range_from_comment(x, pattern, captures, error_limit=error_limit))
        # too_much_leftover = {i: (r, leftover) for i, (gm, r, leftover) in results.to_dict().items()
        #                      if gm is False and len(r) > 0}
        # print(len(too_much_leftover))
        # 
        matching_results = {i: (r, leftover) for i, (gm, r, leftover) in results.to_dict().items()
                            if gm}
        print(len(matching_results))
        if 'ph' in captures:
            table = [[i, ph, act, leftover] 
                 for i, (rs, leftover) in matching_results.items() 
                 for match, ph, act in rs]
        else:
            table = []
            for i, (rs, leftover) in matching_results.items():
                ph_min = data_remaining.loc[i].ph_min
                ph_min = float(ph_min) if ph_min is not None and ph_min != "-" else None
                ph_max = data_remaining.loc[i].ph_max
                ph_max = float(ph_max) if ph_max is not None and ph_max != "-" else None
                if ph_min is None and ph_max is None:
                    continue
                    
                for match, _, act in rs:
                    table.append([i, ph_min, act, leftover])
                    table.append([i, ph_max, act, leftover])

        data_act = pd.DataFrame(table, columns=['exp_index', 'ph', 'activity', 'note'])
        print(data_act)
        data_act = data_act.replace("None", np.nan).dropna(subset=['ph'])
        data_act['note'] = data_act.note.replace("#\d+", "", regex=True) \
                                        .replace("\d+#", "", regex=True) \
                                        .replace("#", "", regex=True) \
                                        .replace("<\d+>", "", regex=True)
        data_act = data_act.sort_values("exp_index")
        print(len(data_act))


        all_results.append((data_act, matching_results))

        data_remaining = data_remaining[~data_remaining.index.isin(data_act.exp_index)]
        print(len(data_remaining), " data remaining")
        
    # TODO for activity range comments, give those a value of 10% (?)
    # may not always match
        
    all_data_act = pd.concat([da for da, _ in all_results])
    return all_data_act, all_results, data_remaining

In [69]:
# build the list of regex patterns 
activity_words = r"(?:about)?(?:over)?(?:approx\.)?(?:more than)?(?:less than)?"
observed_words = r"(?:is observed)?(?:remains)?(?:remaining)?(?:around)?(?:is maintained)?(?: from)? ?(?:between)?(?:within this)? ?"
maximum_words = r" ?(?:of )?(?:the )?(?:optimal)?(?:optimum)?(?:maximum)?(?:maximal)?(?:residual)? ?"
patterns = [
    # leave this pattern first since it seems to capture pretty specific cases
    (r"pH (\d+\.?\d*): [about ]*(\d+%)" + maximum_words + "activity,?", 
     ['ph', 'act']),
    # about 40% activity at pH 6.5, more than 70% activity between pH 7.0 and 8.0
    ((activity_words + r" ?(\d\d%)" + maximum_words + "activity[ is observed]* at pH (\d+\.?\d*), " +
      activity_words + r" ?(\d\d%)" + maximum_words + "activity " +
      r"[between]*[at]* pH (\d+\.?\d*) and [at ]*[pH ]*(\d+\.?\d*)"),
     ['act', 'ph', 'act', 'ph', 'ph']),
    # optimal activity at pH 8.0-9.0, 80% of maximal activity between pH 7.0 and pH 10.0
    ((r"optimal activity[ is observed]* at pH (\d+\.?\d*-?\d*\.?\d*), " +
      activity_words + r" ?(\d+%)" + maximum_words + "activity ?" +
      observed_words + r"[ at]* pH (\d+\.?\d*) and [at ]*[pH ]*(\d+\.?\d*)"),
     ['opt_ph', 'act', 'ph', 'ph']),
    # optimal activity at pH 8.0-9.0, 80% of maximal activity at pH 7.0
    ((r"optimal activity[ is observed]* at pH (\d+\.?\d*-?\d*\.?\d*), " +
      activity_words + r" ?(\d+%)" + maximum_words + "activity" +
      r"[ at]* pH (\d+\.?\d*)"),
     ['opt_ph', 'act', 'ph']),
    # 70% of maximum activity at pH 3, no activity at pH 7.5
    ((activity_words + r" ?(\d+%)" + maximum_words + "activity[ is observed]* [at]* pH (\d+\.?\d*), " +
      r"no activity at pH (\d+\.?\d*)"), 
     ['act', 'ph', 'no_act_ph']),
    # more than 60% of maximal activity between pH 7.5 and 8.5
    ((activity_words + r" ?(\d+%)" + maximum_words + "activity " +
      observed_words + r"[ at]* pH (\d+\.?\d*) and [at ]*[pH ]*(\d+\.?\d*)"), 
     ['act', 'ph', 'ph']),
    # about 55% activity at pH 7.0, 100% activity at pH around 8.0, about 80% activity at pH 9.0
    # ((r"[about]{4,5}?[over]{3,4}?[approx\.]{6,7}?[more than]{8,9}? ?(\d+%)[of ]*[maximumal ]* activity[ is observed]*"
    ((activity_words + r" ?(\d+%)" + maximum_words + "activity " +
    # ((r"[about ]*(\d++%)[of ]*[maximumal ]* activity[ is observed]*"
      observed_words + r"[ at]* pH[ around]* (\d+\.?\d*),? ?"),
     ['act', 'ph']),
    # 8.0-9.0: maximal activity, 10.5: 70% of maximal activity
    ((r"(\d+\.?\d*-?\d?\.?\d?): maximal activity, (\d+\.?\d*): (\d+%)" +
      maximum_words + "activity"),
     ['opt_ph', 'ph', 'act']),
    # 7.0: about 45% of maximal activity, 9.5: about 95% of maximal activity
    ((r"(\d+\.?\d*): " + activity_words + " ?(\d+%)" + maximum_words + "activity,?"),
     ['ph', 'act']),
    # over 70% of the maximum activity within this pH range
    ((activity_words + r" ?(\d+%)" + maximum_words + "activity " +
      observed_words + r"(?:pH ?)(?:range)"), 
     ['act']),
    # this is kind of the catch-all at the end
    # more than 95% of maximum activity
    # 50% of maximum activity
    (activity_words + r" ?(\d+%)" + maximum_words + "activity,? ?", ['act']),
]

In [70]:
# TODO automate these test cases
test_cases = [
    dict(ph_min=4.0, ph_max=7.5,
         comments="pH 4.0: about 80% of maximal activity, pH 7.5: about 60% of maximal activity",
         activities=[(4.0, 80), (7.5, 60)]),
    dict(ph_min=6.5, ph_max=8.0, 
         comments="about 40% activity at pH 6.5, more than 70% activity between pH 7.0 and 8.0",
         activities=[(6.5, 40), (7.0, 70), (8.0, 70)]),
    dict(ph_min=7.5, ph_max=8.5, 
         comments="more than 60% of maximal activity between pH 7.5 and 8.5",
         activities=[(7.5, 60), (8.5, 60)]),
    dict(ph_min=7.0, ph_max=9.0,
         comments="optimal activity at pH 8.0-9.0, 80% of maximal activity at pH 7.0", 
         activities=[(8.0, 95), (9.0, 95), (7.0, 80)]),
    dict(ph_min=7.0, ph_max=10.0,
         comments="optimal activity at pH 8.0-9.0, 80% of maximal activity at pH 7.0 and pH 10.0", 
         activities=[(8.0, 95), (9.0, 95), (7.0, 80), (10.0, 80)]),
    dict(ph_min=None, ph_max=None,
         comments="70% of maximum activity at pH 3, no activity at pH 7.5",
        activities=[(3.0, 70), (7.5, 0)]),
    # this comment would just use the pH listed in the 'ph_min' field
    dict(ph_min=7.0,
         comments="more than 95% of maximum activity",
         activities=[(7.0, 95)]),
    dict(ph_min=7.0, ph_max=10.0,
         comments="about 55% activity at pH 7.0, 100% activity at pH around 8.0, about 80% activity at pH 9.0, about 40% activity at pH 10.0",
         activities=[(7.0, 55), (8.0, 100), (9.0, 80), (10.0, 40)]),
    dict(ph_min=5.5, ph_max=9.5,
         comments="about 80% activity at pH 5.5, 100% activity at pH 6.5, about 90% activity at pH 7.5",
         activities=[(5.5, 80), (6.5, 100), (7.5, 90)]),
    #TODO: 68% of maximal activity at pH 6.3 and 88% at pH 8.8
    dict(ph_min=7.0, ph_max=9.5,
         comments="7.0: about 45% of maximal activity, 9.5: about 95% of maximal activity ",
         activities=[(7.0, 45), (9.5, 95)]),
    dict(ph_min=8.0, ph_max=10.5,
         comments="8.0-9.0: maximal activity, 10.5: 70% of maximal activity",
         activities=[(8.0, 95), (9.0, 95), (10.5, 70)]),
    dict(ph_min=5, ph_max=7,
         comments="about half-maximal activity at pH 5.0 and 7.0",
         activities=[(5, 50), (7, 50)]),
]

In [71]:
# test_data = pd.DataFrame(pd.Series([c for c, _ in test_cases]), columns=['comments'])
test_data = pd.DataFrame(test_cases)
# test_data = test_data.tail(1)
test_data

Unnamed: 0,ph_min,ph_max,comments,activities
0,4.0,7.5,"pH 4.0: about 80% of maximal activity, pH 7.5:...","[(4.0, 80), (7.5, 60)]"
1,6.5,8.0,"about 40% activity at pH 6.5, more than 70% ac...","[(6.5, 40), (7.0, 70), (8.0, 70)]"
2,7.5,8.5,more than 60% of maximal activity between pH 7...,"[(7.5, 60), (8.5, 60)]"
3,7.0,9.0,"optimal activity at pH 8.0-9.0, 80% of maximal...","[(8.0, 95), (9.0, 95), (7.0, 80)]"
4,7.0,10.0,"optimal activity at pH 8.0-9.0, 80% of maximal...","[(8.0, 95), (9.0, 95), (7.0, 80), (10.0, 80)]"
5,,,"70% of maximum activity at pH 3, no activity a...","[(3.0, 70), (7.5, 0)]"
6,7.0,,more than 95% of maximum activity,"[(7.0, 95)]"
7,7.0,10.0,"about 55% activity at pH 7.0, 100% activity at...","[(7.0, 55), (8.0, 100), (9.0, 80), (10.0, 40)]"
8,5.5,9.5,"about 80% activity at pH 5.5, 100% activity at...","[(5.5, 80), (6.5, 100), (7.5, 90)]"
9,7.0,9.5,"7.0: about 45% of maximal activity, 9.5: about...","[(7.0, 45), (9.5, 95)]"


In [72]:
test_data_act, test_results, test_data_remaining = get_ph_act_from_comments(patterns, test_data)
test_data_act


pH (\d+\.?\d*): [about ]*(\d+%) ?(?:of )?(?:the )?(?:optimal)?(?:optimum)?(?:maximum)?(?:maximal)?(?:residual)? ?activity,?
1
   exp_index   ph  activity note
0          0  4.0       0.8     
1          0  7.5       0.6     
2
11  data remaining

(?:about)?(?:over)?(?:approx\.)?(?:more than)?(?:less than)? ?(\d\d%) ?(?:of )?(?:the )?(?:optimal)?(?:optimum)?(?:maximum)?(?:maximal)?(?:residual)? ?activity[ is observed]* at pH (\d+\.?\d*), (?:about)?(?:over)?(?:approx\.)?(?:more than)?(?:less than)? ?(\d\d%) ?(?:of )?(?:the )?(?:optimal)?(?:optimum)?(?:maximum)?(?:maximal)?(?:residual)? ?activity [between]*[at]* pH (\d+\.?\d*) and [at ]*[pH ]*(\d+\.?\d*)
1
   exp_index   ph  activity note
0          1  6.5       0.4     
1          1  7.0       0.7     
2          1  8.0       0.7     
3
10  data remaining

optimal activity[ is observed]* at pH (\d+\.?\d*-?\d*\.?\d*), (?:about)?(?:over)?(?:approx\.)?(?:more than)?(?:less than)? ?(\d+%) ?(?:of )?(?:the )?(?:optimal)?(?:optimum)?(?:maximum

Unnamed: 0,exp_index,ph,activity,note
0,0,4.0,0.8,
1,0,7.5,0.6,
0,1,6.5,0.4,
1,1,7.0,0.7,
2,1,8.0,0.7,
0,4,8.0,0.95,
1,4,9.0,0.95,
2,4,7.0,0.8,
3,4,10.0,0.8,
0,3,8.0,0.95,


In [73]:
# now extract the ph activity data from the whole dataset
data_act, all_results, data_remaining = get_ph_act_from_comments(patterns, data_uniprot)
data_act


pH (\d+\.?\d*): [about ]*(\d+%) ?(?:of )?(?:the )?(?:optimal)?(?:optimum)?(?:maximum)?(?:maximal)?(?:residual)? ?activity,?
Unable to extract float for one of the values:
('pH 8-0: about 50% of maximal activity', '8-0', '50%') pH 5.5: about 30% of maximal activity, pH 8-0: about 50% of maximal activity
Unable to extract float for one of the values:
('pH 8-0: about 50% of maximal activity', '8-0', '50%') pH 5.5: about 30% of maximal activity, pH 8-0: about 50% of maximal activity
Unable to extract float for one of the values:
('pH 8-0: about 50% of maximal activity', '8-0', '50%') pH 5.5: about 30% of maximal activity, pH 8-0: about 50% of maximal activity
Unable to extract float for one of the values:
('pH 5.8: about% of maximal activity,', '5.8', '%') pH 5.8: about% of maximal activity, pH 8.8: about% of maximal activity
Unable to extract float for one of the values:
('pH 8.8: about% of maximal activity', '8.8', '%') pH 5.8: about% of maximal activity, pH 8.8: about% of maximal activ

Unnamed: 0,exp_index,ph,activity,note
0,1,4.0,0.80,
1,1,7.5,0.60,
2,3,4.9,0.25,reduction reaction
3,3,5.8,0.40,reduction reaction
4,4,4.9,0.25,reduction reaction
...,...,...,...,...
1502,6391,10.0,0.75,ATP synthesis in washed membranes of Haloferax...
1504,6422,6.8,0.95,
1506,6423,6.5,0.99,
1508,6434,8.5,0.85,L-glutamine-dependent activity catalyzed by IG...


In [74]:
data_uniprot.tail()

Unnamed: 0,ec_num,name,ph_min,ph_max,comments,organism,uniprot_id,ref
6435,4.3.1.B2,imidazole glycerol phosphate synthase,9.5,-,"22% of maximal activity, L-glutamine-dependent...",Escherichia coli,P60664,726928
6436,4.4.1.31,phycoerythrocyanin alpha-cysteine-84 phycoviol...,7.0,8.5,"pH 7.0: about 40% of maximal activity, pH 8.5:...",Mastigocladus laminosus,P29730,731742
6437,5.4.99.64,2-hydroxyisobutanoyl-CoA mutase,6.2,7,"pH 6.2: about 60% of maximal activity, pH 7.0:...",Aquincola tertiaricarbonis,I3VE74,736409
6438,7.1.1.7,"quinol oxidase (electrogenic, proton-motive fo...",5.5,-,"35% of the activity at pH 7.0, substrate H2O2",Escherichia coli,P0ABK2,752197
6439,7.1.1.7,"quinol oxidase (electrogenic, proton-motive fo...",5.5,-,"35% of the activity at pH 7.0, substrate H2O2",Escherichia coli,P56100,752197


In [75]:
data_uniprot.iloc[6375].comments

'7.2-8.8: maximal activity, 9.6: about 50% of maximal activity'

In [76]:
data_act.tail()

Unnamed: 0,exp_index,ph,activity,note
1502,6391,10.0,0.75,ATP synthesis in washed membranes of Haloferax...
1504,6422,6.8,0.95,
1506,6423,6.5,0.99,
1508,6434,8.5,0.85,L-glutamine-dependent activity catalyzed by IG...
1510,6435,9.5,0.22,L-glutamine-dependent activity catalyzed by IG...


#### Apply manual fixes to data
TODO unfortunately I made the manual fixes based on the exp_index, but that has changed...
- In most cases I should be able to line up the uniprot_id, species, ph_min and ph_max

In [77]:
# add the manually parsed data and manually fixed data
manual_data_act_file = Path(data_dir, "parsed_from_bulk_download/ph_range/20230907_fixed_manual.csv")
manual_data_act = pd.read_csv(manual_data_act_file)
manual_data_act.dropna(subset=['activity'], inplace=True)
manual_data_act = manual_data_act[['exp_index', 'ph', 'activity', 'note', 'ph_range_comment']]
print(manual_data_act.exp_index.nunique(), len(manual_data_act))
manual_data_act.head(2)

274 687


Unnamed: 0,exp_index,ph,activity,note,ph_range_comment
1,20,11.0,0.6,"oxidation of D-arabitol with NAD+, within this...","oxidation of D-arabitol with NAD+, over 60% o..."
2,20,6.0,0.6,"oxidation of D-arabitol with NAD+, within this...","oxidation of D-arabitol with NAD+, over 60% o..."


In [78]:
ph_range_file = f"{data_dir}/parsed_from_bulk_download/ph_range/20230719_ph_range_processed.csv"
prev_data = pd.read_csv(ph_range_file, index_col=0)
print(len(prev_data))
prev_data.head(2)

5842


Unnamed: 0,ec_num,uniprot_id,species,ph_min,ph_max,references,comments
0,1.1.1.1,B2KJ46,Candida parapsilosis,4.0,7.5,17435004,"pH 4.0: about 80% of maximal activity, pH 7.5..."
1,1.1.1.1,Q6L0S1,Picrophilus torridus,7.0,8.0,17989975,"pH 7: 41% of maximal activity, pH 8: 59% of m..."


In [79]:
prev_data2 = prev_data.loc[[int(i) for i in manual_data_act.exp_index.unique()]]
len(prev_data2)

274

In [80]:
prev_data2.tail()

Unnamed: 0,ec_num,uniprot_id,species,ph_min,ph_max,references,comments
1633,3.2.1.14,Q47SE9,Thermobifida fusca,4.8,7.5,30094208,the enzyme tends to be active under an acidic...
1640,3.2.1.14,B3A042,Ananas comosus,2.0,12.0,18323646,"activity range, inactive at pH 13 and pH 1.0"
19,1.1.1.105,Q8VIJ7,Mus musculus,7.5,8.0,11562362,"pH 7.5: 54% of maximal activity, pH 8: maxima..."
179,1.1.1.9,A0A141BGH5,Gluconobacter oxydans,10.0,12.0,26975753,over 60% of maximal activity within this rang...
178,1.1.1.9,A0A141BGH5,Gluconobacter oxydans,4.0,8.0,1365425726975753,


In [81]:
manual_data_act.exp_index.astype(int).max()

5733

In [82]:
tqdm.pandas()

In [83]:
# see if we can match up the previous data rows with the current rows
def build_exp_idx_mapping(row):
    ph_max = str(float(row.ph_max)) if row.ph_max != "-" else "None"
    ph_min = str(float(row.ph_min)) if row.ph_min != "-" else "None"
    # print(row, ph_min, ph_max)
    matching_rows = prev_data2[(prev_data2.ec_num == row.ec_num) &
                                    (prev_data2.ph_min == ph_min) &
                                    (prev_data2.ph_max == ph_max) &
                                    (prev_data2.species == row.organism) &
                                    (prev_data2.uniprot_id == row.uniprot_id)
                                    # (prev_data2.comments == row.comments)
                                   ]
    if len(matching_rows) == 1:
        # print(matching_rows)
        return matching_rows.index[0]
    if len(matching_rows) > 1:
        print(len(matching_rows), row)
    
# map the current index to the old index
manual_exp_idx_map = data_uniprot.progress_apply(build_exp_idx_mapping, axis=1)
manual_exp_idx_map = manual_exp_idx_map.dropna()
len(manual_exp_idx_map)

  0%|          | 0/6440 [00:00<?, ?it/s]

275

In [84]:
prev_data2.head(2)

Unnamed: 0,ec_num,uniprot_id,species,ph_min,ph_max,references,comments
20,1.1.1.11,A0A2K9VPX3,Gluconobacter sp. JX-05,6.0,11.0,28364633,"oxidation of D-arabitol with NAD+, over 60% o..."
21,1.1.1.11,A0A2K9VPX3,Gluconobacter sp. JX-05,3.0,7.0,28364633,"reduction of D-xylulose with NADH, over 80% o..."


In [85]:
manual_exp_idx_map = manual_exp_idx_map.dropna()

In [86]:
manual_exp_idx_map.value_counts()

941.0     2
1684.0    2
1099.0    2
1103.0    2
1133.0    2
         ..
651.0     1
654.0     1
666.0     1
665.0     1
5733.0    1
Length: 269, dtype: int64

In [87]:
len(prev_data2)

274

In [88]:
prev_data2[~prev_data2.index.isin(manual_exp_idx_map.values)]

Unnamed: 0,ec_num,uniprot_id,species,ph_min,ph_max,references,comments
118,1.1.1.307,A0A0M4HL56,Debaryomyces nepalensis,5.0,10.0,32356119.0,"over 50% of maximal activity at pH 5.0-10.0, ..."
3105,1.1.1.40,P43279,Oryza sativa,7.3,8.3,16290176.0,"pH: about 65% of maximal activity, pH: about ..."
3792,2.3.1.24,Q3ZBF8,Bos taurus,6.5,8.0,8593432.0,"over 50% ox maximal activity at pH 6.5-8.0, i..."
3795,2.3.1.24,Q5E9R6,Bos taurus,6.5,8.0,8593432.0,"over 50% ox maximal activity at pH 6.5-8.0, i..."
1547,3.13.1.8,O58212,Pyrococcus horikoshii,5.0,10.0,,the enzyme is inactive below pH 5.0 and above...


In [89]:
data_uniprot.loc[209]

ec_num                                                1.1.1.431
name                                 D-xylose reductase (NADPH)
ph_min                                                        5
ph_max                                                       10
comments      over 50% of maximal activity at pH 5.0-10.0, r...
organism                                Debaryomyces nepalensis
uniprot_id                                           A0A0M4HL56
ref                                                      761017
Name: 209, dtype: object

In [90]:
manual_exp_idx_map = manual_exp_idx_map.to_dict()
# looks like the ec number changed
manual_exp_idx_map[209] = 118

In [91]:
manual_exp_idx_map_to_new = {int(j): int(i) for i, j in manual_exp_idx_map.items()}

In [92]:
new_manual_fix_data = manual_data_act.copy()
new_manual_fix_data['exp_index'] = new_manual_fix_data.exp_index.astype(int).replace(manual_exp_idx_map_to_new)

new_manual_fix_data

Unnamed: 0,exp_index,ph,activity,note,ph_range_comment
1,23,11.0,0.6,"oxidation of D-arabitol with NAD+, within this...","oxidation of D-arabitol with NAD+, over 60% o..."
2,23,6.0,0.6,"oxidation of D-arabitol with NAD+, within this...","oxidation of D-arabitol with NAD+, over 60% o..."
3,22,7.0,0.8,"reduction of D-xylulose with NADH, within this...","reduction of D-xylulose with NADH, over 80% o..."
4,22,3.0,0.8,"reduction of D-xylulose with NADH, within this...","reduction of D-xylulose with NADH, over 80% o..."
5,221,9.5,0.7,,"about 80% activity at pH 5.5, 100% activity a..."
...,...,...,...,...,...
723,240,10.0,0.6,,over 60% of maximal activity within this rang...
724,240,11.0,1.0,,
725,232,5.0,1.0,,over 60% of maximal activity within this range...
726,232,4.0,0.6,,over 60% of maximal activity within this range...


In [93]:
print(new_manual_fix_data.exp_index.nunique(), len(new_manual_fix_data))

274 687


In [94]:
print(data_act.exp_index.nunique(), len(data_act))
data_act = data_act[~data_act.exp_index.isin(new_manual_fix_data.exp_index)]
print(data_act.exp_index.nunique(), len(data_act))
data_act = pd.concat([data_act, new_manual_fix_data], ignore_index=True).reset_index(drop=True)
print(data_act.exp_index.nunique(), len(data_act))

3030 5830
2904 5601
3178 6288


In [100]:
data_act['ph_range_comment'] = data_act.exp_index.apply(
    lambda x: data_uniprot.loc[x].comments)
data_act['uniprot_id'] = data_act.exp_index.apply(
    lambda x: data_uniprot.loc[x].uniprot_id)

In [101]:
data_act

Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
0,1,4.0,0.80,,"pH 4.0: about 80% of maximal activity, pH 7.5:...",B2KJ46
1,1,7.5,0.60,,"pH 4.0: about 80% of maximal activity, pH 7.5:...",B2KJ46
2,3,4.9,0.25,reduction reaction,"pH 4.9: about 25% of maximal activity, pH 5.8:...",Q4J702
3,3,5.8,0.40,reduction reaction,"pH 4.9: about 25% of maximal activity, pH 5.8:...",Q4J702
4,4,4.9,0.25,reduction reaction,"pH 4.9: about 25% of maximal activity, pH 5.8:...",Q4J9F2
...,...,...,...,...,...,...
6283,240,10.0,0.60,,over 60% of maximal activity within this range...,A0A141BGH5
6284,240,11.0,1.00,,over 60% of maximal activity within this range...,A0A141BGH5
6285,232,5.0,1.00,,over 60% of maximal activity within this range...,A0A141BGH5
6286,232,4.0,0.60,,over 60% of maximal activity within this range...,A0A141BGH5


In [96]:
data_act.exp_index.max()

6439

In [98]:
data_uniprot.index.max()

6439

In [102]:
# Also correct these mistakes
data_act[data_act.ph > 13]

Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
2775,628,76.2,0.95,,95% of maximal activity at pH 76.2 and pH 7.8,Q84V83
2912,1683,66.0,0.87,the recombinant enzyme exhibits% at pH 9.0,the recombinant enzyme exhibits 87% of maximal...,Q8RJP5
2965,2255,20.0,0.9,"recombinant His-tagged isozyme betafruct2, % a...","recombinant His-tagged isozyme betafruct2, abo...",Q2XQ19
2968,2257,20.0,0.9,"recombinant His-tagged isozyme betafruct2, % a...","recombinant His-tagged isozyme betafruct2, abo...",Q2XQ21
3238,4386,90.0,0.5,"tryptamine, % at pH 8.8, transferase B","tryptamine, about half-maximal activity at pH ...",O97972
3304,4592,30.0,0.1,% of maximal activity at pH 10.0,10% of maximal activity at pH 6.5 and 30% of m...,B4G072
3306,4593,30.0,0.1,% of maximal activity at pH 10.0,10% of maximal activity at pH 6.5 and 30% of m...,Q8W2B7


In [103]:
data_act[data_act.ph > 13].ph_range_comment.values

array(['95% of maximal activity at pH 76.2 and pH 7.8',
       'the recombinant enzyme exhibits 87% of maximal activity at pH 6.0 and 66% at pH 9.0',
       'recombinant His-tagged isozyme betafruct2, about 90% of maximal activity at pH 2.0 and 20% at pH 4.5',
       'recombinant His-tagged isozyme betafruct2, about 90% of maximal activity at pH 2.0 and 20% at pH 4.5',
       'tryptamine, about half-maximal activity at pH 7.5 and 90% at pH 8.8, transferase B',
       '10% of maximal activity at pH 6.5 and 30% of maximal activity at pH 10.0',
       '10% of maximal activity at pH 6.5 and 30% of maximal activity at pH 10.0'],
      dtype=object)

In [104]:
fixed_rows = [
    [7.6, 0.95, ""],
    [9.0, .66, ""],
    [4.5, .2, ""],
    [4.5, .2, ""],
    [8.8, 0.9, ""],
    [10.0, 0.3, ""],
    [10.0, 0.3, ""],
]
data_act.loc[data_act.ph > 13,["ph", "activity", "note"]] = fixed_rows

In [105]:
# Also correct these mistakes
print(data_act[data_act.ph < 1].ph_range_comment.values)
data_act[data_act.ph < 1]

['the enzyme has about 60% activity at pH 8.0 and about 70% activity at pH10.0']


Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
4036,3972,0.0,0.7,the enzyme hasand,the enzyme has about 60% activity at pH 8.0 an...,P08170


In [106]:
data_act.loc[data_act.ph < 1, "ph"] = 10.0

In [108]:
# apply other manual fixes here
print(data_act[data_act.exp_index == 6337])
data_uniprot.loc[6337]

     exp_index   ph  activity note  \
2637      6337  9.0      0.55        
2638      6337  9.0      0.50        

                                       ph_range_comment uniprot_id  
2637  pH 9: 55% of maximal activity, pH 9.0: 50% of ...     Q9L9C1  
2638  pH 9: 55% of maximal activity, pH 9.0: 50% of ...     Q9L9C1  


ec_num                                                 6.2.1.30
name                                   phenylacetate-CoA ligase
ph_min                                                        7
ph_max                                                        9
comments      pH 9: 55% of maximal activity, pH 9.0: 50% of ...
organism                                    Aromatoleum evansii
uniprot_id                                               Q9L9C1
ref                                                      648921
Name: 6337, dtype: object

In [109]:
data_act.loc[data_act.exp_index == 6337, 'ph'] = [7.0, 9.0]

In [111]:
data_act.loc[data_act.exp_index == 6265, 'activity'] = [.03, .21]
data_act.loc[data_act.exp_index == 6264, 'activity'] = [.24, .36]

In [112]:
print(data_act.loc[data_act.exp_index == 4792].ph_range_comment.values)
data_act.loc[data_act.exp_index == 4792]

['over 80% of maximal activity at pH 7.5 and pH 11.5, about 60% of maximal activity at pH 6.0, enzyme is completely inactive below pH 5.6'
 'over 80% of maximal activity at pH 7.5 and pH 11.5, about 60% of maximal activity at pH 6.0, enzyme is completely inactive below pH 5.6']


Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
4162,4792,7.5,0.8,"and pH 11.5, enzyme is completely inactive bel...",over 80% of maximal activity at pH 7.5 and pH ...,P37530
4163,4792,6.0,0.6,"and pH 11.5, enzyme is completely inactive bel...",over 80% of maximal activity at pH 7.5 and pH ...,P37530


In [113]:
fix = data_act.loc[data_act.exp_index == 4792]
fix[['ph', 'activity']] = [[11.5, .8], [5.6, 0.0]]
data_act = pd.concat([data_act, fix])
data_act.tail(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fix[['ph', 'activity']] = [[11.5, .8], [5.6, 0.0]]


Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
6287,232,8.0,0.6,,over 60% of maximal activity within this range...,A0A141BGH5
4162,4792,11.5,0.8,"and pH 11.5, enzyme is completely inactive bel...",over 80% of maximal activity at pH 7.5 and pH ...,P37530
4163,4792,5.6,0.0,"and pH 11.5, enzyme is completely inactive bel...",over 80% of maximal activity at pH 7.5 and pH ...,P37530


In [114]:
print(data_act[data_act.exp_index == 2627].ph_range_comment.values)
data_act.loc[data_act.exp_index == 2627]

['the enzyme shows 30%, 55%, 100%, 70%, 15% and less than 5% activity at pH 3.0, 4.0, 5.0, 6.0, 7.0, and 8.0, respectively']


Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
3924,2627,3.0,0.05,"the enzyme shows 30%, 55%, 100%, 70%, 15% and ...","the enzyme shows 30%, 55%, 100%, 70%, 15% and ...",I6XPK9


In [115]:
fix = data_act.loc[data_act.exp_index == 2627]
fix = pd.concat([fix]*6)
fix[['ph', 'activity']] = [[3.0, .3], [4.0, .55], [5.0, 1.0], [6.0, .7], [7.0, .15], [8.0, .05]]
fix['note'] = ""
data_act = data_act[data_act.exp_index != 2627]
data_act = pd.concat([data_act, fix])
data_act.tail(3)

Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
3924,2627,6.0,0.7,,"the enzyme shows 30%, 55%, 100%, 70%, 15% and ...",I6XPK9
3924,2627,7.0,0.15,,"the enzyme shows 30%, 55%, 100%, 70%, 15% and ...",I6XPK9
3924,2627,8.0,0.05,,"the enzyme shows 30%, 55%, 100%, 70%, 15% and ...",I6XPK9


In [116]:
# write to file
out_file = Path(data_dir, "ph_range/20230907_ph_range_processed_act.csv")
print(out_file)
data_act.to_csv(out_file, index=False)

out_file = Path(data_dir, "ph_range/20230907_ph_range_processed.csv")
print(out_file)
data_uniprot.to_csv(out_file, index="exp_index")

/projects/robustmicrob/jlaw/projects/prot_stability_engineering/inputs/brenda/ph_range/20230907_ph_range_processed_act.csv
/projects/robustmicrob/jlaw/projects/prot_stability_engineering/inputs/brenda/ph_range/20230907_ph_range_processed.csv


In [117]:
data_remaining.head(2)

Unnamed: 0,ec_num,name,ph_min,ph_max,comments,organism,uniprot_id,ref
0,1.1.1.1,alcohol dehydrogenase,2.0,8,-,Thermoplasma acidophilum,Q9HIM3,700124
2,1.1.1.1,alcohol dehydrogenase,4.5,9,high catalytic activity within the range of pH...,Saccharomyces cerevisiae,P28625,763285


In [118]:
print(len(data_remaining))
data_remaining2 = data_remaining[~data_remaining.index.isin(data_act.exp_index.unique())]
print(len(data_remaining2))

3410
3262


In [None]:
# data_remaining['ec_num_link'] = data_remaining['ec_num'].apply(lambda ec_num: f"https://www.brenda-enzymes.org/enzyme.php?ecno={ec_num}#pH%20RANGE")

In [119]:
# write to file
out_file = Path(data_dir, "ph_range/20230907_ph_range_data_remaining.csv")
print(out_file)
data_remaining2.to_csv(out_file, index="exp_index")

/projects/robustmicrob/jlaw/projects/prot_stability_engineering/inputs/brenda/ph_range/20230907_ph_range_data_remaining.csv
