In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
import itertools
from typing import List, Tuple
import string
from pathlib import Path
from tqdm.auto import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Move from the notebook's directory up to the project root so that the
# relative "inputs/..." paths used below resolve.
# Guarded: os.chdir("../../") is relative, so re-running this cell in the same
# kernel would otherwise climb two more levels on every execution.
if "PROJECT_ROOT" not in globals():
    os.chdir("../../")
    PROJECT_ROOT = Path.cwd()
os.getcwd()

'/lustre/eaglefs/projects/robustmicrob/jlaw/projects/prot_stability_engineering'

### Load and combine the pH range and pH optimum datasets
1. BRENDA pH range
2. E. coli manual parsing
3. BRENDA pH optimum

In [8]:
def check_ph_act(df, ph_min=1, ph_max=13, act_min=0, act_max=1):
    """Print the rows of `df` whose pH or activity is outside the expected range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide `ph` and `activity` columns.
    ph_min, ph_max : float, optional
        Inclusive bounds of the accepted pH range (defaults: 1 and 13).
    act_min, act_max : float, optional
        Inclusive bounds of the accepted activity range (defaults: 0 and 1).

    Returns
    -------
    None
        The number of offending rows is always printed; the rows themselves
        are printed only when there is at least one.

    Notes
    -----
    Missing values (NaN) are not reported, because every comparison against
    NaN evaluates to False.
    """
    out_of_range = df[
        (df.ph > ph_max) | (df.ph < ph_min)
        | (df.activity > act_max) | (df.activity < act_min)
    ]
    # Keep this variable name: the `=` f-string specifier echoes the
    # expression text, so the name is part of the printed output.
    print(f"{len(out_of_range) = }")
    if len(out_of_range) > 0:
        print(out_of_range)

In [14]:
# load the pH range and opt values
# Directory holding the BRENDA-derived input files, relative to the current
# working directory. Defined here because no earlier cell defines it.
inputs_dir = Path("inputs/brenda")

# Experiment table; its first CSV column becomes the index that
# `data_act.exp_index` is looked up against below.
data_file = Path(inputs_dir, "ph_range/20230719_ph_range_processed.csv")
print(data_file)
data = pd.read_csv(data_file, index_col=0)
print(len(data))

# pH/activity measurements; each row points at an experiment through `exp_index`.
act_file = Path(inputs_dir, "ph_range/20230719_ph_range_processed_act_wopt.csv")
print(act_file)
data_act = pd.read_csv(act_file)
# Attach the UniProt ID of each measurement's experiment with one label-based
# lookup instead of a per-row `.loc` call. An exp_index missing from `data`
# still raises KeyError. `.to_numpy()` drops the looked-up index so the values
# are assigned positionally.
data_act['uniprot_id'] = data.loc[data_act.exp_index, 'uniprot_id'].to_numpy()
print(len(data_act))
data_act.head(2)

inputs/brenda/ph_range/20230719_ph_range_processed.csv
5842
inputs/brenda/ph_range/20230719_ph_range_processed_act_wopt.csv
6341


Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
0,0,4.0,0.8,,"pH 4.0: about 80% of maximal activity, pH 7.5...",B2KJ46
1,0,4.5,1.0,,,B2KJ46


In [10]:
# sanity check: report pH-range rows whose pH or activity is out of range
check_ph_act(data_act)

len(out_of_range) = 0


In [15]:
# number of activity rows per UniProt ID in the pH-range table
data_act.uniprot_id.value_counts()

P00784    33
Q9UKK9    20
Q9UHL4    17
Q6LXQ3    16
Q8H8T0    16
          ..
P19791     1
P10281     1
Q43153     1
Q9ZTK5     1
Q9ZH77     1
Name: uniprot_id, Length: 2125, dtype: int64

In [13]:
# also load the manual E coli dataset
# Experiment metadata, indexed by the first CSV column (exp_index).
data_file = Path(inputs_dir, "ph_range/ecoli/20230712_ph_curve_data_processed.csv")
print(data_file)
data_ecoli = pd.read_csv(data_file, index_col=0)
print(len(data_ecoli))

# pH/activity measurements; each row points at an experiment through `exp_index`.
act_file = Path(inputs_dir, "ph_range/ecoli/20230712_ph_curve_data_processed_activities.csv")
print(act_file)
data_act_ecoli = pd.read_csv(act_file)
# Attach the UniProt ID of each measurement's experiment with one label-based
# lookup instead of a per-row `.loc` call. An exp_index missing from
# `data_ecoli` still raises KeyError. `.to_numpy()` drops the looked-up index
# so the values are assigned positionally.
data_act_ecoli['uniprot_id'] = data_ecoli.loc[data_act_ecoli.exp_index, 'uniprot_id'].to_numpy()
print(len(data_act_ecoli))
data_act_ecoli.head(2)

inputs/brenda/ph_range/ecoli/20230712_ph_curve_data_processed.csv
282
inputs/brenda/ph_range/ecoli/20230712_ph_curve_data_processed_activities.csv
3165


Unnamed: 0,exp_index,pH,activity,uniprot_id
0,0,7.2,0.012428,P21515
1,0,7.5,0.090313,P21515


In [30]:
# peek at the first two rows of the E. coli experiment metadata
data_ecoli.head(2)

Unnamed: 0_level_0,gene,uniprot_id,b_number,data_type,mutation,doi,figure_id,time,condition,reaction_note,curator
exp_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,acpH,P21515,b0404,ph,,10.1016/S0021-9258(18)96069-2,fig4,30,33,,JL
3,speA,P21170,b2938,ph,wt,10.1016/j.molcatb.2015.06.008,,15,4,,JL


In [17]:
# number of activity rows per UniProt ID in the manual E. coli table
data_act_ecoli.uniprot_id.value_counts()

P25524    96
P19926    84
P06721    70
P00864    53
P10902    53
          ..
P17444     5
P37197     5
P0A9T0     4
P0A725     3
P45568     2
Name: uniprot_id, Length: 195, dtype: int64

In [18]:
# load the pHopt values (tab-separated file)
opt_file = "inputs/brenda/ph_opt_data_no_assay_at_no_mutant.tsv"
data_opt = pd.read_csv(opt_file, sep='\t')
# distinct UniProt IDs, followed by the total number of rows
print(data_opt['uniprot_id'].nunique(), len(data_opt))
data_opt.head(2)

6777 8367


Unnamed: 0,ec_num,uniprot_id,opt,species,references,comments
0,1.1.1.1,D4GSN2,11.0,Haloferax volcanii,70113753816781278999822526808,"#82# oxidation of ethanol, pyrazole-insensitiv..."
1,1.1.1.1,D4GSN2,6.0,Haloferax volcanii,"12834271,16000771,17072683,16390205,16243283,1...","#96,122# reduction of acetaldehyde <144,255>; ..."


In [19]:
# number of pH-optimum rows per UniProt ID
data_opt.uniprot_id.value_counts()

P08659    9
P13006    8
Q9YBQ2    8
P07102    7
P00784    7
         ..
P19938    1
Q9P9M8    1
P24298    1
Q8NT48    1
Q0PCR9    1
Name: uniprot_id, Length: 6777, dtype: int64

In [20]:
# show every pH-optimum row of the two UniProt IDs with the most rows
top2_ids = data_opt.uniprot_id.value_counts().head(2).index
data_opt[data_opt.uniprot_id.isin(top2_ids)]

Unnamed: 0,ec_num,uniprot_id,opt,species,references,comments
726,1.1.3.4,P13006,5.8,Aspergillus niger,18584742,#13# immobilized and free form of the enzyme <8>
727,1.1.3.4,P13006,4.0,Aspergillus niger,,
728,1.1.3.4,P13006,5.0-7.0,Aspergillus niger,30197862,#29# recombinant enzyme <160>
729,1.1.3.4,P13006,6.86,Aspergillus niger,18421724,#29# glucose oxidase-immobilized polypyrrole/a...
730,1.1.3.4,P13006,5.5-6.0,Aspergillus niger,234210283542821932088,
731,1.1.3.4,P13006,5.5,Aspergillus niger,"3592653,16788719,23096254,23682574,33329498,28...","#3# free enzyme <16>; #29,44# recombinant enzy..."
732,1.1.3.4,P13006,5.0,Aspergillus niger,19549540250619352356273627869125,#3# enzyme immobilized onto alumina <14>; #9# ...
733,1.1.3.4,P13006,6.0,Aspergillus niger,"12182830,15932267,19784554,21040747,10749686,1...",#26# native and recombinant enzyme <72>; #37# ...
1151,1.13.12.7,P08659,8.0,Photinus pyralis,"18465021,18251715,19661129,22155276,26049097,2...",#1# wild-type <111>; #17# wild-type enzyme <125>
1152,1.13.12.7,P08659,7.6,Photinus pyralis,719121719859663,#1# tricine buffer <22>; #17# yellow-green emi...


In [21]:
# Reduce every pH-optimum entry to one number; ranges such as "5.0-7.0" become
# their midpoint. (The per-UniProt-ID average is taken in the next cell.)
def _ph_opt_to_float(value):
    """Convert one pH-optimum entry to a float.

    A string range such as "5.0-7.0" is replaced by the mean of its
    endpoints; a single value such as "5.8" is converted directly. Values
    that are not strings (e.g. already-numeric entries) are passed to
    `float` unchanged instead of failing on the `'-'` membership test.
    """
    if not isinstance(value, str):
        return float(value)
    # split once and average every endpoint found
    endpoints = [float(part) for part in value.split('-')]
    return float(np.mean(endpoints))

data_opt['opt_mean'] = data_opt.opt.apply(_ph_opt_to_float).astype(float)

In [22]:
# average the per-entry optima of each UniProt ID
data_opt_mean = data_opt.groupby('uniprot_id')[['opt_mean']].mean()
data_opt_mean

Unnamed: 0_level_0,opt_mean
uniprot_id,Unnamed: 1_level_1
A0A016VAH2,7.500000
A0A023I760,8.000000
A0A023J5W7,4.250000
A0A023MIF8,4.650000
A0A023UHK6,11.000000
...,...
X5DTB0,5.000000
X5I0R2,6.000000
X5IDD2,7.500000
X5JA14,4.500000


In [23]:
# The pH-range table was already matched with the pH-optimum data, so only keep
# the UniProt IDs that do not appear in it.
already_matched = data_opt_mean.index.isin(data_act.uniprot_id.unique())
data_opt_mean_uniq = (
    data_opt_mean[~already_matched]
    .reset_index()
    .rename(columns={'opt_mean': 'ph'})
)
# the optimum pH is entered as an activity of 1
data_opt_mean_uniq['activity'] = 1
print(len(data_opt_mean_uniq))
data_opt_mean_uniq.head(2)

5729


Unnamed: 0,uniprot_id,ph,activity
0,A0A016VAH2,7.5,1
1,A0A023I760,8.0,1


In [24]:
# sanity check: report pH-optimum rows whose pH or activity is out of range
check_ph_act(data_opt_mean_uniq)

len(out_of_range) = 0


In [28]:
# use the same pH column name as the other tables, then tag the source of the rows
data_act_ecoli = data_act_ecoli.rename(columns={'pH': 'ph'})
data_act_ecoli['note'] = 'ecoli_man'
# Fix the activities just below 0
is_negative = data_act_ecoli['activity'] < 0
data_act_ecoli.loc[is_negative, 'activity'] = 0

In [29]:
# sanity check: report E. coli rows whose pH or activity is out of range
# NOTE(review): the recorded output lists 14 rows (activities slightly above 1
# and one pH of 13.5); no visible cell corrects them before the tables are
# concatenated -- confirm this is intended.
check_ph_act(data_act_ecoli)

len(out_of_range) = 14
      exp_index    ph  activity uniprot_id       note
75           16   9.1  1.005900     P21514  ecoli_man
196          37  10.0  1.001300     P0A6C5  ecoli_man
197          37  10.5  1.004000     P0A6C5  ecoli_man
205          38   7.0  1.050000     P0A6C5  ecoli_man
335          48   8.0  1.001639     P0AC88  ecoli_man
359          50   7.9  1.001270     P09126  ecoli_man
525          66   7.5  1.001060     P27306  ecoli_man
545          68   5.3  1.002049     P07658  ecoli_man
546          68   5.8  1.016393     P07658  ecoli_man
547          68   6.2  1.057377     P07658  ecoli_man
1180        133   4.5  1.002179     P19926  ecoli_man
1207        134   6.5  1.002179     P19926  ecoli_man
1223        135   3.5  1.002179     P19926  ecoli_man
1783        230  13.5  0.002390     P21179  ecoli_man


In [30]:
# peek at the E. coli activities after the column rename and source tag
data_act_ecoli.head(2)

Unnamed: 0,exp_index,ph,activity,uniprot_id,note
0,0,7.2,0.012428,P21515,ecoli_man
1,0,7.5,0.090313,P21515,ecoli_man


In [31]:
# stack the three sources into one table with a fresh 0..n-1 index
frames_to_merge = [data_act, data_act_ecoli, data_opt_mean_uniq]
data_act_merged = pd.concat(frames_to_merge, ignore_index=True)
# distinct UniProt IDs, followed by the total number of rows
print(data_act_merged['uniprot_id'].nunique(), len(data_act_merged))
data_act_merged.head(2)

7999 15235


Unnamed: 0,exp_index,ph,activity,note,ph_range_comment,uniprot_id
0,0.0,4.0,0.8,,"pH 4.0: about 80% of maximal activity, pH 7.5...",B2KJ46
1,0.0,4.5,1.0,,,B2KJ46
