In [1]:
import numpy as np 
import pandas as pd 
import itertools
import sys
import seaborn as sns 
import matplotlib.pyplot as plt 

from sklearn.preprocessing import OneHotEncoder
from sklearn.gaussian_process import GaussianProcessRegressor

# Direct to UCB_opt_tools module 
sys.path.append('../')
# from UCB_opt_tools import UCB_opt
# from UCB_opt_tools import UCB_batch_mode

from UCB_opt_tools import GetUCB


import warnings

warnings.filterwarnings('ignore')

# Load the data 

In [2]:
df = pd.read_excel('../Demos/ATR_engineering/data/RL08_Relabeled.xlsx', index_col=0)

# Get rid of unnecessary columns
df = df[['Final_Label', 'Block_seq', 'Sum C6-C16', 'AA_seq']]

# Fill NaNs in 'Block_seq' and AA-seq
df['Block_seq'].fillna('-', inplace=True)
df['AA_seq'].fillna('-', inplace=True)


# Rename titer column 
df = df.rename(columns={'Sum C6-C16':'titer', 'Final_Label':'name'})

df.head()

Unnamed: 0,name,Block_seq,titer,AA_seq
0,ATR-23,A-ATBBBAAT,3.623375,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...
1,ATR-23,A-ATBBBAAT,4.160071,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...
2,ATR-23,A-ATBBBAAT,3.672095,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...
3,Empty Vector,-,1.268835,-
4,Empty Vector,-,1.109339,-


## Cleaning and preprocessing

In [3]:
# Average the data
df_avg = df.groupby(by=['name','Block_seq', 'AA_seq']).mean().reset_index()
df_avg

Unnamed: 0,name,Block_seq,AA_seq,titer
0,ATR-01,A-ATAATTBB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,1.528668
1,ATR-02,A-TATTTTAB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.809306
2,ATR-03,A-TTTTBTBA,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.932477
3,ATR-04,A-ATTBAATB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.595051
4,ATR-05,A-ABTATTTA,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.917234
...,...,...,...,...
99,MA-ACR (Parent A),A-AAAAAAAA,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,11.079517
100,MB-ACR,B-BBBBBBBB,NYFVTGGTGFIGRFLIAKLLARGAIVHVLVREQSVQKLADLREKLG...,26.477074
101,MT-ACR,T-tTTTTTT,QYFVTGATGFIGKRLVRKLLDRRGSTVHFLLRPESERKLPELLAYW...,4.012505
102,Parent B (Fusion A-B),A-BBBBBBBB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,36.675911


In [4]:
# Clean up the data frame

df_ucb = df_avg[['name', 'Block_seq', 'titer']]

# Take the log of the titer
df_ucb['log_titer'] = np.log(df_ucb['titer'])

# Drop out instances that are problematic ()
df_ucb = df_ucb[(~df_ucb['name'].str.contains(r'M[TB]-ACR')) & (~df_ucb['name'].str.contains(r'^Fusion'))].reset_index(drop=True) 

# Reformat the sequence column to make it easier to encode 
df_ucb['seq'] = df_ucb['Block_seq'].str.extract(r'(.)-') + df_ucb['Block_seq'].str.extract(r'-(.+)')
df_ucb.drop('Block_seq', axis=1)
df_ucb = df_ucb.dropna().reset_index(drop=True)
df_ucb

Unnamed: 0,name,Block_seq,titer,log_titer,seq
0,ATR-01,A-ATAATTBB,1.528668,0.424397,AATAATTBB
1,ATR-02,A-TATTTTAB,0.809306,-0.211578,ATATTTTAB
2,ATR-03,A-TTTTBTBA,0.932477,-0.069911,ATTTTBTBA
3,ATR-04,A-ATTBAATB,0.595051,-0.519107,AATTBAATB
4,ATR-05,A-ABTATTTA,0.917234,-0.086393,AABTATTTA
...,...,...,...,...,...
91,ATR-92,A-ABBBAAAB,35.281228,3.563351,AABBBAAAB
92,ATR-93,A-BTBBATAB,34.566492,3.542885,ABTBBATAB
93,MA-ACR (Parent A),A-AAAAAAAA,11.079517,2.405098,AAAAAAAAA
94,Parent B (Fusion A-B),A-BBBBBBBB,36.675911,3.602120,ABBBBBBBB


## Encode the data

In [5]:
df_encode = df_ucb['seq'].str.split(r'',expand=True).drop([0,10],axis=1)
df_encode

ohc = OneHotEncoder()

df_ohc = pd.DataFrame(ohc.fit_transform(df_encode).toarray())

## Test out class

In [6]:
# Specify a set of 3 proteins to use as a starting point (in this case, the three parental enzymes)
df_start = df_ucb[df_ucb['name'].str.contains('Parent')]
df_start_locs = list(df_start.index)

# Get the initial one-hot encodings
X_start = df_ohc.loc[df_start_locs]

# Get the initial experimental measurements
y_start = df_start['log_titer']

# Set up the initial prediction space 
X_pred_start = df_ohc.drop(df_start_locs)

In [7]:
UCB = GetUCB()
UCB.fit(X_start, y_start).transform(X_pred_start)



array([1.03745343, 1.09949942, 1.10137563, 1.03745343, 1.0529476 ,
       1.04837038, 1.0594936 , 1.0529476 , 1.03745343, 1.0529476 ,
       1.0529476 , 1.05732451, 1.03745343, 1.05992387, 1.04837038,
       1.04255349, 1.04255349, 1.03745343, 1.05732451, 1.04837038,
       1.0594936 , 1.04496163, 1.04496163, 1.03745343, 1.04255349,
       1.04496163, 1.09949942, 1.12624742, 1.10137563, 1.13198055,
       1.10137563, 1.0529476 , 1.03745343, 1.04837038, 1.06976391,
       1.04496163, 1.04255349, 1.03745343, 1.12900953, 1.48196794,
       1.32100616, 1.32558633, 1.24925778, 1.32558633, 1.32558633,
       1.32558633, 1.48196794, 1.32558633, 1.32100616, 1.48196794,
       1.48196794, 1.32558633, 1.48196794, 1.32100616, 1.48346008,
       1.32100616, 1.48196794, 1.32100616, 1.48467189, 1.32100616,
       1.32558633, 1.48467189, 1.48467189, 1.32558633, 1.32558633,
       1.48346008, 1.48467189, 1.18477173, 1.18565763, 1.19481503,
       1.19481503, 1.19481503, 1.18565763, 1.18565763, 1.18565

In [8]:
UCB.x_pred_opt

array([1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1.,
       0., 0., 1., 0., 0., 0., 1., 0.])

In [9]:
UCB.y_pred_opt

0.49387506377477075

In [10]:
UCB.opt_ind

58

In [11]:
UCB2 = GetUCB()
UCB2.fit(X_start, y_start)
UCB2.batch_mode(X_pred_start, batch_size=5)

# .fit(X_start, y_start).transform(X_pred_start)

GetUCB()

In [12]:
UCB2.X_batch

[array([1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1.,
        0., 0., 1., 0., 0., 0., 1., 0.]),
 array([1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0.,
        0., 1., 0., 1., 0., 0., 1., 0.]),
 array([1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1.,
        0., 0., 0., 1., 0., 0., 1., 0.]),
 array([1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.,
        0., 1., 0., 1., 0., 0., 1., 0.]),
 array([1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1.,
        0., 0., 0., 1., 0., 0., 1., 0.])]