In [3]:
# Standard library
import itertools
import sys
import warnings

# Third-party
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import OneHotEncoder

# Project-local: extend the import path two directories up so that the
# UCB_opt_tools module can be found, then import its helpers.
sys.path.append('../../')
from UCB_opt_tools import UCB_opt, UCB_batch_mode

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn.
warnings.filterwarnings('ignore')

# Load the data

In [None]:
# Read the relabeled RL08 spreadsheet (first column becomes the index), keep
# only the columns used downstream, fill missing sequences, and rename the
# label/titer columns to short names.
#
# The missing-value fill is done with a single DataFrame.fillna call rather
# than `df['col'].fillna(..., inplace=True)`: the latter is a chained in-place
# assignment on a column selection, which pandas warns about and which stops
# modifying `df` once Copy-on-Write is enabled.
df = (
    pd.read_excel('data/RL08_Relabeled.xlsx', index_col=0)
    # Get rid of unnecessary columns
    .loc[:, ['Final_Label', 'Block_seq', 'Sum C6-C16', 'AA_seq']]
    # Fill NaNs in 'Block_seq' and 'AA_seq' with a '-' placeholder
    # (presumably so these rows are not lost when grouping on these columns)
    .fillna({'Block_seq': '-', 'AA_seq': '-'})
    # Rename titer and label columns
    .rename(columns={'Sum C6-C16': 'titer', 'Final_Label': 'name'})
)

df.head()

# Cleaning and preprocessing

In [None]:
# Collapse repeated measurements: one row per (name, Block_seq, AA_seq)
# combination, holding the mean of the remaining columns.
df_avg = df.groupby(['name', 'Block_seq', 'AA_seq'], as_index=False).mean()
df_avg

In [None]:
# Build the frame used for the UCB walk from the averaged data.
#
# Steps, in order:
#   1. keep only the name, block sequence and titer columns;
#   2. drop problematic entries: names containing 'MT-ACR'/'MB-ACR' and names
#      starting with 'Fusion';
#   3. build 'seq' by concatenating the character just before the first '-'
#      with everything after it (rows without that pattern yield NaN);
#   4. drop the now-redundant 'Block_seq' column (the original call discarded
#      the result of DataFrame.drop, so the column was never removed);
#   5. drop rows with missing values and renumber the index from 0.
df_ucb = (
    df_avg[['name', 'Block_seq', 'titer']]
    .loc[lambda d: ~d['name'].str.contains(r'M[TB]-ACR')
                   & ~d['name'].str.contains(r'^Fusion')]
    .assign(
        seq=lambda d: d['Block_seq'].str.extract(r'(.)-', expand=False)
                      + d['Block_seq'].str.extract(r'-(.+)', expand=False)
    )
    .drop(columns='Block_seq')
    .dropna()
    .reset_index(drop=True)
)
df_ucb

# Encode the data 

In [None]:
# Break every sequence string into one column per character.
# NOTE(review): columns 0 and 10 are presumably the empty strings produced at
# either end when splitting on an empty pattern, which would imply 9-character
# sequences -- confirm against the data.
seq_chars = df_ucb['seq'].str.split(r'', expand=True)
df_encode = seq_chars.drop(columns=[0, 10])
df_encode

# Fit a one-hot encoder on the per-position characters and keep the dense
# encoding as a DataFrame; its rows line up with the rows of df_encode.
ohc = OneHotEncoder()
encoded_sparse = ohc.fit_transform(df_encode)
df_ohc = pd.DataFrame(encoded_sparse.toarray())

# Data Walk (Normal UCB method)


In [None]:
# Simulated UCB walk through the already-measured dataset.
#
# Starting from the parental enzymes, repeatedly ask UCB_opt which unmeasured
# candidate to sample next, "measure" it by looking up its titer in df_ucb,
# and add it to the training data. The walk stops when the best titer in the
# dataset has been sampled, when the iteration budget is used up, or when no
# candidates remain.

# Maximum number of simulated experiments (equivalent to the former `iters <= 12`).
MAX_ITERS = 13

# Starting point: the rows whose name contains 'Parent' (the parental enzymes)
df_start = df_ucb[df_ucb['name'].str.contains('Parent')]
df_start_locs = list(df_start.index)

# Get the initial one-hot encodings
X_start = df_ohc.loc[df_start_locs]

# Get the initial experimental measurements
y_start = df_start['titer']

# Set up the initial prediction space (everything that is not a starting point)
X_pred_start = df_ohc.drop(df_start_locs)

# Copy the start variables to variables that can be updated during the walk
X_walk = X_start.copy().to_numpy()
y_walk = list(y_start)
X_pred_walk = X_pred_start.copy()

# Start the walk through
iters = 0
y_actual = 0

# Stop when reaching the actual optimum (since this is a simulated walk through an existing dataset)
y_optimum = df_ucb['titer'].max()

while y_actual < y_optimum and iters < MAX_ITERS and len(X_pred_walk) > 0:
    # Determine the next point to sample
    X_pred_opt, y_pred, ind = UCB_opt(X_walk, y_walk, X_pred_walk)

    # Decode the chosen one-hot row back into its sequence string
    block_seq = ''.join(ohc.inverse_transform(X_pred_opt.reshape(1, -1))[0])

    # Do an 'experiment' to reveal the true value of that point.
    # Take the first matching row explicitly: calling float() on a Series is
    # deprecated in pandas and fails outright when zero or several rows match.
    titer_matches = df_ucb.loc[df_ucb['seq'] == block_seq, 'titer']
    if titer_matches.empty:
        raise LookupError(f'No measurement found for sequence {block_seq!r}')
    y_actual = float(titer_matches.iloc[0])

    # Update X_walk, y_walk, and X_pred_walk
    y_walk.append(y_actual)
    X_walk = np.vstack([X_walk, X_pred_opt])
    # Remove the sampled candidate (row at position `ind`) from the prediction space
    X_pred_walk = X_pred_walk.drop(X_pred_walk.iloc[ind].name)

    print(f'Iteration {iters}: {block_seq}, {round(y_actual,1)}')

    iters += 1

