In [1]:
import numpy as np 
import pandas as pd 
import itertools
import sys

from sklearn.preprocessing import OneHotEncoder
from sklearn.gaussian_process import GaussianProcessRegressor

# Direct to UCB_opt_tools module 
sys.path.append('../../')
from UCB_opt_tools import UCB_opt
from UCB_opt_tools import UCB_batch_mode


import warnings

warnings.filterwarnings('ignore')

# Load the data

In [2]:
df = pd.read_excel('data/RL08_Relabeled.xlsx', index_col=0)

# Get rid of unnecessary columns
df = df[['Final_Label', 'Block_seq', 'Sum C6-C16', 'AA_seq']]

# Fill NaNs in 'Block_seq' and AA-seq
df['Block_seq'].fillna('-', inplace=True)
df['AA_seq'].fillna('-', inplace=True)


# Rename titer column 
df = df.rename(columns={'Sum C6-C16':'titer', 'Final_Label':'name'})

df.head()

Unnamed: 0,name,Block_seq,titer,AA_seq
0,ATR-23,A-ATBBBAAT,3.623375,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...
1,ATR-23,A-ATBBBAAT,4.160071,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...
2,ATR-23,A-ATBBBAAT,3.672095,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...
3,Empty Vector,-,1.268835,-
4,Empty Vector,-,1.109339,-


# Cleaning and preprocessing

In [3]:
# Average the data
df_avg = df.groupby(by=['name','Block_seq', 'AA_seq']).mean().reset_index()
df_avg

Unnamed: 0,name,Block_seq,AA_seq,titer
0,ATR-01,A-ATAATTBB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,1.528668
1,ATR-02,A-TATTTTAB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.809306
2,ATR-03,A-TTTTBTBA,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.932477
3,ATR-04,A-ATTBAATB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.595051
4,ATR-05,A-ABTATTTA,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,0.917234
...,...,...,...,...
99,MA-ACR (Parent A),A-AAAAAAAA,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,11.079517
100,MB-ACR,B-BBBBBBBB,NYFVTGGTGFIGRFLIAKLLARGAIVHVLVREQSVQKLADLREKLG...,26.477074
101,MT-ACR,T-tTTTTTT,QYFVTGATGFIGKRLVRKLLDRRGSTVHFLLRPESERKLPELLAYW...,4.012505
102,Parent B (Fusion A-B),A-BBBBBBBB,NYFLTGGTGFIGRFLVEKLLARGGTVYVLVREQSQDKLERLRERWG...,36.675911


In [4]:
# Clean up the data frame

df_ucb = df_avg[['name', 'Block_seq', 'titer']]

# Drop out instances that are problematic ()
df_ucb = df_ucb[(~df_ucb['name'].str.contains(r'M[TB]-ACR')) & (~df_ucb['name'].str.contains(r'^Fusion'))].reset_index(drop=True) 

# Reformat the sequence column to make it easier to encode 
df_ucb['seq'] = df_ucb['Block_seq'].str.extract(r'(.)-') + df_ucb['Block_seq'].str.extract(r'-(.+)')
df_ucb.drop('Block_seq', axis=1)
df_ucb = df_ucb.dropna().reset_index(drop=True)
df_ucb

Unnamed: 0,name,Block_seq,titer,seq
0,ATR-01,A-ATAATTBB,1.528668,AATAATTBB
1,ATR-02,A-TATTTTAB,0.809306,ATATTTTAB
2,ATR-03,A-TTTTBTBA,0.932477,ATTTTBTBA
3,ATR-04,A-ATTBAATB,0.595051,AATTBAATB
4,ATR-05,A-ABTATTTA,0.917234,AABTATTTA
...,...,...,...,...
91,ATR-92,A-ABBBAAAB,35.281228,AABBBAAAB
92,ATR-93,A-BTBBATAB,34.566492,ABTBBATAB
93,MA-ACR (Parent A),A-AAAAAAAA,11.079517,AAAAAAAAA
94,Parent B (Fusion A-B),A-BBBBBBBB,36.675911,ABBBBBBBB


# Encode the data 

In [5]:
df_encode = df_ucb['seq'].str.split(r'',expand=True).drop([0,10],axis=1)
df_encode

ohc = OneHotEncoder()

df_ohc = pd.DataFrame(ohc.fit_transform(df_encode).toarray())

# Data Walk (Normal UCB method)


In [6]:
# Specify a set of 3 proteins to use as a starting point (in this case, the three parental enzymes)
df_start = df_ucb[df_ucb['name'].str.contains('Parent')]
df_start_locs = list(df_start.index)

# Get the initial one-hot encodings
X_start = df_ohc.loc[df_start_locs]

# Get the initial experimental measurements
y_start = df_start['titer']

# Set up the initial prediction space 
X_pred_start = df_ohc.drop(df_start_locs)


# Copy the start variables to a variable that can be updated 
X_walk = X_start.copy().to_numpy()
y_walk = list(y_start)
X_pred_walk = X_pred_start.copy()


# Start the walk through 
iters = 0 
y_actual = 0 

# Stop when reaching the actual optimum (since this is a simulated walk through an existing dataset)
y_optimum = df_ucb['titer'].max()

while y_actual < y_optimum and iters <=12:
    # Determine the next point to sample 
    X_pred_opt, y_pred, ind = UCB_opt(X_walk, y_walk, X_pred_walk)

    block_seq = ''.join(ohc.inverse_transform(X_pred_opt.reshape(1, -1))[0])

    # Do an 'experiment' to reveal the true value of that point
    y_actual = float(df_ucb[df_ucb['seq']==block_seq]['titer'])

    # Update X_walk, y_walk, and X_pred_walk
    y_walk.append(y_actual)
    X_walk = np.vstack([X_walk, X_pred_opt])
    X_pred_walk = X_pred_walk.drop(X_pred_walk.iloc[ind].name)

    print(f'Iteration {iters}: {block_seq}, {round(y_actual,1)}')

    iters += 1 



Iteration 0: ABBBBBAAB, 49.1
Iteration 1: ABBBBBATB, 1.4
Iteration 2: ABBBBBTAB, 19.6
Iteration 3: ABBBBAAAB, 38.6
Iteration 4: AABBBBAAB, 45.0
Iteration 5: AABBBAAAB, 35.3
Iteration 6: AABBBBABB, 36.4
Iteration 7: ABABBBAAB, 33.9
Iteration 8: AAABBBAAB, 36.4
Iteration 9: ABTBBBAAB, 54.1
Iteration 10: AATBBBAAB, 45.0
Iteration 11: ABTBBAAAB, 60.1


# Batch mode walkthrough 

In [7]:
df_start = df_ucb[df_ucb['name'].str.contains('Parent')]
df_start_locs = list(df_start.index)

X_start = df_ohc.loc[df_start_locs]
y_start = df_start['titer']
X_pred_start = df_ohc.drop(df_start_locs)


# Copy the start variables to a variable that can be updated 
X_walk = X_start.copy().to_numpy()
y_walk = list(y_start)
X_pred_walk = X_pred_start.copy()

iters = 0 

while iters <= 11: 
    X_pred_opts, y_preds, inds = UCB_batch_mode(X_walk, y_walk, X_pred_walk, batchsize=5)
    block_seqs = [] 
    for x in X_pred_opts:
        block_seqs.append(''.join(ohc.inverse_transform(x.reshape(1, -1))[0]))

    # Update Variables

    X_walk = X_pred_opts

    # Do 'experiments' to unmask 
    y_actuals = [float(df_ucb[df_ucb['seq']==block_seq]['titer']) for block_seq in block_seqs]
    y_walk = y_actuals

    # Remove 
    for ind in list(set(inds)):
        print(ind)
        X_pred_walk = X_pred_walk.drop(X_pred_walk.iloc[ind].name)

    #TODO: FIX the output
    # print(f'Iteration {iters}: {block_seqs[-1]}, {round(y_walk[-1],1)}')


    iters += 1

54
56
58
61
62
56
57
57
62
63
62
72
65
60
71
39
60
64
60
63
62
