## Setting

In [None]:
import numpy as np
import pandas as pd
import os
import math
import time
import re
from collections import Counter, defaultdict
from einops import rearrange, repeat
from scipy.stats import spearmanr
from tqdm.auto import tqdm
from Bio import Seq

In [None]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
import matplotlib as mpl
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

# Register every font file found in the project's Helvetica folder so that
# matplotlib can resolve the 'Helvetica' family requested below.
font_files = fm.findSystemFonts(fontpaths='/home/jupyter/ADAPT_PCR_share/safe/resources/Helvetica')
for font_file in font_files:
    fm.fontManager.addfont(font_file)

# Global figure style: Helvetica, no top/right spines, small square figures.
mpl.rcParams['font.sans-serif'] = 'Helvetica'
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['figure.figsize'] = (2.5,2.5)
# These three must be assignments: `rcParams[key]: value` is only a variable
# annotation and leaves the setting untouched.
mpl.rcParams['axes.labelsize'] = 10
mpl.rcParams['xtick.labelsize'] = 10
mpl.rcParams['ytick.labelsize'] = 10

In [None]:
# Output directory for figures; used as a plain string prefix (SAVEPATH + filename),
# which is why it keeps its trailing slash.
SAVEPATH = '/home/jupyter/ADAPT_PCR_share/safe/results/'

## Load datasets

In [None]:
# The three splits live side by side in one folder; the first two CSV columns
# are read back as a two-level row index.
_dataset_dir = '/home/jupyter/ADAPT_PCR_share/safe/dataset/'
train_df = pd.read_csv(_dataset_dir + '0717_dataset_train.csv', index_col=[0, 1])
valid_df = pd.read_csv(_dataset_dir + '0717_dataset_valid.csv', index_col=[0, 1])
test_df = pd.read_csv(_dataset_dir + '0717_dataset_test.csv', index_col=[0, 1])

# Quick sanity check on split sizes, then preview the first training row.
print(train_df.shape, valid_df.shape, test_df.shape)
train_df.head(1)

In [None]:
# Pool the 'score' column of all three splits (train, then test, then valid)
# into one flat Python list for the histogram below.
vs = []
for _split in (train_df, test_df, valid_df):
    vs.extend(_split['score'].tolist())
print(len(vs))

In [None]:
# Bar positions every 0.05 from 0 to 1.10; each bin is centred on its bar, and
# the last edge is pushed out to 2 so the final bin also collects larger scores.
xs = np.arange(0,1.11,.05)
bins = list(xs-.025) + [2]
ys, _edges = np.histogram(vs, bins=bins)

plt.bar(xs, ys, width=.05, color='gray')
plt.xlabel('Activity score')
plt.ylabel('PCR reactions')
plt.savefig(SAVEPATH + '0828_hist.png', bbox_inches='tight', dpi=500)

## Feature-based regressors

In [None]:
# Columns of the dataset used as hand-crafted features, and the shorter names
# they are given in the feature frames.
fs = ['f_length','f_Tm','f_GC','f_indel','f_mm','r_length','r_Tm','r_GC','r_indel','r_mm','prod_length','prod_Tm']
newfs = ['f_len','f_Tm','f_GC','f_indel','f_mm','r_len','r_Tm','r_GC','r_indel','r_mm','prod_len','prod_Tm']

# Rename every split the same way: the scaler remembers the column names it was
# fitted on, so the validation/test frames must carry identical names.
# `rename` returns a new frame, so the source DataFrames are left untouched.
_feature_names = dict(zip(fs, newfs))
train_feats = train_df[fs].rename(columns=_feature_names)
valid_feats = valid_df[fs].rename(columns=_feature_names)
test_feats = test_df[fs].rename(columns=_feature_names)
print(train_feats.shape, valid_feats.shape, test_feats.shape)

# Fit the standardisation on the training split only, then apply the same
# transform to validation and test so no held-out statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_feats)
X_val = scaler.transform(valid_feats)
X_test = scaler.transform(test_feats)

# Regression targets for each split.
y_train = train_df['score'].values
y_val = valid_df['score'].values
y_test = test_df['score'].values

### Random Forest Regressor

In [None]:
# Random-forest baseline on the scaled feature matrix (100 trees, fixed seed).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict both held-out splits, validation first.
rf_val_pred, rf_test_pred = (rf_model.predict(split) for split in (X_val, X_test))

rf_val_r2, rf_test_r2 = r2_score(y_val, rf_val_pred), r2_score(y_test, rf_test_pred)

print("Random Forest:")
print(f"  Validation R2: {rf_val_r2:.4f}")
print(f"  Test R2: {rf_test_r2:.4f}")

In [None]:
# One bar per feature, labelled with the original dataset column names.
_rf_positions = range(len(fs))
plt.figure(figsize=(4, 1.8))
plt.bar(_rf_positions, rf_model.feature_importances_)
plt.xticks(_rf_positions, fs, rotation=90)
plt.ylabel('Feature importance')

### SVR

In [None]:
# Support-vector regression baseline with an RBF kernel.
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr_model.fit(X_train, y_train)

# Predict both held-out splits, validation first.
svr_val_pred, svr_test_pred = (svr_model.predict(split) for split in (X_val, X_test))

svr_val_r2, svr_test_r2 = r2_score(y_val, svr_val_pred), r2_score(y_test, svr_test_pred)

print("SVR:")
print(f"  Validation R2: {svr_val_r2:.4f}")
print(f"  Test R2: {svr_test_r2:.4f}")

### MLP

In [None]:
# Multi-layer perceptron baseline: one hidden layer of 100 units, fixed seed.
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

# Predict both held-out splits, validation first.
mlp_val_pred, mlp_test_pred = (mlp_model.predict(split) for split in (X_val, X_test))

mlp_val_r2, mlp_test_r2 = r2_score(y_val, mlp_val_pred), r2_score(y_test, mlp_test_pred)

print("MLPRegressor:")
print(f"  Validation R2: {mlp_val_r2:.4f}")
print(f"  Test R2: {mlp_test_r2:.4f}")

### Linear

In [None]:
# Ordinary least-squares baseline.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict both held-out splits, validation first.
lr_val_pred, lr_test_pred = (lr_model.predict(split) for split in (X_val, X_test))

lr_val_r2, lr_test_r2 = r2_score(y_val, lr_val_pred), r2_score(y_test, lr_test_pred)

print("Linear Regression:")
print(f"  Validation R2: {lr_val_r2:.4f}")
print(f"  Test R2: {lr_test_r2:.4f}")

### Ridge

In [None]:
# L2-regularised linear baseline; `alpha` sets the regularisation strength.
ridge_model = Ridge(alpha=1000)
ridge_model.fit(X_train, y_train)

# Predict both held-out splits, validation first.
ridge_val_pred, ridge_test_pred = (ridge_model.predict(split) for split in (X_val, X_test))

ridge_val_r2, ridge_test_r2 = r2_score(y_val, ridge_val_pred), r2_score(y_test, ridge_test_pred)

print("Ridge Regression:")
print(f"  Validation R2: {ridge_val_r2:.4f}")
print(f"  Test R2: {ridge_test_r2:.4f}")

### GBR

In [None]:
# Gradient-boosted trees baseline (100 stages, learning rate 0.1, fixed seed).
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gbr_model.fit(X_train, y_train)

# Predict both held-out splits, validation first.
gbr_val_pred, gbr_test_pred = (gbr_model.predict(split) for split in (X_val, X_test))

gbr_val_r2, gbr_test_r2 = r2_score(y_val, gbr_val_pred), r2_score(y_test, gbr_test_pred)

print("Gradient Boosting Regressor:")
print(f"  Validation R2: {gbr_val_r2:.4f}")
print(f"  Test R2: {gbr_test_r2:.4f}")

In [None]:
# One bar per feature, labelled with the original dataset column names.
_gbr_positions = range(len(fs))
plt.figure(figsize=(4, 1.8))
plt.bar(_gbr_positions, gbr_model.feature_importances_)
plt.xticks(_gbr_positions, fs, rotation=90)
plt.ylabel('Feature importance')

### All models

In [None]:
# Test-set predictions of every feature-based baseline, in display order.
all_labels = ['Linear','Ridge','RF','SVR','MLP','GBR']
all_preds = [ lr_test_pred, ridge_test_pred, rf_test_pred, svr_test_pred, mlp_test_pred, gbr_test_pred ]

# One metrics record per model. RMSE is taken as the square root of the MSE
# rather than via `squared=False`, which newer scikit-learn releases no longer accept.
_metric_rows = []
for y_pred in all_preds:
    _metric_rows.append({
        'R2': r2_score(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
    })

# Building the frame in one go keeps the metric columns numeric instead of object dtype.
tbl = pd.DataFrame(_metric_rows, index=all_labels, columns=['R2','MAE','RMSE'])
tbl

In [None]:
# Placeholder values for the result-log columns that do not apply to the
# feature-based baselines (no learning rate, weight decay, epochs or timing).
lr = 0.
wd = 0.
otherhp = 'na'
ds = 'Test'
enc = '12fs'
epoch = 1
t = '%.4f'%0

# Append one tab-separated row per baseline, using the same column order as the
# rows written by the sequence-model training loops.
# NOTE(review): OUTFILE is not defined in any cell visible here — confirm it is
# set before this cell runs.
with open(OUTFILE,'a') as out:
    for label,y_pred in zip(all_labels,all_preds):
        r2 = '%.4f'%r2_score(y_test, y_pred)
        mae = '%.4f'%mean_absolute_error(y_test, y_pred)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
        rmse = '%.4f'%np.sqrt(mean_squared_error(y_test, y_pred))
        row = [label, lr, wd, otherhp, ds, enc, epoch, r2, mae, rmse, t]
        out.write('\t'.join(map(str,row))+'\n')

In [None]:
# IPython shell escape: print the first lines of the results log written above.
!head $OUTFILE

## Sequence-based models

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from einops import rearrange, repeat
from tqdm.auto import tqdm
# Seed PyTorch's global random number generator.
torch.manual_seed(42)
# Use the GPU when one is available, otherwise run on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
device

In [None]:
def one_hot_encode(seq, length=28):
    """One-hot encode a nucleotide string into an array with 5 channels per position.

    Channels are, in order, A, T, C, G and the gap character '-'. 'N' maps to
    the all-zero vector. Characters are upper-cased one by one before lookup.

    The sequence is padded on the RIGHT with 'N' up to `length`
    (e.g. 'ATCG' with length 6 becomes 'ATCGNN'). Sequences longer than
    `length` are not truncated, so the result then has len(seq) rows.

    Raises KeyError for any character outside A/T/C/G/N/-.
    """
    channel_of = {
        'A': [1, 0, 0, 0, 0],
        'T': [0, 1, 0, 0, 0],
        'C': [0, 0, 1, 0, 0],
        'G': [0, 0, 0, 1, 0],
        'N': [0, 0, 0, 0, 0],
        '-': [0, 0, 0, 0, 1],
    }
    padded = seq.ljust(length, 'N')
    rows = []
    for base in padded:
        rows.append(channel_of[base.upper()])
    return np.array(rows)

def one_hot_encode_full_gap(df_seqs, maxl=1421):
    """Encode each row as a full-length target track plus a primer track.

    For every row, the forward and reverse primer encodings ('f_penc', 'r_penc')
    are placed at 'f_start' / 'r_start' on an otherwise all-'N' string, and the
    matching stretches of 'target_seq' are replaced by 'f_tenc' / 'r_tenc'.
    Both strings are one-hot encoded with right padding to `maxl` and joined
    along the channel axis, target channels first.

    df_seqs must be indexed by 2-tuples (the loop unpacks each index entry).
    # NOTE(review): the *_penc / *_tenc strings presumably hold gap-aligned
    # primer/target pairs — confirm against the dataset builder.

    Prints the shape of the stacked array and returns it as a float32 tensor.
    """
    target_encoded, primer_encoded = [], []
    for (_target_name, _primer_name), row in df_seqs.iterrows():
        fseq, fst, rseq, rst, tseq = row[['f_seq','f_start','r_seq','r_start','target_seq']]
        fenc, ftenc, renc, rtenc = row[['f_penc','f_tenc','r_penc','r_tenc']]
        # End coordinates of the two binding sites, measured with the raw primer lengths.
        f_end = fst + len(fseq)
        r_end = rst + len(rseq)
        primer_track = ''.join(['N'*fst, fenc, 'N'*(rst-f_end), renc, 'N'*(len(tseq)-r_end)])
        target_track = ''.join([tseq[:fst], ftenc, tseq[f_end:rst], rtenc, tseq[r_end:]])
        primer_encoded.append(one_hot_encode(primer_track, maxl))
        target_encoded.append(one_hot_encode(target_track, maxl))
    stacked = np.concatenate((np.array(target_encoded), np.array(primer_encoded)), axis=2)
    print(stacked.shape)
    return torch.tensor(stacked, dtype=torch.float32)

def one_hot_encode_pbs_gap(df_seqs):
    """Encode only the primer/target binding-site strings of each row.

    The four columns 'f_penc', 'f_tenc', 'r_penc' and 'r_tenc' are one-hot
    encoded with `one_hot_encode`'s default length. Forward and reverse
    encodings are stacked along the position axis, then the target and primer
    stacks are joined along the channel axis (target channels first).

    df_seqs must be indexed by 2-tuples (the loop unpacks each index entry).

    Prints the shape of the stacked array and returns it as a float32 tensor.
    """
    primer_rows, target_rows = [], []
    for (_target_name, _primer_name), row in df_seqs.iterrows():
        fwd_primer = one_hot_encode(row['f_penc'])
        fwd_target = one_hot_encode(row['f_tenc'])
        rev_primer = one_hot_encode(row['r_penc'])
        rev_target = one_hot_encode(row['r_tenc'])
        primer_rows.append(np.concatenate((fwd_primer, rev_primer), axis=0))
        target_rows.append(np.concatenate((fwd_target, rev_target), axis=0))
    stacked = np.concatenate((np.array(target_rows), np.array(primer_rows)), axis=2)
    print(stacked.shape)
    return torch.tensor(stacked, dtype=torch.float32)


In [None]:
class PcrDataset(Dataset):
    """Pairs each encoded input with its regression target for a DataLoader.

    `encoded_input` and `ct_values` are indexed in parallel; the dataset length
    is taken from `encoded_input`. In this notebook `ct_values` receives the
    'score' column of a split.
    """

    def __init__(self, encoded_input, ct_values):
        self.encoded_input, self.ct_values = encoded_input, ct_values

    def __len__(self):
        n_samples = len(self.encoded_input)
        return n_samples

    def __getitem__(self, idx):
        features = self.encoded_input[idx]
        target = self.ct_values[idx]
        return features, target

In [None]:
# Re-encode each split as a full-length one-hot tensor (train, validation, test
# in that order). This replaces the X_* feature matrices used by the baselines.
X_train, X_val, X_test = (
    one_hot_encode_full_gap(split) for split in (train_df, valid_df, test_df)
)

# Regression targets for each split.
y_train, y_val, y_test = (
    split['score'].values for split in (train_df, valid_df, test_df)
)

data_train = PcrDataset(X_train, y_train)
data_val = PcrDataset(X_val, y_val)
data_test = PcrDataset(X_test, y_test)

# Only the training loader shuffles; the evaluation loaders keep row order.
loader_train = DataLoader(data_train, shuffle=True, batch_size=64)
loader_val = DataLoader(data_val, shuffle=False, batch_size=64)
loader_test = DataLoader(data_test, shuffle=False, batch_size=64)

In [None]:
# Settings shared by every sequence-model sweep below.
num_epochs = 50
# 5 target channels + 5 primer channels per position (see the one-hot encoders).
input_dim = 10
output_dim = 1
# Grids explored for the AdamW learning rate and weight decay.
learning_rates = [10 ** -exponent for exponent in (3, 4)]
weight_decays = [10 ** -exponent for exponent in (2, 3)]

### Lyra

In [None]:
class PGC(nn.Module):
    def __init__(self,model_dim,expansion_factor = 1.0,dropout = 0.0):
        super().__init__()
        self.model_dim = model_dim
        self.expansion_factor = expansion_factor
        self.dropout = dropout
        self.conv = nn.Conv1d(int(model_dim * expansion_factor), int(model_dim * expansion_factor),
                              kernel_size=3, padding=1, groups=int(model_dim * expansion_factor))
        self.in_proj = nn.Linear(model_dim, int(model_dim * expansion_factor * 2))
        self.out_norm = nn.RMSNorm(int(model_dim), eps=1e-8)
        self.in_norm = nn.RMSNorm(int(model_dim * expansion_factor * 2), eps=1e-8)
        self.out_proj = nn.Linear(int(model_dim * expansion_factor), model_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, u):
        xv = self.in_norm(self.in_proj(u))
        x,v = xv.chunk(2,dim=-1)
        x_conv = self.conv(x.transpose(-1,-2)).transpose(-1,-2)
        gate =  v * x_conv
        x = self.out_norm(self.out_proj(gate))
        return x
    
class DropoutNd(nn.Module):
    """Dropout whose mask can be shared ("tied") across all trailing axes."""

    def __init__(self, p: float = 0.5, tie=True, transposed=True):
        """
        p: drop probability, must lie in [0, 1)
        tie: tie dropout mask across sequence lengths (Dropout1d/2d/3d)
        transposed: True when the input is already (batch, dim, lengths...);
            otherwise the feature axis is moved from last to second position
            for masking and moved back afterwards
        """
        super().__init__()
        if p < 0 or p >= 1:
            raise ValueError("dropout probability has to be in [0, 1), " "but got {}".format(p))
        self.p = p
        self.tie = tie
        self.transposed = transposed
        # Not used by forward(), which samples its mask with torch.rand instead.
        self.binomial = torch.distributions.binomial.Binomial(probs=1-self.p)

    def forward(self, X):
        """X: (batch, dim, lengths...)."""
        # Evaluation mode is the identity.
        if not self.training:
            return X
        if not self.transposed: X = rearrange(X, 'b ... d -> b d ...')
        if self.tie:
            # One keep/drop decision per (batch, dim) pair, broadcast over the rest.
            mask_shape = X.shape[:2] + (1,)*(X.ndim-2)
        else:
            mask_shape = X.shape
        # Sampled directly on X's device rather than through self.binomial.
        keep = torch.rand(*mask_shape, device=X.device) < 1.-self.p
        # Rescale the survivors so the expected activation is unchanged.
        X = X * keep * (1.0/(1-self.p))
        if not self.transposed: X = rearrange(X, 'b d ... -> b ... d')
        return X

class S4DKernel(nn.Module):
    """Generate convolution kernel from diagonal SSM parameters."""

    def __init__(self, model_dim, N=64, dt_min=0.001, dt_max=0.1, lr=None):
        super().__init__()
        H = model_dim
        half_state = N // 2

        # Step sizes are drawn log-uniformly between dt_min and dt_max, one per channel.
        log_lo = math.log(dt_min)
        log_hi = math.log(dt_max)
        log_dt = torch.rand(H) * (log_hi - log_lo) + log_lo

        # Complex C is stored as a real tensor with a trailing (real, imag) axis.
        C = torch.randn(H, half_state, dtype=torch.cfloat)
        self.C = nn.Parameter(torch.view_as_real(C))
        self.register("log_dt", log_dt, lr)

        # Diagonal state matrix A = -exp(log_A_real) + i * A_imag,
        # initialised to -0.5 + i * pi * n for n = 0 .. N/2 - 1.
        log_A_real = torch.log(0.5 * torch.ones(H, half_state))
        A_imag = math.pi * repeat(torch.arange(half_state), 'n -> h n', h=H)
        self.register("log_A_real", log_A_real, lr)
        self.register("A_imag", A_imag, lr)

    def forward(self, L):
        """
        returns: (..., c, L) where c is number of channels (default 1)
        """
        # Rebuild the complex-valued quantities from their stored real parameters.
        step = torch.exp(self.log_dt) # (H)
        C = torch.view_as_complex(self.C) # (H N)
        A = -torch.exp(self.log_A_real) + 1j * self.A_imag # (H N)

        # Vandermonde multiplication: exp(dt * A * l) for every position l in [0, L).
        dtA = A * step.unsqueeze(-1)  # (H N)
        positions = torch.arange(L, device=A.device)
        exponents = dtA.unsqueeze(-1) * positions # (H N L)
        C = C * (torch.exp(dtA)-1.) / A
        kernel = 2 * torch.einsum('hn, hnl -> hl', C, torch.exp(exponents)).real

        return kernel

    def register(self, name, tensor, lr=None):
        """Register a tensor with a configurable learning rate and 0 weight decay"""
        # lr == 0.0 means "frozen": keep the tensor as a non-trainable buffer.
        if lr == 0.0:
            self.register_buffer(name, tensor)
            return

        self.register_parameter(name, nn.Parameter(tensor))
        # Attach per-parameter optimizer hints to the parameter itself; they only
        # take effect if the code that builds the optimizer reads `_optim`.
        hints = {"weight_decay": 0.0}
        if lr is not None:
            hints["lr"] = lr
        getattr(self, name)._optim = hints


class S4D(nn.Module):
    """Diagonal state-space layer: FFT long convolution, skip term, GELU, GLU mixing.

    model_dim: number of channels H
    state_dim: state size N handed to S4DKernel
    dropout: drop probability; 0 disables dropout entirely
    transposed: True if inputs are (B, H, L); False if they are (B, L, H)
    kernel_args: forwarded unchanged to S4DKernel
    """

    def __init__(self, model_dim, state_dim=64, dropout=0.0, transposed=True, **kernel_args):
        super().__init__()

        self.h = model_dim
        self.n = state_dim
        self.output_dim = self.h
        self.transposed = transposed

        # Per-channel weight of the direct input-to-output (skip) path.
        self.D = nn.Parameter(torch.randn(self.h))
        # SSM Kernel
        self.kernel = S4DKernel(self.h, N=self.n, **kernel_args)
        # Pointwise
        self.activation = nn.GELU()
        dropout_fn = DropoutNd
        if dropout > 0.0:
            self.dropout = dropout_fn(dropout)
        else:
            self.dropout = nn.Identity()

        # position-wise output transform to mix features:
        # a 1x1 convolution doubles the channels and the GLU halves them again.
        self.output_linear = nn.Sequential(
            nn.Conv1d(self.h, 2*self.h, kernel_size=1),
            nn.GLU(dim=-2),
        )

    def forward(self, u, **kwargs): # absorbs return_output and transformer src mask
        """ Input and output shape (B, H, L) """
        if not self.transposed: u = u.transpose(-1, -2)
        seq_len = u.size(-1)
        # Compute SSM Kernel
        kernel = self.kernel(L=seq_len) # (H L)

        # Convolution via FFT; transforming at length 2L and keeping the first
        # L outputs makes the convolution linear rather than circular.
        fft_len = 2*seq_len
        kernel_f = torch.fft.rfft(kernel, n=fft_len)
        u_f = torch.fft.rfft(u, n=fft_len)
        y = torch.fft.irfft(u_f*kernel_f, n=fft_len)[..., :seq_len] # (B H L)

        # Compute D term in state space equation - essentially a skip connection
        y = y + u * self.D.unsqueeze(-1)

        y = self.activation(y)
        y = self.dropout(y)
        y = self.output_linear(y)
        if not self.transposed: y = y.transpose(-1, -2)
        return y
    
class Janus(nn.Module):
    """Sequence regressor: linear encoder, two PGC blocks, a residual S4D layer, mean pooling.

    input_dim: channels per sequence position
    output_dim: size of the prediction vector
    model_dim: internal feature width
    state_dim, dropout, transposed, kernel_args: passed on to S4D
        (`dropout` is also used for both PGC blocks and for the dropout applied
        to the S4D output)

    forward() takes (batch, length, input_dim) and returns (batch, output_dim).
    """

    def __init__(self, input_dim, output_dim, model_dim, state_dim=64, dropout=0.2, transposed=False, **kernel_args):
        super().__init__()
        self.encoder = nn.Linear(input_dim, model_dim)
        # First block narrows to a quarter of the width, the second doubles it.
        self.pgc1 = PGC(model_dim, expansion_factor=0.25, dropout=dropout)
        self.pgc2 = PGC(model_dim, expansion_factor=2, dropout=dropout)
        self.s4d = S4D(model_dim, state_dim=state_dim, dropout=dropout, transposed=transposed, **kernel_args)
        self.norm = nn.RMSNorm(model_dim)
        self.decoder = nn.Linear(model_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, u):
        hidden = self.pgc2(self.pgc1(self.encoder(u)))
        # Pre-norm residual: S4D sees the normalised features, the skip path
        # carries the un-normalised ones.
        ssm_out = self.s4d(self.norm(hidden))
        hidden = self.dropout(ssm_out) + hidden
        # Average over the length axis to get one vector per sequence.
        pooled = hidden.mean(dim=1)
        return self.decoder(pooled)

In [None]:
# Model widths explored for the Lyra sweep.
model_dims = [64, 128]
# Full Cartesian grid: learning rate varies slowest, model width fastest.
conds = [
    (rate, decay, width)
    for rate in learning_rates
    for decay in weight_decays
    for width in model_dims
]
print(len(conds))

In [None]:
# Hyper-parameter sweep for the Lyra (Janus) model. For each configuration:
# train for num_epochs, log validation metrics after every epoch, keep the
# weights of the epoch with the best validation R2, then score the test split.
# NOTE(review): OUTFILE is not defined in any cell visible here — confirm it is
# set before this cell runs.
# NOTE(review): rows are tagged 'PBS' although the loaders above are built with
# one_hot_encode_full_gap — confirm the encoding label.
# NOTE(review): S4DKernel.register attaches per-parameter `_optim` hints that
# this single AdamW parameter group does not read — confirm that is intended.
for lr, wd, model_dim in conds:
    model = Janus(input_dim=input_dim, output_dim=output_dim, model_dim=model_dim).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    best_state = None
    best_r2 = float('-inf')
    otherhp = 'model_dim=%i'%model_dim
    ## Training
    for epoch in tqdm(range(num_epochs), desc='LR %s; WD %s; MD %s, Epoch'%(lr,wd,model_dim)):
        start = time.time()
        model.train()
        for inputs, labels in loader_train:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            # Drop only the trailing output axis: a bare squeeze() would also
            # remove the batch axis when the last batch holds a single sample.
            loss = criterion(outputs.squeeze(-1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_time = (time.time() - start)

        ## Validation
        model.eval()
        cross_true, cross_pred = [], []
        with torch.no_grad():
            for inputs, labels in loader_val:
                inputs, labels = inputs.to(device).float(), labels.to(device).float()
                outputs = model(inputs)
                cross_true.append(labels.detach().cpu().numpy())
                cross_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
        cross_true = np.concatenate(cross_true)
        cross_pred = np.concatenate(cross_pred)

        epoch_r2 = r2_score(cross_true, cross_pred)

        mae = '%.4f'%mean_absolute_error(cross_true, cross_pred)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
        rmse = '%.4f'%np.sqrt(mean_squared_error(cross_true, cross_pred))
        row = ['Lyra', lr, wd, otherhp, 'Val', 'PBS', epoch, '%.4f'%epoch_r2, mae, rmse, '%.4f'%train_time]
        with open(OUTFILE,'a') as out:
            out.write('\t'.join(map(str,row))+'\n')

        if epoch_r2 > best_r2:
            best_r2 = epoch_r2
            # state_dict() hands back references to the live tensors, which later
            # optimizer steps overwrite in place; clone to freeze this epoch's weights.
            best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

    ## Test
    # best_state stays None only if no epoch produced a comparable R2 (e.g. NaN);
    # in that case the final weights are evaluated.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    test_true = []
    test_pred = []
    start = time.time()
    with torch.no_grad():
        for inputs, labels in loader_test:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            test_true.append(labels.detach().cpu().numpy())
            test_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)

    infer_time = (time.time() - start)
    val_r2 = best_r2
    test_r2 = r2_score(test_true, test_pred)

    mae = '%.4f'%mean_absolute_error(test_true, test_pred)
    rmse = '%.4f'%np.sqrt(mean_squared_error(test_true, test_pred))
    row = ['Lyra', lr, wd, otherhp, 'Test', 'PBS', 1, '%.4f'%test_r2, mae, rmse, '%.4f'%infer_time]
    with open(OUTFILE,'a') as out:
        out.write('\t'.join(map(str,row))+'\n')

    print("Lyra; LR %s; WD %s; MD %s"%(lr,wd,model_dim))
    print(f"  Validation R2: {val_r2:.4f}")
    print(f"  Test R2: {test_r2:.4f}")

### CNN

In [None]:
# adapted from Green et al. 2022

class CNN(nn.Module):
    """1-D convolutional regressor over one-hot encoded sequences.

    input_dim: channels per sequence position
    input_len: sequence length; used once at construction to size the first
        dense layer, so every input must have exactly this length
    lin_dim: width of the two hidden dense layers
    output_dim: number of regression outputs

    forward() takes (batch, input_len, input_dim) and returns (batch, output_dim).
    """

    def __init__(self, input_dim, input_len, lin_dim, output_dim):
        super(CNN, self).__init__()

        # Convolutions operate on (batch, input_dim, input_len).
        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=64, kernel_size=4)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=12)
        self.pool1 = nn.MaxPool1d(kernel_size=3)

        self.conv3 = nn.Conv1d(64, 32, kernel_size=3)
        self.conv4 = nn.Conv1d(32, 32, kernel_size=3)
        self.pool2 = nn.MaxPool1d(kernel_size=3)

        self.flatten = nn.Flatten()

        # Push a dummy input through the convolutional stack to find out how
        # many features reach the first dense layer.
        dummy = torch.zeros(1, input_dim, input_len)
        with torch.no_grad():
            dummy_out = self._forward_conv(dummy)
            flat_size = dummy_out.numel()

        self.dense1 = nn.Linear(flat_size, lin_dim)
        self.dense2 = nn.Linear(lin_dim, lin_dim)
        # Honour output_dim instead of a hard-coded single output; no final
        # activation because this is a regression head.
        self.output = nn.Linear(lin_dim, output_dim)

    def _forward_conv(self, x):
        """Run the two conv-conv-pool stages on a channels-first tensor."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool1(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool2(x)
        return x

    def forward(self, x):
        # (batch, length, channels) -> (batch, channels, length) for Conv1d.
        x = x.permute(0, 2, 1)
        x = self._forward_conv(x)
        x = self.flatten(x)
        x = F.relu(self.dense1(x))
        x = F.relu(self.dense2(x))
        return self.output(x)

In [None]:
# Sequence length the CNN is sized for; equals the default `maxl` of
# one_hot_encode_full_gap.
input_len = 1421
# Dense-layer widths explored for the CNN sweep.
lin_dims = [64, 128]
# Full Cartesian grid: learning rate varies slowest, dense width fastest.
conds = [
    (rate, decay, width)
    for rate in learning_rates
    for decay in weight_decays
    for width in lin_dims
]
print(len(conds))

In [None]:
# Hyper-parameter sweep for the CNN. For each configuration: train for
# num_epochs, log validation metrics after every epoch, keep the weights of the
# epoch with the best validation R2, then score the test split.
# NOTE(review): OUTFILE is not defined in any cell visible here — confirm it is
# set before this cell runs.
# NOTE(review): rows are tagged 'PBS' although input_len is set to the
# full-length value 1421 — confirm the encoding label.
for lr, wd, lin_dim in conds:
    model = CNN(input_dim=input_dim, input_len=input_len, lin_dim=lin_dim, output_dim=output_dim).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    best_state = None
    best_r2 = float('-inf')
    otherhp = 'linear_dim=%i'%lin_dim
    ## Training
    for epoch in tqdm(range(num_epochs), desc='LR %s; WD %s; LD %s, Epoch'%(lr,wd,lin_dim)):
        start = time.time()
        model.train()
        for inputs, labels in loader_train:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            # Drop only the trailing output axis: a bare squeeze() would also
            # remove the batch axis when the last batch holds a single sample.
            loss = criterion(outputs.squeeze(-1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_time = (time.time() - start)

        ## Validation
        model.eval()
        cross_true, cross_pred = [], []
        with torch.no_grad():
            for inputs, labels in loader_val:
                inputs, labels = inputs.to(device).float(), labels.to(device).float()
                outputs = model(inputs)
                cross_true.append(labels.detach().cpu().numpy())
                cross_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
        cross_true = np.concatenate(cross_true)
        cross_pred = np.concatenate(cross_pred)

        epoch_r2 = r2_score(cross_true, cross_pred)

        mae = '%.4f'%mean_absolute_error(cross_true, cross_pred)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
        rmse = '%.4f'%np.sqrt(mean_squared_error(cross_true, cross_pred))
        row = ['CNN', lr, wd, otherhp, 'Val', 'PBS', epoch, '%.4f'%epoch_r2, mae, rmse, '%.4f'%train_time]
        with open(OUTFILE,'a') as out:
            out.write('\t'.join(map(str,row))+'\n')

        if epoch_r2 > best_r2:
            best_r2 = epoch_r2
            # state_dict() hands back references to the live tensors, which later
            # optimizer steps overwrite in place; clone to freeze this epoch's weights.
            best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

    ## Test
    # best_state stays None only if no epoch produced a comparable R2 (e.g. NaN);
    # in that case the final weights are evaluated.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    test_true = []
    test_pred = []
    start = time.time()
    with torch.no_grad():
        for inputs, labels in loader_test:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            test_true.append(labels.detach().cpu().numpy())
            test_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)

    infer_time = (time.time() - start)
    val_r2 = best_r2
    test_r2 = r2_score(test_true, test_pred)

    mae = '%.4f'%mean_absolute_error(test_true, test_pred)
    rmse = '%.4f'%np.sqrt(mean_squared_error(test_true, test_pred))
    row = ['CNN', lr, wd, otherhp, 'Test', 'PBS', 1, '%.4f'%test_r2, mae, rmse, '%.4f'%infer_time]
    with open(OUTFILE,'a') as out:
        out.write('\t'.join(map(str,row))+'\n')

    print("CNN; LR %s; WD %s; LD %s"%(lr,wd,lin_dim))
    print(f"  Validation R2: {val_r2:.4f}")
    print(f"  Test R2: {test_r2:.4f}")

### LSTM

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM regressor that predicts from the final time step.

    input_size: channels per sequence position
    output_size: size of the prediction vector
    hidden_size: width of each LSTM layer
    num_layers: number of stacked LSTM layers

    forward() takes (batch, seq_len, input_size) and returns (batch, output_size).
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x: (batch, seq_len, input_size)
        state_shape = (self.num_layers, x.size(0), self.hidden_size)

        # Start every sequence from an all-zero hidden and cell state.
        h0 = torch.zeros(*state_shape).to(x.device)
        c0 = torch.zeros(*state_shape).to(x.device)

        hidden_seq, _ = self.lstm(x, (h0, c0))  # (batch, seq_len, hidden_size)
        # Only the last time step is used for the prediction.
        last_step = hidden_seq[:, -1, :]
        return self.fc(last_step)

In [None]:
# LSTM widths and depths explored in the sweep.
hidden_sizes = [64, 128]
nums_layers = [1, 2]
# Full Cartesian grid: learning rate varies slowest, layer count fastest.
conds = [
    (rate, decay, width, depth)
    for rate in learning_rates
    for decay in weight_decays
    for width in hidden_sizes
    for depth in nums_layers
]
print(len(conds))

In [None]:
# Hyper-parameter sweep for the LSTM. For each configuration: train for
# num_epochs, log validation metrics after every epoch, keep the weights of the
# epoch with the best validation R2, then score the test split.
# NOTE(review): OUTFILE is not defined in any cell visible here — confirm it is
# set before this cell runs.
# NOTE(review): rows are tagged 'PBS' although the loaders above are built with
# one_hot_encode_full_gap — confirm the encoding label.
for lr, wd, hidden_size, num_layers in conds:
    model = LSTM(input_size=input_dim, output_size=output_dim, hidden_size=hidden_size, num_layers=num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    best_state = None
    best_r2 = float('-inf')
    otherhp = 'hidden_size=%i;num_layers=%i'%(hidden_size,num_layers)
    ## Training
    for epoch in tqdm(range(num_epochs), desc='LR %s; WD %s; HS %s; NL %s, Epoch'%(lr,wd,hidden_size,num_layers)):
        start = time.time()
        model.train()
        for inputs, labels in loader_train:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            # Drop only the trailing output axis: a bare squeeze() would also
            # remove the batch axis when the last batch holds a single sample.
            loss = criterion(outputs.squeeze(-1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_time = (time.time() - start)

        ## Validation
        model.eval()
        cross_true, cross_pred = [], []
        with torch.no_grad():
            for inputs, labels in loader_val:
                inputs, labels = inputs.to(device).float(), labels.to(device).float()
                outputs = model(inputs)
                cross_true.append(labels.detach().cpu().numpy())
                cross_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
        cross_true = np.concatenate(cross_true)
        cross_pred = np.concatenate(cross_pred)

        epoch_r2 = r2_score(cross_true, cross_pred)

        mae = '%.4f'%mean_absolute_error(cross_true, cross_pred)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
        rmse = '%.4f'%np.sqrt(mean_squared_error(cross_true, cross_pred))
        row = ['LSTM', lr, wd, otherhp, 'Val', 'PBS', epoch, '%.4f'%epoch_r2, mae, rmse, '%.4f'%train_time]
        with open(OUTFILE,'a') as out:
            out.write('\t'.join(map(str,row))+'\n')

        if epoch_r2 > best_r2:
            best_r2 = epoch_r2
            # state_dict() hands back references to the live tensors, which later
            # optimizer steps overwrite in place; clone to freeze this epoch's weights.
            best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

    ## Test
    # best_state stays None only if no epoch produced a comparable R2 (e.g. NaN);
    # in that case the final weights are evaluated.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    test_true = []
    test_pred = []
    start = time.time()
    with torch.no_grad():
        for inputs, labels in loader_test:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            test_true.append(labels.detach().cpu().numpy())
            test_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)

    infer_time = (time.time() - start)
    val_r2 = best_r2
    test_r2 = r2_score(test_true, test_pred)

    mae = '%.4f'%mean_absolute_error(test_true, test_pred)
    rmse = '%.4f'%np.sqrt(mean_squared_error(test_true, test_pred))
    row = ['LSTM', lr, wd, otherhp, 'Test', 'PBS', 1, '%.4f'%test_r2, mae, rmse, '%.4f'%infer_time]
    with open(OUTFILE,'a') as out:
        out.write('\t'.join(map(str,row))+'\n')

    print("LSTM; LR %s; WD %s; HS %s; NL %s"%(lr,wd,hidden_size,num_layers))
    print(f"  Validation R2: {val_r2:.4f}")
    print(f"  Test R2: {test_r2:.4f}")

### Transformer

In [None]:
class Transformer(nn.Module):
    """Transformer-encoder regressor over a batch-first sequence input.

    Every position is linearly embedded, sinusoidal positional encodings are
    added, the encoder stack is applied and the encoded sequence is mean-pooled
    before the final linear layer.

    Args:
        input_dim: Size of the last input axis (features per position).
        model_dim: Embedding / encoder width.
        num_heads: Number of attention heads per encoder layer.
        output_dim: Number of values predicted per sample.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability inside the encoder layers.
        max_len: Longest sequence for which positional encodings are built.
    """

    def __init__(self, input_dim, model_dim, num_heads, output_dim, num_layers=1, dropout=.1, max_len=1500):
        super(Transformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        # Non-persistent buffer: it follows model.to(device) but stays out of
        # state_dict(), so checkpoint keys are the same as before.
        self.register_buffer('positional_encoding',
                             self._generate_positional_encoding(model_dim, max_len),
                             persistent=False)

        encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads,
                                                   dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(model_dim, output_dim)

    def _generate_positional_encoding(self, d_model, max_len):
        """Return the sinusoidal position table with shape [max_len, 1, d_model]."""
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                             * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        return pe.unsqueeze(1)  # [max_len, 1, d_model]

    def forward(self, x):
        """Map x of shape [batch, seq_len, input_dim] to [batch, output_dim]."""
        x = self.embedding(x)  # [batch, seq_len, model_dim]
        seq_len = x.size(1)
        if seq_len > self.positional_encoding.size(0):
            raise ValueError('sequence length %i exceeds max_len %i'
                             % (seq_len, self.positional_encoding.size(0)))
        # The encoder layers are built with batch_first=True, so the tensor must
        # stay [batch, seq_len, model_dim]; permuting to sequence-first made
        # self-attention run across the samples of a batch instead of across
        # the positions of one sequence.
        pos = self.positional_encoding[:seq_len].transpose(0, 1)  # [1, seq_len, model_dim]
        x = x + pos.to(x.device)
        x = self.transformer(x)
        x = x.mean(dim=1)  # average over positions -> [batch, model_dim]
        return self.output_layer(x)

In [None]:
# Transformer-specific grid axes; they are crossed with the learning rates and
# weight decays that are defined outside this cell.
model_dims = [64]
nums_heads = [4]

conds = [
    (rate, decay, width, heads)
    for rate in learning_rates
    for decay in weight_decays
    for width in model_dims
    for heads in nums_heads
]
print(len(conds))

In [None]:
# Grid search for the Transformer: for every condition train for num_epochs,
# append the validation metrics of each epoch to OUTFILE, then evaluate the
# checkpoint with the best validation R2 on the test loader.
for lr, wd, model_dim, num_heads in conds:
    model = Transformer(input_dim=input_dim, model_dim=model_dim, num_heads=num_heads,
                        output_dim=output_dim).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    best_state = None
    best_r2 = float('-inf')
    otherhp = 'model_dim=%i;nheads=%i'%(model_dim,num_heads)
    ## Training
    for epoch in tqdm(range(num_epochs),
                      desc='LR %s; WD %s; MD %s; NH %s; Epoch'%(lr,wd,model_dim,num_heads)):
        start = time.time()
        model.train()
        for inputs, labels in loader_train:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            # squeeze(-1) drops only the output axis; a bare squeeze() would also
            # drop the batch axis when the last batch holds a single sample.
            loss = criterion(outputs.squeeze(-1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_time = (time.time() - start)

        model.eval()
        cross_true, cross_pred = [], []
        with torch.no_grad():
            for inputs, labels in loader_val:
                inputs, labels = inputs.to(device).float(), labels.to(device).float()
                outputs = model(inputs)
                cross_true.append(labels.detach().cpu().numpy())
                cross_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
        cross_true = np.concatenate(cross_true)
        cross_pred = np.concatenate(cross_pred)

        epoch_r2 = r2_score(cross_true, cross_pred)
        #print('%i\t%.3f'%(epoch,epoch_r2))

        mae = '%.4f'%mean_absolute_error(cross_true, cross_pred)
        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed.
        rmse = '%.4f'%np.sqrt(mean_squared_error(cross_true, cross_pred))
        row = ['Transformer', lr, wd, otherhp, 'Val', 'PBS', epoch, '%.4f'%epoch_r2, mae, rmse, '%.4f'%train_time]
        with open(OUTFILE,'a') as out:
            out.write('\t'.join(map(str,row))+'\n')

        if epoch_r2 > best_r2:
            best_r2 = epoch_r2
            # state_dict() hands out references to the live parameters, which the
            # optimizer keeps updating in place; clone them to freeze this epoch.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}

    ## Test
    # best_state stays None when no epoch produced a comparable R2 (e.g. NaN).
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    test_true = []
    test_pred = []
    start = time.time()
    with torch.no_grad():
        for inputs, labels in loader_test:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            outputs = model(inputs)
            test_true.append(labels.detach().cpu().numpy())
            test_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)

    infer_time = (time.time() - start)
    val_r2 = best_r2
    test_r2 = r2_score(test_true, test_pred)

    mae = '%.4f'%mean_absolute_error(test_true, test_pred)
    rmse = '%.4f'%np.sqrt(mean_squared_error(test_true, test_pred))
    row = ['Transformer', lr, wd, otherhp, 'Test', 'PBS', 1, '%.4f'%test_r2, mae, rmse, '%.4f'%infer_time]
    with open(OUTFILE,'a') as out:
        out.write('\t'.join(map(str,row))+'\n')

    print("Transformer; LR %s; WD %s; MD %s; NH %s"%(lr,wd,model_dim,num_heads))
    print(f"  Validation R2: {val_r2:.4f}")
    print(f"  Test R2: {test_r2:.4f}")

### Hyperparameter selection

In [None]:
# Settings for the model-selection cells: validation rows from epochs before
# min_epoch are ignored, and outcols lists the columns kept in the summaries.
min_epoch = 20
outfile, outfile_full = (
    '/home/jupyter/ADAPT_PCR_share/safe/results/0721_regression_performances.csv',
    '/home/jupyter/ADAPT_PCR_share/safe/results/0721_regression_performances_full.csv',
)
outcols = [
    'R2',
    'MAE',
    'RMSE',
    'Training time',
    'Time',
]

In [None]:
# Load the results table at `outfile` and keep, for every hyperparameter
# configuration, its best validation row among epochs >= min_epoch.
results_pbs = pd.read_table(outfile)
_pbs_val_rows = results_pbs['Dataset'].eq('Val') & results_pbs['Epoch'].ge(min_epoch)
res_top_pbs = results_pbs.loc[_pbs_val_rows].sort_values('R2', ascending=False)
res_top_pbs = res_top_pbs.drop_duplicates(
    subset=['Architecture', 'LR', 'WD', 'Other_HP'],
    keep='first',
)


In [None]:
# Display `sub` (notebook output only).
# NOTE(review): in the visible part of the notebook `sub` is first assigned in
# the following cell; unless it is defined earlier, this cell raises NameError
# when the notebook is run top to bottom.
sub

In [None]:
# Which Other_HP strings were logged for the Transformer runs?
_is_transformer = results_pbs['Architecture'].eq('Transformer')
sub = results_pbs.loc[_is_transformer]
set(sub['Other_HP'].tolist())

In [None]:
# Same selection as for the `outfile` table, applied to `outfile_full`: best
# validation row (epochs >= min_epoch) per hyperparameter configuration.
results_full = pd.read_table(outfile_full)
_full_val_rows = results_full['Dataset'].eq('Val') & results_full['Epoch'].ge(min_epoch)
res_top_full = (
    results_full.loc[_full_val_rows]
    .sort_values('R2', ascending=False)
    .drop_duplicates(subset=['Architecture', 'LR', 'WD', 'Other_HP'], keep='first')
)
# top3_per_arch = ( res_top_full
#                   .groupby('Architecture', group_keys=False)
#                   .apply(lambda df: df.nlargest(3, 'R2')) )
# top3_per_arch

In [None]:
# Test rows of the models logged with Other_HP == 'na', labelled as the
# 'Features' input and given a training time of 0.
_simple_rows = results['Dataset'].eq('Test') & results['Other_HP'].eq('na')
selected_simple = results.loc[_simple_rows].copy()
selected_simple.loc[:, 'Input'] = 'Features'
selected_simple.loc[:, 'Training time'] = 0
selected_simple = selected_simple.set_index(['Input', 'Architecture'])
selected_simple = selected_simple[outcols]
selected_simple.head(1)

In [None]:
# For each architecture take its best validation configuration from
# res_top_full, fetch the matching test row(s) and attach the mean per-epoch
# 'Time' of that configuration's validation rows as the training time.
selected_full = []
r_top = res_top_full.drop_duplicates('Architecture').set_index('Architecture')
for arch, row in r_top.reindex(['CNN','Transformer','LSTM','Lyra']).iterrows():
    # First three columns after the index: LR, WD, Other_HP.
    lr, wd, other = row.values[:3]
    arch_rows = results_full[results_full['Architecture'] == arch]
    sub_val = arch_rows[arch_rows['Dataset'] == 'Val']
    sub_test = arch_rows[arch_rows['Dataset'] == 'Test']

    val_hit = (sub_val['LR'] == lr) & (sub_val['WD'] == wd) & (sub_val['Other_HP'] == other)
    test_hit = (sub_test['LR'] == lr) & (sub_test['WD'] == wd) & (sub_test['Other_HP'] == other)

    select = sub_test[test_hit].copy()
    t_train = sub_val.loc[val_hit, 'Time'].mean()
    select.loc[:, 'Training time'] = '%.4f' % t_train
    select.loc[:, 'Input'] = 'Full sequence'
    selected_full.append(select)

selected_full = pd.concat(selected_full).set_index(['Input', 'Architecture'])
selected_full = selected_full[outcols]
selected_full.head(1)

In [None]:
# Per architecture: best validation configuration from res_top_pbs, its test
# row(s), and the mean validation-row 'Time' as the training time.
selected_pbs = []
r_top = res_top_pbs.drop_duplicates('Architecture').set_index('Architecture')
for arch, row in r_top.reindex(['CNN','Transformer','LSTM','Lyra']).iterrows():
    # First three columns after the index: LR, WD, Other_HP.
    lr, wd, other = row.values[:3]
    print(other, lr, wd)
    arch_rows = results_pbs[results_pbs['Architecture'] == arch]
    sub_val = arch_rows[arch_rows['Dataset'] == 'Val']
    sub_test = arch_rows[arch_rows['Dataset'] == 'Test']

    val_hit = (sub_val['LR'] == lr) & (sub_val['WD'] == wd) & (sub_val['Other_HP'] == other)
    test_hit = (sub_test['LR'] == lr) & (sub_test['WD'] == wd) & (sub_test['Other_HP'] == other)

    select = sub_test[test_hit].copy()
    t_train = sub_val.loc[val_hit, 'Time'].mean()
    select.loc[:, 'Training time'] = '%.4f' % t_train
    select.loc[:, 'Input'] = 'PBS only'
    selected_pbs.append(select)

selected_pbs = pd.concat(selected_pbs).set_index(['Input', 'Architecture'])
selected_pbs = selected_pbs[outcols]
selected_pbs.head(1)

In [None]:
# Display the full 'PBS only' selection table (notebook output only).
selected_pbs

In [None]:
# Stack the 'Features', 'Full sequence' and 'PBS only' selections into one
# table indexed by (Input, Architecture).
selected = pd.concat([selected_simple, selected_full, selected_pbs])
selected

## Hybrid model

In [None]:
class MLP(nn.Module):
    """Single-hidden-layer perceptron: Linear -> ReLU -> Linear.

    Note the argument order: (input_dim, output_dim, hidden_dim).
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to x and return the result."""
        network = self.model
        return network(x)

In [None]:
class CombinedModel(nn.Module):
    """Two-branch model: an MLP branch and a `DL` branch fused by a small MLP.

    Args:
        DL: Module class for the second branch, instantiated as ``DL(*dl_dims)``.
        mlp_dims: Positional arguments for ``MLP``; index 1 is its output width.
        dl_dims: Positional arguments for ``DL``; index 1 is read as its output width.
        combined_hidden: Hidden width of the fusion network.
        final_output: Output width of the fusion network.
    """

    def __init__(self, DL, mlp_dims, dl_dims, combined_hidden, final_output):
        super().__init__()

        # The two branches.
        self.mlp = MLP(*mlp_dims)
        self.dl = DL(*dl_dims)

        # Fusion network over the concatenated branch outputs.
        fused_width = mlp_dims[1] + dl_dims[1]
        fusion_layers = [
            nn.Linear(fused_width, combined_hidden),
            nn.ReLU(),
            nn.Linear(combined_hidden, final_output),
        ]
        self.combiner = nn.Sequential(*fusion_layers)

    def forward(self, mlp_input, dl_input):
        """Run both branches, concatenate along dim 1 and return the fused prediction."""
        branch_outputs = (self.mlp(mlp_input), self.dl(dl_input))
        return self.combiner(torch.cat(branch_outputs, dim=1))

In [None]:
class PcrDataset(Dataset):
    """Index-aligned triples of (encoded input, custom features, score)."""

    def __init__(self, encoded_input, custom_features, scores):
        self.encoded_input, self.custom_features, self.scores = (
            encoded_input, custom_features, scores)

    def __len__(self):
        # The encoded inputs define the dataset length.
        return len(self.encoded_input)

    def __getitem__(self, idx):
        sequence = self.encoded_input[idx]
        features = self.custom_features[idx]
        score = self.scores[idx]
        return sequence, features, score

In [None]:
# Inputs for the hybrid model: encoded sequences from one_hot_encode_pbs_gap,
# standardized feature tables and the 'score' targets for every split.
seqs_train, seqs_val, seqs_test = (
    one_hot_encode_pbs_gap(frame) for frame in (train_df, valid_df, test_df)
)

# The scaler is fitted on the training features only and reused for the rest.
scaler = StandardScaler()
feats_train = scaler.fit_transform(train_feats)
feats_val, feats_test = scaler.transform(valid_feats), scaler.transform(test_feats)

y_train, y_val, y_test = (
    frame['score'].values for frame in (train_df, valid_df, test_df)
)

data_train = PcrDataset(seqs_train, feats_train, y_train)
data_val = PcrDataset(seqs_val, feats_val, y_val)
data_test = PcrDataset(seqs_test, feats_test, y_test)

# Only the training loader shuffles.
loader_train = DataLoader(data_train, batch_size=64, shuffle=True)
loader_val, loader_test = (
    DataLoader(split, batch_size=64, shuffle=False) for split in (data_val, data_test)
)

In [None]:
# Results file for the hybrid (sequence + feature) models and its column layout.
OUTFILE = '%s/0721_regression_performances_combined.csv'%SAVEPATH
cols = ['Architecture', 'LR', 'WD', 'Other_HP', 'Dataset', 'Encoding', 'Epoch', 'R2', 'MAE', 'RMSE', 'Time']
# Write the header only when the file does not exist yet. Mode 'x' refuses to
# touch an existing file, so earlier results are never truncated, while a fresh
# file no longer starts with a data row that pd.read_table would take as header.
try:
    with open(OUTFILE,'x') as out:
        out.write('\t'.join(cols)+'\n')
except FileExistsError:
    pass

### Lyra + MLP

In [None]:
# Hyperparameter grid for the Lyra + MLP hybrid: fixed learning rate, crossed
# over the Lyra width, the MLP hidden width, the combiner width and weight decay.
lyra_inp_dim, lyra_out_dim = input_dim, 4
lyra_mod_dim = [64, 128]

mlp_inp_dim, mlp_out_dim = train_feats.shape[1], 4
mlp_hid_dims = [64, 128]

com_hid_dims = [16, 32]
final_out_dim = output_dim

lr = .001
wds = [0, .01]

conds = [
    (lyra_width, mlp_width, comb_width, decay)
    for lyra_width in lyra_mod_dim
    for mlp_width in mlp_hid_dims
    for comb_width in com_hid_dims
    for decay in wds
]
print(len(conds))

In [None]:
# Grid search for the Lyra + MLP hybrid: train every condition, keep the
# checkpoint with the best validation R2, score it on the test loader and save
# the model. Appending metric rows to OUTFILE is currently disabled here.
# The loop variable is `lyra_dim` (not `lyra_mod_dim`) so the grid list named
# `lyra_mod_dim` is not overwritten by a single value while looping.
for lyra_dim, mlp_hid_dim, com_hid_dim, wd in conds:
    model = CombinedModel(DL=Janus,
                          mlp_dims=(mlp_inp_dim, mlp_out_dim, mlp_hid_dim),
                          dl_dims=(lyra_inp_dim, lyra_out_dim, lyra_dim),
                          combined_hidden=com_hid_dim, final_output=final_out_dim).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    best_state = None
    best_r2 = float('-inf')
    otherhp = 'lyra_mod_dim=%i;mlp_hid_dim=%i;com_hid_dim=%i'%(lyra_dim,mlp_hid_dim,com_hid_dim)
    ## Training
    for epoch in tqdm(range(num_epochs), desc='MLP_hidden %s; Comb_hidden %s; WD %s, Epoch'%\
                      (mlp_hid_dim,com_hid_dim,wd)):
        start = time.time()
        model.train()
        for inputs, mlp_inputs, labels in loader_train:
            inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
            outputs = model(mlp_inputs, inputs)
            # squeeze(-1) drops only the output axis; a bare squeeze() would also
            # drop the batch axis when the last batch holds a single sample.
            loss = criterion(outputs.squeeze(-1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_time = (time.time() - start)

        model.eval()
        cross_true, cross_pred = [], []
        with torch.no_grad():
            for inputs, mlp_inputs, labels in loader_val:
                inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
                outputs = model(mlp_inputs, inputs)
                cross_true.append(labels.detach().cpu().numpy())
                cross_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
        cross_true = np.concatenate(cross_true)
        cross_pred = np.concatenate(cross_pred)

        epoch_r2 = r2_score(cross_true, cross_pred)
        #print('%i\t%.3f'%(epoch,epoch_r2))

        mae = '%.4f'%mean_absolute_error(cross_true, cross_pred)
        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed.
        rmse = '%.4f'%np.sqrt(mean_squared_error(cross_true, cross_pred))
        row = ['Lyra+MLP', lr, wd, otherhp, 'Val', 'PBS', epoch, '%.4f'%epoch_r2, mae, rmse, '%.4f'%train_time]
        #with open(OUTFILE,'a') as out:
        #    out.write('\t'.join(map(str,row))+'\n')

        if epoch_r2 > best_r2:
            best_r2 = epoch_r2
            # state_dict() hands out references to the live parameters, which the
            # optimizer keeps updating in place; clone them to freeze this epoch.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}

    ## Test
    # best_state stays None when no epoch produced a comparable R2 (e.g. NaN).
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    test_true = []
    test_pred = []
    start = time.time()
    with torch.no_grad():
        for inputs, mlp_inputs, labels in loader_test:
            inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
            outputs = model(mlp_inputs, inputs)
            test_true.append(labels.detach().cpu().numpy())
            test_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)

    infer_time = (time.time() - start)
    val_r2 = best_r2
    test_r2 = r2_score(test_true, test_pred)

    mae = '%.4f'%mean_absolute_error(test_true, test_pred)
    rmse = '%.4f'%np.sqrt(mean_squared_error(test_true, test_pred))
    row = ['Lyra+MLP', lr, wd, otherhp, 'Test', 'PBS', 1, '%.4f'%test_r2, mae, rmse, '%.4f'%infer_time]
    #with open(OUTFILE,'a') as out:
    #    out.write('\t'.join(map(str,row))+'\n')
    # One saved model per condition, named after its hyperparameters.
    mname = ';'.join(map(str,['Lyra+MLP', lr, wd, otherhp]))
    modelout = '/home/jupyter/ADAPT_PCR_share/safe/dataset/%s.pth' % mname
    torch.save(model, modelout)
    print("Lyra+MLP; MLP_hidden %s; Comb_hidden %s; WD %s"%(mlp_hid_dim,com_hid_dim,wd))
    print(f"  Validation R2: {val_r2:.4f}")
    print(f"  Test R2: {test_r2:.4f}")

### LSTM + MLP

In [None]:
# Hyperparameter grid for the LSTM + MLP hybrid: fixed learning rate, crossed
# over MLP hidden width, combiner width, LSTM hidden width, LSTM depth and
# weight decay.
lstm_inp_dim, lstm_out_dim = input_dim, 4
lstm_hid_dims = [64, 128]
lstm_nums_layers = [1, 4]

mlp_inp_dim, mlp_out_dim = train_feats.shape[1], 4
mlp_hid_dims = [64, 128]

com_hid_dims = [16, 32]
final_out_dim = output_dim

lr = .001
wds = [0, .01]

conds = [
    (mlp_width, comb_width, lstm_width, lstm_depth, decay)
    for mlp_width in mlp_hid_dims
    for comb_width in com_hid_dims
    for lstm_width in lstm_hid_dims
    for lstm_depth in lstm_nums_layers
    for decay in wds
]
print(len(conds))

In [None]:
# Grid search for the LSTM + MLP hybrid: for every condition train for
# num_epochs, append the validation metrics of each epoch to OUTFILE, then
# evaluate the checkpoint with the best validation R2 on the test loader.
for mlp_hid_dim, com_hid_dim, lstm_hid_dim, lstm_num_layers, wd in conds:
    model = CombinedModel(DL=LSTM,
                          mlp_dims=(mlp_inp_dim, mlp_out_dim, mlp_hid_dim),
                          dl_dims=(lstm_inp_dim, lstm_out_dim, lstm_hid_dim, lstm_num_layers),
                          combined_hidden=com_hid_dim, final_output=final_out_dim).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    best_state = None
    best_r2 = float('-inf')
    otherhp = 'mlp_hid_dim=%i;com_hid_dim=%i;lstm_hid_dim=%i;lstm_num_layers=%i'%\
                (mlp_hid_dim,com_hid_dim,lstm_hid_dim,lstm_num_layers)
    ## Training
    for epoch in tqdm(range(num_epochs), desc='MLP_hidden %s; Comb_hidden %s; WD %s; LSTM %s %s, Epoch'%\
                      (mlp_hid_dim,com_hid_dim,wd,lstm_hid_dim,lstm_num_layers)):
        start = time.time()
        model.train()
        for inputs, mlp_inputs, labels in loader_train:
            inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
            outputs = model(mlp_inputs, inputs)
            # squeeze(-1) drops only the output axis; a bare squeeze() would also
            # drop the batch axis when the last batch holds a single sample.
            loss = criterion(outputs.squeeze(-1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_time = (time.time() - start)

        model.eval()
        cross_true, cross_pred = [], []
        with torch.no_grad():
            for inputs, mlp_inputs, labels in loader_val:
                inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
                outputs = model(mlp_inputs, inputs)
                cross_true.append(labels.detach().cpu().numpy())
                cross_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
        cross_true = np.concatenate(cross_true)
        cross_pred = np.concatenate(cross_pred)

        epoch_r2 = r2_score(cross_true, cross_pred)
        #print('%i\t%.3f'%(epoch,epoch_r2))

        mae = '%.4f'%mean_absolute_error(cross_true, cross_pred)
        # np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed.
        rmse = '%.4f'%np.sqrt(mean_squared_error(cross_true, cross_pred))
        row = ['LSTM+MLP', lr, wd, otherhp, 'Val', 'PBS', epoch, '%.4f'%epoch_r2, mae, rmse, '%.4f'%train_time]
        with open(OUTFILE,'a') as out:
            out.write('\t'.join(map(str,row))+'\n')

        if epoch_r2 > best_r2:
            best_r2 = epoch_r2
            # state_dict() hands out references to the live parameters, which the
            # optimizer keeps updating in place; clone them to freeze this epoch.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}

    ## Test
    # best_state stays None when no epoch produced a comparable R2 (e.g. NaN).
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    test_true = []
    test_pred = []
    start = time.time()
    with torch.no_grad():
        for inputs, mlp_inputs, labels in loader_test:
            inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
            outputs = model(mlp_inputs, inputs)
            test_true.append(labels.detach().cpu().numpy())
            test_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)

    infer_time = (time.time() - start)
    val_r2 = best_r2
    test_r2 = r2_score(test_true, test_pred)

    mae = '%.4f'%mean_absolute_error(test_true, test_pred)
    rmse = '%.4f'%np.sqrt(mean_squared_error(test_true, test_pred))
    row = ['LSTM+MLP', lr, wd, otherhp, 'Test', 'PBS', 1, '%.4f'%test_r2, mae, rmse, '%.4f'%infer_time]
    with open(OUTFILE,'a') as out:
        out.write('\t'.join(map(str,row))+'\n')

    print("LSTM+MLP; MLP_hidden %s; Comb_hidden %s; WD %s; LSTM %s %s"%\
          (mlp_hid_dim,com_hid_dim,wd,lstm_hid_dim,lstm_num_layers))
    print(f"  Validation R2: {val_r2:.4f}")
    print(f"  Test R2: {test_r2:.4f}")

### Select

In [None]:
# Hybrid-model results: best validation row (epochs >= min_epoch) for every
# hyperparameter configuration, then the three best configurations per
# architecture. Note that this rebinds the notebook-level `outfile`.
outfile = '%s/0721_regression_performances_combined.csv'%SAVEPATH
results_com = pd.read_table(outfile)
_com_val_rows = results_com['Dataset'].eq('Val') & results_com['Epoch'].ge(min_epoch)
res_top_com = (
    results_com.loc[_com_val_rows]
    .sort_values('R2', ascending=False)
    .drop_duplicates(subset=['Architecture', 'LR', 'WD', 'Other_HP'], keep='first')
)
_by_arch = res_top_com.groupby('Architecture', group_keys=False)
top3_per_arch = _by_arch.apply(lambda grp: grp.nlargest(3, 'R2'))
top3_per_arch

In [None]:
# Per hybrid architecture: best validation configuration from res_top_com, its
# test row(s), and the mean validation-row 'Time' as the training time.
selected_com = []
r_top = res_top_com.drop_duplicates('Architecture').set_index('Architecture')
for arch, row in r_top.iterrows():
    # First three columns after the index: LR, WD, Other_HP.
    lr, wd, other = row.values[:3]
    arch_rows = results_com[results_com['Architecture'] == arch]
    sub_val = arch_rows[arch_rows['Dataset'] == 'Val']
    sub_test = arch_rows[arch_rows['Dataset'] == 'Test']

    val_hit = (sub_val['LR'] == lr) & (sub_val['WD'] == wd) & (sub_val['Other_HP'] == other)
    test_hit = (sub_test['LR'] == lr) & (sub_test['WD'] == wd) & (sub_test['Other_HP'] == other)

    select = sub_test[test_hit].copy()
    t_train = sub_val.loc[val_hit, 'Time'].mean()
    select.loc[:, 'Training time'] = '%.4f' % t_train
    select.loc[:, 'Input'] = 'PBS & Features'
    selected_com.append(select)

selected_com = pd.concat(selected_com).set_index(['Input', 'Architecture'])
selected_com = selected_com[outcols]
selected_com

## Combine all

In [None]:
# Append the hybrid-model rows to the earlier selections and relabel the columns.
# The relabel is positional, so it relies on the column order given by `outcols`;
# the last column ('Time') becomes the inference-time column.
# NOTE(review): 'Inferece time' looks like a misspelling of 'Inference time'. The
# name ends up in the header of the CSV exported from this table, so confirm that
# no consumer depends on the current spelling before renaming it.
selected_all = pd.concat([selected,selected_com])
selected_all.columns = ['R2','MAE','RMSE','Training time','Inferece time']
selected_all

In [None]:
# Export the combined selection table; the path is built by plain string
# concatenation with SAVEPATH.
selected_all.to_csv(SAVEPATH+'0721_regression_model_selection.csv')

## Save the best model

In [None]:
# Rebuild the hybrid-model inputs (and refit the scaler) before training the
# model that is going to be saved.
_frames = (train_df, valid_df, test_df)
seqs_train, seqs_val, seqs_test = [one_hot_encode_pbs_gap(frame) for frame in _frames]

# Standardize the features with statistics from the training split only.
scaler = StandardScaler()
feats_train = scaler.fit_transform(train_feats)
feats_val = scaler.transform(valid_feats)
feats_test = scaler.transform(test_feats)

y_train, y_val, y_test = [frame['score'].values for frame in _frames]

data_train, data_val, data_test = (
    PcrDataset(seqs_train, feats_train, y_train),
    PcrDataset(seqs_val, feats_val, y_val),
    PcrDataset(seqs_test, feats_test, y_test),
)

# Only the training loader shuffles.
loader_train, loader_val, loader_test = (
    DataLoader(data_train, batch_size=64, shuffle=True),
    DataLoader(data_val, batch_size=64, shuffle=False),
    DataLoader(data_test, batch_size=64, shuffle=False),
)

In [None]:
# Reload the hybrid-model results and show the three best Lyra+MLP
# configurations by validation R2 (epochs >= min_epoch).
outfile = '%s/0721_regression_performances_combined.csv'%SAVEPATH
results_com = pd.read_table(outfile)
_com_val_rows = results_com['Dataset'].eq('Val') & results_com['Epoch'].ge(min_epoch)
res_top_com = (
    results_com.loc[_com_val_rows]
    .sort_values('R2', ascending=False)
    .drop_duplicates(subset=['Architecture', 'LR', 'WD', 'Other_HP'], keep='first')
)
_by_arch = res_top_com.groupby('Architecture', group_keys=False)
top3_per_arch = _by_arch.apply(lambda grp: grp.nlargest(3, 'R2'))
top3_per_arch[top3_per_arch['Architecture']=='Lyra+MLP']

In [None]:
# Hyperparameters of the Lyra + MLP model that is retrained and saved below.
lr, wd = .001, .01

# Sequence branch.
lyra_inp_dim, lyra_out_dim, lyra_mod_dim = input_dim, 4, 64

# Feature branch.
mlp_inp_dim, mlp_hid_dim, mlp_out_dim = train_feats.shape[1], 64, 4

# Combiner and final output.
com_hid_dim = 16
final_out_dim = output_dim

In [None]:
# Train the selected Lyra + MLP configuration, keep the checkpoint with the
# best validation R2 and report its test R2.
model = CombinedModel(DL=Janus,
                      mlp_dims=(mlp_inp_dim, mlp_out_dim, mlp_hid_dim),
                      dl_dims=(lyra_inp_dim, lyra_out_dim, lyra_mod_dim),
                      combined_hidden=com_hid_dim, final_output=final_out_dim).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
best_state = None
best_r2 = float('-inf')
## Training
for epoch in tqdm(range(num_epochs), desc='MLP_hidden %s; Comb_hidden %s; WD %s, Epoch'%\
                  (mlp_hid_dim,com_hid_dim,wd)):
    start = time.time()
    model.train()
    for inputs, mlp_inputs, labels in loader_train:
        inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
        outputs = model(mlp_inputs, inputs)
        # squeeze(-1) drops only the output axis; a bare squeeze() would also
        # drop the batch axis when the last batch holds a single sample.
        loss = criterion(outputs.squeeze(-1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_time = (time.time() - start)

    model.eval()
    cross_true, cross_pred = [], []
    with torch.no_grad():
        for inputs, mlp_inputs, labels in loader_val:
            inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
            outputs = model(mlp_inputs, inputs)
            cross_true.append(labels.detach().cpu().numpy())
            cross_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
    cross_true = np.concatenate(cross_true)
    cross_pred = np.concatenate(cross_pred)

    epoch_r2 = r2_score(cross_true, cross_pred)
    print('%i\t%.3f'%(epoch,epoch_r2))

    if epoch_r2 > best_r2:
        best_r2 = epoch_r2
        # state_dict() hands out references to the live parameters, which the
        # optimizer keeps updating in place; clone them to freeze this epoch.
        best_state = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}

## Test
# best_state stays None when no epoch produced a comparable R2 (e.g. NaN).
if best_state is not None:
    model.load_state_dict(best_state)
model.eval()
test_true = []
test_pred = []
start = time.time()
with torch.no_grad():
    for inputs, mlp_inputs, labels in loader_test:
        inputs, mlp_inputs, labels = inputs.to(device).float(), mlp_inputs.to(device).float(), labels.to(device).float()
        outputs = model(mlp_inputs, inputs)
        test_true.append(labels.detach().cpu().numpy())
        test_pred.append(outputs.squeeze(-1).detach().cpu().numpy())
test_true = np.concatenate(test_true)
test_pred = np.concatenate(test_pred)

infer_time = (time.time() - start)
val_r2 = best_r2
test_r2 = r2_score(test_true, test_pred)

print("Lyra+MLP; MLP_hidden %s; Comb_hidden %s; WD %s"%(mlp_hid_dim,com_hid_dim,wd))
print(f"  Validation R2: {val_r2:.4f}")
print(f"  Test R2: {test_r2:.4f}")

In [None]:
# README text describing the saved model bundle. It is only written to disk if
# the commented-out write to `readmeout` in a later cell is re-enabled.
readme = '''
Combined model consisting of Janus and MLP
- Janus input: 8 x 56 encoding of the sequence;
               the upper 4 rows are the one-hot encoding of the target seq, 
               the lower 4 rows are the one-hot encoding of the primer seq.
- MLP input: 12 features scaled by standard scaling.
- Output: Score (0-1) for each primer pair and target sequence.
'''

In [None]:
# Output directory for the saved model bundle (model, scaler, README).
SUBPATH = '/home/jupyter/ADAPT_PCR_share/safe/dataset/0725_ml_combined_12feat'
# exist_ok=True replaces the exists()-then-makedirs() pair, which can fail if the
# directory appears between the check and the creation.
os.makedirs(SUBPATH, exist_ok=True)

modelout = '%s/model.pth'%SUBPATH
scaleout = '%s/scaler.joblib'%SUBPATH
readmeout = '%s/README'%SUBPATH

In [None]:
# Persist the fitted scaler with joblib. Saving the model and the README is
# currently disabled (commented out).
from joblib import dump, load
# Overrides the `scaleout` path assigned in the previous cell.
scaleout = '/home/jupyter/ADAPT_PCR_share/safe/design/pipeline/0728_scaler.joblib'
#torch.save(model, modelout)
dump(scaler, scaleout)
# with open(readmeout,'wt') as out:
#     out.write(readme)

In [None]:
# Display the scaler object (notebook output only).
scaler

In [None]:
# Round-trip the scaler through "scaler.joblib" (a relative path, so it is
# resolved against the current working directory).
import joblib
joblib.dump(scaler, "scaler.joblib")
# Load it back
scaler = joblib.load("scaler.joblib")