In [1]:
import sys
sys.path.append('..')

from deep import *

In [8]:
# Load the detectability table and cut it into fixed, contiguous splits:
# rows [0, 60000) train, [60000, 70000) validation, everything after that test.
df = pd.read_csv('../Data/detectability_homo.csv')
df_train, df_val, df_test = df.iloc[:60000], df.iloc[60000:70000], df.iloc[70000:]

# Transposing the two-column value matrix gives one row of peptides (x_*)
# and one row of detectability labels (y_*) per split.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    split[['peptide', 'detectability']].values.T
    for split in (df_train, df_val, df_test)
)
df.head()

Unnamed: 0,peptide,detectability
0,LLSEVEELNMSLTALREK,0
1,ERMDEEQKLYTD,0
2,YVPRAVLVDLEPGTMDSIR,0
3,TAHYGSLPQKSHGR,1
4,KFVADGIFK,1


In [4]:
class Objective:
    """Callable hyperparameter-search objective.

    An instance holds a training dataset, a held-out dataset, a model factory
    and a collate function. Calling it with a parameter dict builds a fresh
    model, trains it on the training dataset and returns the loss that the
    model reports on the held-out dataset.

    Args:
        train_dataset: Dataset wrapped in the shuffled training ``DataLoader``.
        test_dataset: Dataset wrapped in the evaluation ``DataLoader``.
        model_getter: Factory invoked as ``model_getter(**params)``, where
            ``params`` is the parameter dict without ``'batch_size'``. The
            object it returns must provide ``train(loader, ...)`` and
            ``evaluate(loader)``.
        collate: ``collate_fn`` handed to both loaders.
        epochs: Forwarded to ``model.train`` as ``epochs``.
        early_stop: Forwarded to ``model.train`` as ``early_stop`` by
            ``__call__``.
    """

    # Batch size used for the (unshuffled) evaluation loader.
    EVAL_BATCH_SIZE = 1024

    def __init__(self, train_dataset, test_dataset, model_getter, collate, epochs=100, early_stop=10):
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.collate = collate
        self.get_model = model_getter
        self.epochs = epochs
        self.early_stop = early_stop

    def _build_loaders(self, train_dataset, test_dataset, batch_size):
        """Return ``(train_loader, test_loader)`` for the given datasets."""
        train_loader = DataLoader(
            dataset=train_dataset,
            # Cast defensively: DataLoader only accepts integer batch sizes, and
            # sampled values may arrive as floats (e.g. 32.0).
            batch_size=int(batch_size),
            shuffle=True,
            collate_fn=self.collate,
        )
        test_loader = DataLoader(
            dataset=test_dataset,
            batch_size=self.EVAL_BATCH_SIZE,
            shuffle=False,
            collate_fn=self.collate,
        )
        return train_loader, test_loader

    def __call__(self, space):
        """Train a model built from ``space`` and return its held-out loss.

        Args:
            space: Dict of sampled parameters. ``'batch_size'`` is consumed
                here; every other entry is passed to the model factory. The
                dict itself is not modified.

        Returns:
            The ``'loss'`` entry of ``model.evaluate(test_loader)``.
        """
        params = space.copy()
        batch_size = params.pop('batch_size')
        model = self.get_model(**params)

        train_loader, test_loader = self._build_loaders(
            self.train_dataset, self.test_dataset, batch_size)
        model.train(train_loader, epochs=self.epochs, early_stop=self.early_stop, verbose=False)
        return model.evaluate(test_loader)['loss']

    def get_training_history(self, train_dataset, test_dataset, params):
        """Train a model with fixed ``params`` and return it with its history.

        Unlike ``__call__``, the datasets are given explicitly, the test loader
        is passed to ``model.train``, accuracy is requested through
        ``metrics=['acc']`` and ``early_stop`` is not forwarded.

        Args:
            train_dataset: Dataset for the shuffled training loader.
            test_dataset: Dataset for the evaluation loader.
            params: Parameter dict including ``'batch_size'``; not modified.

        Returns:
            Tuple ``(model, history)`` where ``history`` is whatever
            ``model.train`` returns.
        """
        params = params.copy()
        batch_size = params.pop('batch_size')
        model = self.get_model(**params)
        # Set before training; presumably where Network stores its weights (TODO confirm).
        model.path = 'detectability_model.pth'

        train_loader, test_loader = self._build_loaders(train_dataset, test_dataset, batch_size)
        history = model.train(train_loader, test_loader=test_loader,
                              epochs=self.epochs, verbose=False, metrics=['acc'])
        return model, history

In [5]:
# Registry of hyperparameter-search setups keyed by model name; each model section
# below stores a dict with 'objective', 'space' and 'mapping' entries in it.
optim_params = {}

### GRU

In [6]:
def gru_getter(output_dim, num_layers, dropout, lr, weight_decay, optimizer='adam'):
    """Build a bidirectional-GRU detectability classifier wrapped in ``Network``.

    The net is ``RecurrentEncoder -> Dropout -> Linear(output_dim, 1) -> Sigmoid``
    and is moved to ``device``; it is trained with binary cross-entropy.

    Args:
        output_dim: Encoder output size and input size of the final linear
            layer; the embedding size is ``output_dim // 2``.
        num_layers: Number of recurrent layers passed to the encoder.
        dropout: Dropout rate used inside the encoder and before the linear layer.
        lr: Optimizer learning rate.
        weight_decay: Optimizer weight decay.
        optimizer: ``'sgd'`` selects SGD with Nesterov momentum 0.9; any other
            value selects AdamW. Defaults to ``'adam'`` so the getter can be
            called from a search space that does not sample the optimizer.

    Returns:
        A ``Network`` holding the net, its optimizer and the loss.
    """
    net = nn.Sequential(
        RecurrentEncoder(
            # +2 presumably reserves ids for special tokens (TODO confirm in PeptideDataset).
            input_dim=len(aminoacids)+2,
            embedding_dim=output_dim//2,
            output_dim=output_dim,
            bidirectional=True,
            rnn_type='gru',
            pool_type='avg',
            num_layers=num_layers,
            dropout=dropout,
        ),
        nn.Dropout(dropout),
        nn.Linear(output_dim, 1),
        nn.Sigmoid()
    ).to(device)

    if optimizer == 'sgd':
        optim = torch.optim.SGD(net.parameters(), lr=lr,
                                momentum=0.9, nesterov=True,
                                weight_decay=weight_decay)
    else:
        optim = torch.optim.AdamW(net.parameters(), lr=lr,
                                  weight_decay=weight_decay)
    model = Network(
        net=net,
        optimizer=optim,
        loss=nn.BCELoss(),
        # Presumably a learning-rate decay factor (TODO confirm in Network).
        gamma=0.97
    )
    return model

# Search space for the GRU model. The quantised log-uniform entries are sampled on a
# log scale and snapped to multiples of 4 (hyperopt's q argument).
# NOTE(review): 'optimizer' is not sampled, so gru_getter must be callable without it.
gru_space = dict(
    output_dim=hyperopt.hp.qloguniform('output_dim', np.log(16), np.log(256), 4),
    num_layers=hyperopt.hp.quniform('num_layers', 1, 4, 1),
    dropout=hyperopt.hp.uniform('dropout', 0, 0.5),
    batch_size=hyperopt.hp.qloguniform('batch_size', np.log(8), np.log(256), 4),
    lr=hyperopt.hp.loguniform('lr', np.log(1e-5), np.log(0.1)),
    weight_decay=hyperopt.hp.loguniform('weight_decay', np.log(1e-5), np.log(1)),
)

# Post-processing per sampled value: the quantised samplers return floats, while
# these three parameters are used as integers.
gru_mapping = {
    'output_dim': lambda x: int(x),
    'num_layers': lambda x: int(x),
    'batch_size': lambda x: int(x),
}

# Trains on the training split and scores on the validation split. The epochs value
# here is only an initial setting; the Optimize cell overwrites it for the model it runs.
gru_objective = Objective(
    train_dataset=PeptideDataset(x_train, y_train, aminoacids),
    test_dataset=PeptideDataset(x_val, y_val, aminoacids),
    model_getter=gru_getter,
    collate=Collate(),
    epochs=1,
    early_stop=10,
)

optim_params['GRU'] = {
    'objective': gru_objective,
    'space': gru_space,
    'mapping': gru_mapping,
}

### CNNGRU

In [7]:
def cnngru_getter(output_dim, num_layers, dropout, lr, weight_decay, optimizer='adam'):
    """Build a convolutional-recurrent detectability classifier wrapped in ``Network``.

    The net is ``ConvolutionalRecurrentEncoder -> Dropout -> Linear(output_dim, 1)
    -> Sigmoid`` and is trained with binary cross-entropy. Like the other model
    getters in this notebook, the net is moved to ``device`` before the
    optimizer is created from its parameters.

    Args:
        output_dim: Encoder output size and input size of the final linear layer.
        num_layers: Number of layers passed to the encoder.
        dropout: Dropout rate used inside the encoder and before the linear layer.
        lr: Optimizer learning rate.
        weight_decay: Optimizer weight decay.
        optimizer: ``'sgd'`` selects SGD with Nesterov momentum 0.9; any other
            value selects AdamW.

    Returns:
        A ``Network`` holding the net, its optimizer and the loss.
    """
    net = nn.Sequential(
        ConvolutionalRecurrentEncoder(
            # +2 presumably reserves ids for special tokens (TODO confirm in PeptideDataset).
            input_dim=len(aminoacids)+2,
            output_dim=output_dim,
            bidirectional=True,
            num_layers=num_layers,
            dropout=dropout,
        ),
        nn.Dropout(dropout),
        nn.Linear(output_dim, 1),
        nn.Sigmoid(),
    ).to(device)

    if optimizer == 'sgd':
        optim = torch.optim.SGD(net.parameters(), lr=lr,
                                momentum=0.9, nesterov=True,
                                weight_decay=weight_decay)
    else:
        optim = torch.optim.AdamW(net.parameters(), lr=lr,
                                  weight_decay=weight_decay)
    model = Network(
        net=net,
        optimizer=optim,
        loss=nn.BCELoss(),
        # Presumably a learning-rate decay factor (TODO confirm in Network).
        gamma=0.97
    )
    return model
        
# Search space for the CNN+GRU model. The quantised log-uniform entries are sampled
# on a log scale and snapped to multiples of 4 (hyperopt's q argument).
# 'optimizer' is not sampled; cnngru_getter falls back to its default.
cnngru_space = dict(
    output_dim=hyperopt.hp.qloguniform('output_dim', np.log(16), np.log(256), 4),
    num_layers=hyperopt.hp.quniform('num_layers', 1, 4, 1),
    dropout=hyperopt.hp.uniform('dropout', 0, 0.5),
    batch_size=hyperopt.hp.qloguniform('batch_size', np.log(8), np.log(256), 4),
    lr=hyperopt.hp.loguniform('lr', np.log(1e-5), np.log(0.1)),
    weight_decay=hyperopt.hp.loguniform('weight_decay', np.log(1e-5), np.log(1)),
)

# Post-processing per sampled value: the quantised samplers return floats, while
# these three parameters are used as integers.
cnngru_mapping = {
    'output_dim': lambda x: int(x),
    'num_layers': lambda x: int(x),
    'batch_size': lambda x: int(x),
}

# Trains on the training split and scores on the validation split. The epochs value
# here is only an initial setting; the Optimize cell overwrites it for the model it runs.
cnngru_objective = Objective(
    train_dataset=PeptideDataset(x_train, y_train, aminoacids),
    test_dataset=PeptideDataset(x_val, y_val, aminoacids),
    model_getter=cnngru_getter,
    collate=Collate(),
    epochs=1,
    early_stop=10,
)

optim_params['CNNGRU'] = {
    'objective': cnngru_objective,
    'space': cnngru_space,
    'mapping': cnngru_mapping,
}

## BERT

In [8]:
def bert_getter(output_dim, num_layers, num_heads, dropout, lr, weight_decay, optimizer='adam'):
    """Build a transformer ('bert' kind) detectability classifier wrapped in ``Network``.

    Args:
        output_dim: Encoder output size and input size of the final linear
            layer; the encoder's hidden size is twice this value.
        num_layers: Number of encoder layers.
        num_heads: Number of attention heads passed to the encoder.
        dropout: Dropout rate used inside the encoder and before the linear layer.
        lr: Optimizer learning rate.
        weight_decay: Optimizer weight decay.
        optimizer: ``'sgd'`` selects SGD with Nesterov momentum 0.9; any other
            value selects AdamW.

    Returns:
        A ``Network`` holding the net (on ``device``), its optimizer and a BCE loss.
    """
    encoder = TransformerEncoder(
        input_dim=len(aminoacids) + 2,
        output_dim=output_dim,
        hidden_dim=output_dim * 2,
        kind='bert',
        num_heads=num_heads,
        num_layers=num_layers,
        dropout=dropout,
    )
    # Classification head: dropout, one logit, squashed to a probability for BCELoss.
    head = [nn.Dropout(dropout), nn.Linear(output_dim, 1), nn.Sigmoid()]
    classifier = nn.Sequential(encoder, *head).to(device)

    if optimizer != 'sgd':
        chosen_optimizer = torch.optim.AdamW(
            classifier.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        chosen_optimizer = torch.optim.SGD(
            classifier.parameters(), lr=lr, momentum=0.9, nesterov=True,
            weight_decay=weight_decay)

    return Network(net=classifier, optimizer=chosen_optimizer, loss=nn.BCELoss(), gamma=0.97)
        
# Search space for the BERT-style model. Every hyperopt label must be unique within
# a space, so each entry's label matches its dict key.
bert_space = {
    # Log-scale samples snapped to multiples of 4.
    'output_dim': hyperopt.hp.qloguniform('output_dim', np.log(16), np.log(256), 4),
    'num_layers': hyperopt.hp.quniform('num_layers', 1, 4, 1),
    # Samples an exponent of 0 or 2 (q=2 on [0, 2]); bert_mapping turns it into
    # 2**0 = 1 or 2**2 = 4 attention heads.
    'num_heads': hyperopt.hp.quniform('num_heads', 0, 2, 2),
    'dropout': hyperopt.hp.uniform('dropout', 0, 0.5),
    'batch_size': hyperopt.hp.qloguniform('batch_size', np.log(8), np.log(256), 4),
    'lr': hyperopt.hp.loguniform('lr', np.log(1e-5), np.log(0.1)),
    'weight_decay': hyperopt.hp.loguniform('weight_decay', np.log(1e-5), np.log(1)),
    # 'optimizer' is not sampled; bert_getter falls back to its default.
}

# Post-processing per sampled value: the quantised samplers return floats, while
# these parameters are used as integers.
bert_mapping = dict(
    output_dim=lambda x: int(x),
    num_layers=lambda x: int(x),
    num_heads=lambda x: int(2**x),
    batch_size=lambda x: int(x),
)

# Trains on the training split and scores on the validation split. The epochs value
# here is only an initial setting; the Optimize cell overwrites it for the model it runs.
bert_objective = Objective(train_dataset=PeptideDataset(x_train, y_train, aminoacids),
                           test_dataset=PeptideDataset(x_val, y_val, aminoacids),
                           model_getter=bert_getter,
                           collate=Collate(),
                           epochs=1, early_stop=10)

optim_params['BERT'] = dict(
    objective=bert_objective,
    space=bert_space,
    mapping=bert_mapping,
)

## BERTGRU

In [None]:
def bertgru_getter(output_dim, num_layers, num_heads, dropout, lr, weight_decay, optimizer='adam'):
    """Build a transformer+recurrent detectability classifier wrapped in ``Network``.

    Args:
        output_dim: Encoder output size and input size of the final linear
            layer; the encoder's hidden size is twice this value.
        num_layers: Number of encoder layers.
        num_heads: Number of attention heads passed to the encoder.
        dropout: Dropout rate used inside the encoder and before the linear layer.
        lr: Optimizer learning rate.
        weight_decay: Optimizer weight decay.
        optimizer: ``'sgd'`` selects SGD with Nesterov momentum 0.9; any other
            value selects AdamW.

    Returns:
        A ``Network`` holding the net (on ``device``), its optimizer and a BCE loss.
    """
    encoder_kwargs = dict(
        input_dim=len(aminoacids) + 2,
        output_dim=output_dim,
        hidden_dim=output_dim * 2,
        kind='bert',
        num_heads=num_heads,
        num_layers=num_layers,
        dropout=dropout,
        bidirectional=True,
    )
    layers = (
        TransformerRecurrentEncoder(**encoder_kwargs),
        nn.Dropout(dropout),
        nn.Linear(output_dim, 1),
        nn.Sigmoid(),
    )
    classifier = nn.Sequential(*layers).to(device)

    # Settings shared by both optimizers; SGD additionally gets Nesterov momentum.
    shared = dict(lr=lr, weight_decay=weight_decay)
    use_sgd = optimizer == 'sgd'
    if use_sgd:
        trainer = torch.optim.SGD(classifier.parameters(), momentum=0.9, nesterov=True, **shared)
    else:
        trainer = torch.optim.AdamW(classifier.parameters(), **shared)

    return Network(net=classifier, optimizer=trainer, loss=nn.BCELoss(), gamma=0.97)
        
# Search space for the BERT+GRU model. Every hyperopt label must be unique within
# a space, so each entry's label matches its dict key.
bertgru_space = {
    # Log-scale samples snapped to multiples of 4.
    'output_dim': hyperopt.hp.qloguniform('output_dim', np.log(16), np.log(256), 4),
    'num_layers': hyperopt.hp.quniform('num_layers', 1, 2, 1),
    # Samples an exponent of 0 or 2 (q=2 on [0, 2]); bertgru_mapping turns it into
    # 2**0 = 1 or 2**2 = 4 attention heads.
    'num_heads': hyperopt.hp.quniform('num_heads', 0, 2, 2),
    'dropout': hyperopt.hp.uniform('dropout', 0, 0.5),
    'batch_size': hyperopt.hp.qloguniform('batch_size', np.log(8), np.log(256), 4),
    'lr': hyperopt.hp.loguniform('lr', np.log(1e-5), np.log(0.1)),
    'weight_decay': hyperopt.hp.loguniform('weight_decay', np.log(1e-5), np.log(1)),
    # 'optimizer' is not sampled; bertgru_getter falls back to its default.
}

# Post-processing per sampled value: the quantised samplers return floats, while
# these parameters are used as integers.
bertgru_mapping = dict(
    output_dim=lambda x: int(x),
    num_layers=lambda x: int(x),
    num_heads=lambda x: int(2**x),
    batch_size=lambda x: int(x),
)

# Trains on the training split and scores on the validation split. The epochs value
# here is only an initial setting; the Optimize cell overwrites it for the model it runs.
bertgru_objective = Objective(train_dataset=PeptideDataset(x_train, y_train, aminoacids),
                              test_dataset=PeptideDataset(x_val, y_val, aminoacids),
                              model_getter=bertgru_getter,
                              collate=Collate(),
                              epochs=1, early_stop=10)

optim_params['BERTGRU'] = dict(
    objective=bertgru_objective,
    space=bertgru_space,
    mapping=bertgru_mapping,
)

## Optimize

In [None]:
# Which registered search to run, and the budget for it.
model_name = 'BERTGRU'
epochs = 20          # training epochs per trial
early_stopping = 3   # early-stop setting forwarded to model.train for each trial
evals = 25           # number of evaluations passed to optimize()

# Optimization
model_optim_params = optim_params[model_name]
model_optim_params['objective'].epochs = epochs
# Objective reads `self.early_stop`; assigning to any other attribute name would
# leave the value given at construction time in effect.
model_optim_params['objective'].early_stop = early_stopping
params, optim_history = optimize(**model_optim_params, evals=evals)
# Persist the search history next to the notebook, one CSV per model.
pd.DataFrame(optim_history).to_csv(f'detectability_{model_name.lower()}_optim.csv', index=False)