In [2]:
import sys
sys.path.append('..')

from deep import *

In [2]:
# Hyperparameters shared by the train_* functions below; the keys match their
# keyword parameters so the dict can be unpacked with **params.
params = dict(
    batch_size=132,
    dropout=0.449853682405601,
    lr=0.00048433947922833076,
    num_layers=3,
    output_dim=140,
    weight_decay=2.9229430030464116e-05,
)

# df is the primary dataset (split below); df2 is the second dataset that
# run_encoder_experiments uses for its transfer-learning evaluation.
df = pd.read_csv('../Data/detectability_homo.csv')
df2 = pd.read_csv('../Data/detectability_mus.csv')

# Positional split of the primary frame: rows [0, 67000) for training,
# [67000, 70000) for validation, everything after that for testing.
# NOTE(review): rows are taken in file order; no shuffling happens here.
df_train, df_val, df_test = df.iloc[:67000], df.iloc[67000:70000], df.iloc[70000:]

# Each frame is reduced to its two columns and transposed so that the first
# row unpacks to the peptide sequences and the second to the labels.
_xy_columns = ['peptide', 'detectability']
x2, y2 = df2[_xy_columns].values.T
x_train, y_train = df_train[_xy_columns].values.T
x_val, y_val = df_val[_xy_columns].values.T
x_test, y_test = df_test[_xy_columns].values.T
df.head()

Unnamed: 0,peptide,detectability
0,LLSEVEELNMSLTALREK,0
1,ERMDEEQKLYTD,0
2,YVPRAVLVDLEPGTMDSIR,0
3,TAHYGSLPQKSHGR,1
4,KFVADGIFK,1


In [3]:
def test_transfer_learning(encoder, loader):
    """Score an encoder's embeddings with a cross-validated k-NN classifier.

    Embeddings and targets come from ``get_embeddings(encoder, loader)``.
    A ``KNeighborsClassifier(30)`` is then evaluated on them through
    ``cross_val_score`` with ``cv=10`` and ``scoring='accuracy'``.

    Returns:
        The mean of the per-fold scores.
    """
    features, labels = get_embeddings(encoder, loader)
    knn = KNeighborsClassifier(30)
    fold_scores = cross_val_score(knn, features, labels,
                                  cv=10, scoring='accuracy')
    return fold_scores.mean()

def run_encoder_experiments(encoder, name):
    """Evaluate an encoder, persist its metrics and save embedding plots.

    Reads the notebook-level globals ``x_train``, ``y_train``, ``x_test``,
    ``y_test`` (primary dataset), ``x2``, ``y2`` (second dataset) and
    ``aminoacids``.

    Side effects (all paths relative to the working directory):
        * ``detectability_{name}_encoder_metrics.pkl`` - pickled metrics
        * ``detectability_{name}_embeddings.jpg`` - test-set embeddings
        * ``detectability_{name}_embeddings2.jpg`` - second-dataset embeddings
        * prints every metric as ``key value``

    Args:
        encoder: Encoder passed to ``test_encoder`` / ``get_embeddings``.
        name: Tag interpolated into the output file names.

    Returns:
        None.
    """
    # Encoder metrics on the primary dataset.
    metrics = test_encoder(encoder,
                           x_train, y_train,
                           x_test, y_test)

    # Encoder metrics on the second dataset (transfer learning).
    loader2 = DataLoader(
        PeptideDataset(x2, y2, aminoacids),
        batch_size=1024,
        shuffle=False,
        collate_fn=Collate()
    )
    metrics['Transfer ACC'] = test_transfer_learning(encoder, loader2)

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    with open(f'detectability_{name}_encoder_metrics.pkl', 'wb') as metrics_file:
        pickle.dump(metrics, metrics_file)

    print('Encoder metrics: ')
    for k, v in metrics.items():
        print(k, v)

    # Embeddings visualization on the primary test split.
    plt.figure()
    loader = DataLoader(
        PeptideDataset(x_test, y_test, aminoacids),
        batch_size=1024,
        shuffle=False,
        collate_fn=Collate()
    )
    embeds, targets = get_embeddings(encoder, loader)
    visualize_embeddings(embeds, targets, palette=None)
    plt.savefig(f'detectability_{name}_embeddings.jpg', dpi=300)

    # Embeddings visualization on the second dataset (transfer learning).
    # NOTE(review): this call relies on visualize_embeddings' default palette,
    # whereas the call above passes palette=None - confirm this is intended.
    plt.figure()
    embeds, targets = get_embeddings(encoder, loader2)
    visualize_embeddings(embeds, targets)
    plt.savefig(f'detectability_{name}_embeddings2.jpg', dpi=300)
    
def test_sample_efficiency(eval_fn, x_train, y_train, x_test, y_test, levels):
    results = defaultdict(list)
    for level in levels:
        n_t = int(0.9*level)
        n_v = (level - n_t)
        x_t, y_t = x_train[:n_t], y_train[:n_t]
        x_v, y_v = x_train[n_t:n_t+n_v], y_train[n_t:n_t+n_v]
        results['n_samples'].append(level)
        eval_results = eval_fn(x_t, y_t, x_v, y_v, x_test, y_test)
        for k, v in eval_results.items():
            results[k].append(v)
    return dict(results)

# GRU

In [4]:
def train_spervised(x_train, y_train, x_test, y_test,
                    output_dim, num_layers, dropout,
                    batch_size, lr, weight_decay,
                    epochs=1, early_stop=float('inf')):
    """Train a bidirectional GRU encoder with a sigmoid classification head.

    The network is ``RecurrentEncoder -> Dropout -> Linear(output_dim, 1) ->
    Sigmoid``, optimised with AdamW against ``nn.BCELoss``. It reads the
    notebook-level globals ``aminoacids`` and ``device``.

    Args:
        x_train, y_train: Data wrapped in the shuffled training loader.
        x_test, y_test: Data wrapped in the evaluation loader handed to
            ``Network.train`` as ``test_loader``.
        output_dim: Encoder output size; the embedding size is half of it.
        num_layers, dropout: Forwarded to the encoder (dropout is also used
            for the head's ``nn.Dropout``).
        batch_size: Training batch size (evaluation uses 1024).
        lr, weight_decay: AdamW settings.
        epochs, early_stop: Forwarded to ``Network.train``.

    Returns:
        ``(model, history)`` - the ``Network`` wrapper and whatever
        ``Network.train`` returns.
    """
    # Datasets first, loaders second, matching the original build order.
    fit_data = PeptideDataset(x_train, y_train, aminoacids)
    eval_data = PeptideDataset(x_test, y_test, aminoacids)

    train_loader = DataLoader(dataset=fit_data, batch_size=batch_size,
                              shuffle=True, collate_fn=Collate())
    test_loader = DataLoader(dataset=eval_data, batch_size=1024,
                             shuffle=False, collate_fn=Collate())

    encoder = RecurrentEncoder(
        input_dim=len(aminoacids) + 2,
        embedding_dim=output_dim // 2,
        output_dim=output_dim,
        bidirectional=True,
        rnn_type='gru',
        pool_type='avg',
        num_layers=num_layers,
        dropout=dropout,
    )
    classifier = nn.Sequential(
        encoder,
        nn.Dropout(dropout),
        nn.Linear(output_dim, 1),
        nn.Sigmoid(),
    ).to(device)

    optimizer = torch.optim.AdamW(classifier.parameters(),
                                  lr=lr, weight_decay=weight_decay)
    # gamma / path are Network settings - presumably an LR decay factor and
    # a checkpoint file; confirm against the Network implementation.
    model = Network(net=classifier, optimizer=optimizer, loss=nn.BCELoss(),
                    gamma=0.97, path='supervised_gru.pth')

    fit_options = dict(test_loader=test_loader, epochs=epochs,
                       early_stop=early_stop, verbose=False)
    history = model.train(train_loader, **fit_options)
    return model, history

In [5]:
# # Training
# model, history = train_spervised(x_train, y_train, x_val, y_val,
#                                  **params, epochs=1, early_stop=10)
# history.to_csv('detectability_supervised_training.csv')
# plt.figure()
# history.plot()

# # Classifier metrics
# metrics = test_classifier(model,
#                           x_train, y_train,
#                           x_test, y_test)
# pickle.dump(metrics, open('detectability_supervised_clf_metrics.pkl', 'wb'))
# print('Classification Metrics:')
# for k, v in metrics.items():
#     print(k, v)
# print()

# run_encoder_experiments(model.net[0], 'supervised')

In [6]:
# def eval_fn(x_train, y_train, x_val, y_val, x_test, y_test):
#     model, history = train_spervised(x_train, y_train, x_val, y_val,
#                                      **params, epochs=1, early_stop=10)
#     return test_classifier(model, x_train, y_train, x_test, y_test)

# levels = [1000*2**i for i in range(7)]
# results = test_sample_efficiency(eval_fn, x_train, y_train, x_test, y_test, levels)
# pickle.dump(results, open('detectability_supervised_sample_efficiency.pkl', 'wb'))

# Triplet

In [7]:
def train_triplet(x_train, y_train, x_test, y_test,
                  output_dim, num_layers, dropout,
                  batch_size, lr, weight_decay,
                  n_train=1e2, n_test=1e2,
                  epochs=1, early_stop=float('inf')):
    """Train a bidirectional GRU encoder with a triplet loss.

    The encoder is wrapped in ``SiameseNet`` and optimised with AdamW against
    ``TripletLoss``. It reads the notebook-level globals ``aminoacids`` and
    ``device``.

    Args:
        x_train, y_train: Source data for the training triplet dataset.
        x_test, y_test: Source data for the evaluation triplet dataset.
        output_dim: Encoder output size; the embedding size is half of it.
        num_layers, dropout: Forwarded to the encoder.
        batch_size: Training batch size (evaluation uses 1024).
        lr, weight_decay: AdamW settings.
        n_train, n_test: Third argument of ``PeptideTripletDataset`` for the
            respective split - presumably the number of triplets; confirm.
        epochs, early_stop: Forwarded to ``Network.train``.

    Returns:
        ``(model, history)`` - the ``Network`` wrapper and whatever
        ``Network.train`` returns.
    """
    # Datasets first, loaders second, matching the original build order.
    fit_triplets = PeptideTripletDataset(x_train, y_train, n_train, aminoacids)
    eval_triplets = PeptideTripletDataset(x_test, y_test, n_test, aminoacids)

    train_loader = DataLoader(dataset=fit_triplets, batch_size=batch_size,
                              shuffle=True, collate_fn=CollateTriplet())
    test_loader = DataLoader(dataset=eval_triplets, batch_size=1024,
                             shuffle=False, collate_fn=CollateTriplet())

    encoder = RecurrentEncoder(
        input_dim=len(aminoacids) + 2,
        embedding_dim=output_dim // 2,
        output_dim=output_dim,
        bidirectional=True,
        rnn_type='gru',
        pool_type='avg',
        num_layers=num_layers,
        dropout=dropout,
    )
    siamese = SiameseNet(encoder).to(device)

    optimizer = torch.optim.AdamW(siamese.parameters(),
                                  lr=lr, weight_decay=weight_decay)
    model = Network(net=siamese, optimizer=optimizer, loss=TripletLoss(),
                    gamma=0.9, path='triplet_gru.pth')

    fit_options = dict(test_loader=test_loader, epochs=epochs,
                       early_stop=early_stop, verbose=False)
    history = model.train(train_loader, **fit_options)
    return model, history

In [8]:
# model, history = train_triplet(x_train, y_train, x_val, y_val,
#                                **params, n_train=3e5, n_test=1e4,
#                                epochs=30, early_stop=3)
# history.to_csv('detectability_triplet_training.csv')
# plt.figure()
# history.plot()

# run_encoder_experiments(model.net.encoder, 'triplet')

In [9]:
# def eval_fn(x_train, y_train, x_val, y_val, x_test, y_test):
#     n_t, n_v = len(y_train), len(y_val)
#     model, history = train_triplet(x_train, y_train, x_val, y_val,
#                                    **params, n_train=3*n_t, n_test=3*n_v,
#                                    epochs=30, early_stop=3)
#     return test_encoder(model.net.encoder, x_train, y_train, x_test, y_test)

# levels = [1000*2**i for i in range(7)]
# results = test_sample_efficiency(eval_fn, x_train, y_train, x_test, y_test, levels)
# pickle.dump(results, open('detectability_triplet_sample_efficiency.pkl', 'wb'))

# Supervised Triplet

In [10]:
def train_supervised_triplet(x_train, y_train, x_test, y_test,
                             output_dim, num_layers, dropout,
                             batch_size, lr, weight_decay,
                             n_train=1e2, n_test=1e2,
                             epochs=1, early_stop=float('inf')):
    """Train a GRU encoder jointly with a triplet loss and a BCE head.

    ``SupervisedSiameseNet`` combines the encoder with a predictor
    (``Dropout -> Linear(output_dim, 1) -> Sigmoid``); training uses AdamW
    and ``SSLoss(embed_loss=TripletLoss(), pred_loss=nn.BCELoss())``. It reads
    the notebook-level globals ``aminoacids`` and ``device``.

    Args:
        x_train, y_train: Source data for the training triplet dataset.
        x_test, y_test: Source data for the evaluation triplet dataset.
        output_dim: Encoder output size; the embedding size is half of it.
        num_layers, dropout: Forwarded to the encoder (dropout is also used
            in the predictor).
        batch_size: Training batch size (evaluation uses 1024).
        lr, weight_decay: AdamW settings.
        n_train, n_test: Third argument of ``PeptideTripletDataset`` for the
            respective split - presumably the number of triplets; confirm.
        epochs, early_stop: Forwarded to ``Network.train``.

    Returns:
        ``(model, history)`` - the ``Network`` wrapper and whatever
        ``Network.train`` returns.
    """
    # Datasets first, loaders second, matching the original build order.
    fit_triplets = PeptideTripletDataset(x_train, y_train, n_train, aminoacids)
    eval_triplets = PeptideTripletDataset(x_test, y_test, n_test, aminoacids)

    train_loader = DataLoader(dataset=fit_triplets, batch_size=batch_size,
                              shuffle=True,
                              collate_fn=CollateSupervisedTriplet())
    test_loader = DataLoader(dataset=eval_triplets, batch_size=1024,
                             shuffle=False,
                             collate_fn=CollateSupervisedTriplet())

    encoder = RecurrentEncoder(
        input_dim=len(aminoacids) + 2,
        embedding_dim=output_dim // 2,
        output_dim=output_dim,
        bidirectional=True,
        rnn_type='gru',
        pool_type='avg',
        num_layers=num_layers,
        dropout=dropout,
    )
    predictor = nn.Sequential(
        nn.Dropout(dropout),
        nn.Linear(output_dim, 1),
        nn.Sigmoid(),
    )
    joint_net = SupervisedSiameseNet(encoder=encoder,
                                     predictor=predictor).to(device)

    optimizer = torch.optim.AdamW(joint_net.parameters(),
                                  lr=lr, weight_decay=weight_decay)
    combined_loss = SSLoss(embed_loss=TripletLoss(), pred_loss=nn.BCELoss())
    model = Network(net=joint_net, optimizer=optimizer, loss=combined_loss,
                    gamma=0.9, path='supervised_triplet_gru.pth')

    # NOTE(review): unlike the other train_* functions, no `verbose` argument
    # is passed here, so Network.train's default applies - confirm intended.
    fit_options = dict(test_loader=test_loader, epochs=epochs,
                       early_stop=early_stop)
    history = model.train(train_loader, **fit_options)
    return model, history

In [11]:
# model, history = train_supervised_triplet(x_train, y_train, x_val, y_val,
#                                           **params, n_train=3e5, n_test=1e4,
#                                           epochs=30, early_stop=3)
# history.to_csv('detectability_supervised_triplet_training.csv')
# plt.figure()
# history.plot()

# run_encoder_experiments(model.net.encoder, 'supervised_triplet')

In [12]:
# def eval_fn(x_train, y_train, x_val, y_val, x_test, y_test):
#     n_t, n_v = len(y_train), len(y_val)
#     model, history = train_supervised_triplet(x_train, y_train, x_val, y_val,
#                                               **params, n_train=3*n_t, n_test=3*n_v,
#                                               epochs=30, early_stop=3)
#     return test_encoder(model.net.encoder, x_train, y_train, x_test, y_test)

# levels = [1000*2**i for i in range(7)]
# results = test_sample_efficiency(eval_fn, x_train, y_train, x_test, y_test, levels)
# pickle.dump(results, open('detectability_supervised_triplet_sample_efficiency.pkl', 'wb'))

# Ensemble

In [13]:
def train_ensemble(x_train, y_train,
                   output_dim, num_layers, dropout,
                   batch_size, lr, weight_decay,
                   epochs=1, early_stop=float('inf')):
    """Train a voting ensemble of six recurrent classifiers.

    One network is built for every combination of ``rnn_type`` in
    ``('gru', 'lstm')`` and ``pool_type`` in ``('avg', 'max', 'last')``; each
    is ``RecurrentEncoder -> Dropout -> Linear(output_dim, 1) -> Sigmoid``
    with its own AdamW optimiser, ``nn.BCELoss`` and path ``model{i}.pth``.
    The members are wrapped in ``VotingEnsemble`` and trained together. It
    reads the notebook-level globals ``aminoacids`` and ``device``.

    Args:
        x_train, y_train: Training data.
        output_dim: Encoder output size; the embedding size is half of it.
        num_layers, dropout: Forwarded to every encoder (dropout is also used
            for each head's ``nn.Dropout``).
        batch_size: Training batch size.
        lr, weight_decay: AdamW settings shared by all members.
        epochs, early_stop: Forwarded to ``VotingEnsemble.train``.

    Returns:
        ``(model, history)`` - the ``VotingEnsemble`` and whatever its
        ``train`` method returns.
    """
    train_loader = DataLoader(
        dataset=PeptideDataset(x_train, y_train, aminoacids),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=Collate(),
    )
    # No evaluation loader is built: the function takes no held-out data and
    # the ensemble's train() call below receives only the training loader.

    # All networks are created before any optimiser, in gru->lstm order with
    # avg->max->last pooling nested inside.
    nets = [
        nn.Sequential(
            RecurrentEncoder(
                input_dim=len(aminoacids) + 2,
                embedding_dim=output_dim // 2,
                output_dim=output_dim,
                bidirectional=True,
                rnn_type=cell,
                pool_type=pooling,
                num_layers=num_layers,
                dropout=dropout,
            ),
            nn.Dropout(dropout),
            nn.Linear(output_dim, 1),
            nn.Sigmoid(),
        ).to(device)
        for cell in ['gru', 'lstm']
        for pooling in ['avg', 'max', 'last']
    ]

    members = [
        Network(
            net=member_net,
            optimizer=torch.optim.AdamW(member_net.parameters(), lr=lr,
                                        weight_decay=weight_decay),
            loss=nn.BCELoss(),
            gamma=0.97,
            path=f'model{idx}.pth',
        )
        for idx, member_net in enumerate(nets)
    ]

    model = VotingEnsemble(members)
    history = model.train(train_loader,
                          epochs=epochs, early_stop=early_stop,
                          verbose=False)
    return model, history

In [14]:
# # Training
# model, history = train_ensemble(np.hstack([x_train, x_val]),
#                                 np.hstack([y_train, y_val]),
#                                 **params, epochs=3)

# # Classifier metrics
# metrics = test_classifier(model,
#                           np.hstack([x_train, x_val]),
#                           np.hstack([y_train, y_val]),
#                           x_test, y_test)

# # pickle.dump(metrics, open('detectability_ensemble_clf_metrics.pkl', 'wb'))
# print('Classification Metrics:')
# for k, v in metrics.items():
#     print(k, v)
# print()

In [None]:
# def eval_fn(x_train, y_train, x_val, y_val, x_test, y_test):
#     model, history = train_ensemble(x_train, y_train,
#                                     **params, epochs=50, early_stop=100)
#     return test_classifier(model, x_train, y_train, x_test, y_test)

# levels = [1000*2**i for i in range(7)]
# results = test_sample_efficiency(eval_fn, x_train, y_train, x_test, y_test, levels)
# pickle.dump(results, open('detectability_ensemble_sample_efficiency.pkl', 'wb'))