## Connect to Google Drive and install requirements

Connect your Google Drive

In [0]:
import os
from google.colab import drive
# Mount Google Drive at a path relative to the current working directory;
# the notebook later changes into a folder below this mount point.
mount_point = './drive'
drive.mount(mount_point)

The DL4papers source code and data must already be uploaded to your Google Drive.

In [0]:
# Location of the DL4papers root directory inside the mounted Google Drive.
# Edit this path if the project was uploaded somewhere else.
# NOTE: the path is relative, so it only resolves from the directory in which
# the drive was mounted.
dl4papers_root = "./drive/My Drive/DL4papers/"
os.chdir(dl4papers_root)

Install the requirements (usually already satisfied on Colab machines).

In [0]:
!python -m pip install --user -r ./requirements.txt

## Train and test with 10-fold cross-validation

This can take about 4 hours.

In [0]:
import random, time,shutil
import torch
import numpy as np
import pandas as pd
from src.sampler import Sampler
from src.dataloader import Dataloader
from src.model import Model
from src.logger import Logger

In [0]:

# Global params =======================
# Prefer the GPU but fall back to the CPU when CUDA is unavailable, matching
# the torch.cuda.is_available() guard already used for seeding below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_folds = 10
n_batch = 64
min_train_auc = 0.8  # a training run ending below this AUC counts as failed
max_attempts = 10    # training attempts allowed per fold before giving up

# Start every run from a freshly created results directory.
res_dir = "results_colab/"
shutil.rmtree(res_dir, ignore_errors=True)
os.mkdir(res_dir)
logger=Logger(res_dir)

# Run the whole cross-validation once per entity type paired with "mutation".
for entity in ["gene", "drug"]:

    # Reproducibility: reseed every random number generator once per entity.
    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(1)


    # Per-entity test log; its header names the columns read by the plotting cell.
    logger.start("test_%s_mutation" % entity)
    logger.log("entity\tfold\ttest_AUC\ttest_F1\ttest_precision\ttest_sensitivity\n")
    search_pair=[entity, "mutation"]

    dloader=Dataloader(data_dir="data/", device=device, search_pair=search_pair)

    # One sampler per fold, all built from the same articles and labels.
    sampler_list = [
        Sampler(dloader.get_articles(), dloader.get_labels(),
                n_batch=n_batch, fold=fold, nfold=n_folds)
        for fold in range(n_folds)
    ]

    fold = 0
    attempts = 0  # training attempts made so far for the current fold
    while fold < n_folds:
        attempts += 1

        # A fresh model is created for every attempt.
        model = Model(res_dir, n_batch=n_batch, device=device)

        train_log = "train_%s_%02d" % (entity, fold)
        msg = "iter\ttrain_loss\ttrain_AUC\ttrain_F1\n"
        logger.log(msg, train_log)

        # Reset so that an empty epoch range is treated as a failed attempt
        # instead of reusing the AUC left over from a previous fold.
        train_auc = float("-inf")
        for it in range(model.n_epochs):
            data, labels, _ = dloader.get_batch(sampler_list[fold].batch_ind("train"))
            train_loss, train_auc, train_f1 = model.train(data,labels)

            msg = "%d\t%.3f\t%.3f\t%.3f\n" % \
                    (it, train_loss, train_auc, train_f1)
            # Only every 50th iteration is logged verbosely.
            logger.log(msg, train_log, verbose=(it % 50 == 0))

        if train_auc < min_train_auc:  # train failed
            # Retry the same fold with a new model, but never loop forever.
            if attempts >= max_attempts:
                raise RuntimeError(
                    "Training for entity '%s', fold %d did not reach AUC %.2f "
                    "after %d attempts" % (entity, fold, min_train_auc, attempts))
            continue
        _, test_auc, test_f1, test_pre, test_rec = model.test(sampler_list[fold].batch_ind("test"), dloader)

        msg="%s\t%d\t%.3f\t%.3f\t%.3f\t%.3f\n" % (entity, fold, test_auc, test_f1, test_pre, test_rec)
        logger.log(msg,"test_%s_mutation" % entity)
        fold += 1
        attempts = 0

## Plot results

In [0]:
from matplotlib import pyplot as plt
res_dir="results_colab/"

# A single named figure with one box-plot panel per entity, side by side.
fig = plt.figure("DL4Papers with FastText",figsize=(8, 8))

for panel, entity in enumerate(["gene","drug"], start=1):
    ax = fig.add_subplot(1, 2, panel)
    # Read the tab-separated per-fold test metrics for this entity.
    scores = pd.read_csv("%stest_%s_mutation.log" %(res_dir,entity),sep="\t")
    metric_columns = ["test_sensitivity", "test_precision", "test_F1"]
    ax.boxplot([scores[column] for column in metric_columns])
    ax.set_ylim([0.4,1])
    ax.set_xticks([1,2,3])
    ax.set_xticklabels(["Sensitivity","Precision","F1"])
    ax.set_title("mutation-%s" %entity)
plt.show()