In [1]:
import torch
import torch.nn as nn
import numpy as np
from cnn_model import SimilarityCNN
from torch.utils.data import DataLoader
from evaluation import eval_model, train_model
from gensim.models import KeyedVectors
from dataset import SentenceDataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataset import random_split

# Load Model

In [2]:
# Load the word vectors from ./models/Medical.txt, stored in word2vec text
# format (binary=False).
embedding = KeyedVectors.load_word2vec_format('./models/Medical.txt', binary=False)
# Fill value that collate_fn uses when padding sequences: the number of rows in
# embedding.vectors, i.e. one past the last valid row position.
# NOTE(review): presumably SimilarityCNN reserves an extra embedding row for
# this index -- confirm in cnn_model.
padding_num = len(embedding.vectors)
def collate_fn(data):
    """Collate a list of (s0, s1, label) samples into one padded batch.

    Args:
        data: iterable of 3-tuples ``(s0, s1, label)``. ``s0`` and ``s1`` must
            be tensors accepted by ``pad_sequence`` (presumably 1-D tensors of
            token indices -- confirm against SentenceDataset).

    Returns:
        A tuple ``(s0_batch, s1_batch, labels)`` where the two sentence batches
        are padded with ``padding_num`` in batch-first layout, and ``labels``
        is a float tensor of shape ``(len(data), 1)``.
    """
    first_sentences, second_sentences, labels = zip(*data)

    # Each side of the pair is padded on its own, so the two batches may end
    # up with different sequence lengths.
    padded_first, padded_second = (
        pad_sequence(sentences, batch_first=True, padding_value=padding_num)
        for sentences in (first_sentences, second_sentences)
    )

    # One label per row, as a column vector.
    targets = torch.tensor(labels, dtype=torch.float).reshape(len(labels), 1)
    return padded_first, padded_second, targets

# Load Data

In [3]:
# Seed for the train/validation split, so the same examples land in each
# subset on every Restart & Run All.
SPLIT_SEED = 42

medical_vector_dataset = SentenceDataset(embedding)
# 80% of the examples (rounded down) go to training; the rest are held out
# for validation.
split = int(len(medical_vector_dataset)*0.8)
lengths = [split, len(medical_vector_dataset) - split]
# A dedicated generator makes the partition deterministic without reseeding
# the global RNG that the rest of the notebook draws from.
train_dataset, val_dataset = random_split(
    medical_vector_dataset,
    lengths,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
def load_data(train_dataset, val_dataset, batch_size=32):
    """Wrap the training and validation datasets in DataLoaders.

    Args:
        train_dataset: dataset iterated during training; reshuffled each epoch.
        val_dataset: dataset iterated during evaluation; kept in a fixed order.
        batch_size: number of samples per batch for both loaders. Defaults to
            32, the value that was previously hard-coded.

    Returns:
        A tuple ``(train_loader, val_loader)``. Both loaders batch their
        samples with ``collate_fn``.
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    # No shuffling for validation: the order does not need to vary between runs.
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return train_loader, val_loader


# Build the batched training and validation loaders used by the training and
# evaluation calls below.
train_loader, val_loader = load_data(train_dataset, val_dataset)

# Train and Evaluate

In [4]:
# Build the similarity model from the loaded word vectors.
model = SimilarityCNN(embedding)
# Number of epochs passed to train_model.
n_epochs = 4
# Loss function: binary cross-entropy.
# NOTE(review): nn.BCELoss expects inputs in [0, 1], so the model presumably
# ends in a sigmoid -- confirm in cnn_model.
criterion = nn.BCELoss()
# Optimizer: Adam over all model parameters with a learning rate of 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
# Train on the training loader; the name `model` is rebound to whatever
# train_model returns.
model = train_model(model, train_loader, n_epoch=n_epochs, optimizer=optimizer, criterion=criterion)
# Evaluate on the held-out validation loader; eval_model returns five values.
# NOTE(review): acc/p/r/f presumably mean accuracy, precision, recall and
# F-score -- confirm in evaluation.py.
acc, p, r, f, roc_auc = eval_model(model, val_loader)
# NOTE(review): `r` is unpacked above but is not included in the printed summary.
print(f'Validation acc: {acc}, p:{p}, f:{f}, roc_auc:{roc_auc}')

Validation acc: 0.9510416666666667, p:0.9549436795994993, f:0.8665530948324816, roc_auc:0.9392040856345182
