In [1]:
import csv
import os
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [14, 8]

import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import trange, tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, pipeline

from custom_classes import ContrastSampler, DataManager, Trainer, TrainerA, WeightedCosineSimilarityLoss, ContrastLoss2
from custom_classes import INT2LABEL as categories

Loaded WeightedCosineSimilarityLoss...
Loaded ContrastSampler...
Loaded Trainer...
Loaded TrainerA...
Loaded TrainerB...
Loaded DataManager...


In [2]:
# Use the GPU when PyTorch can see one, otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays which device was selected.
device

device(type='cuda')

In [3]:
# Notebook-wide configuration. Everything a reader might tune lives in this cell.

# Data directory handed to DataManager, and the Hugging Face checkpoint used
# for both the tokenizer and the base (untrained) encoder.
data_path = '../input/semeval-data'
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
# Forwarded to DataManager as `use_dev`.
# NOTE(review): presumably adds the dev split to the loaded data - confirm in custom_classes.
DEV = True
# Width of the classification head's output layer; also passed to the
# contrastive loss and to the trainer as `n_classes`.
N_CLASSES = 14

# Epoch counts for the three training phases, run in this order:
# head only -> joint fine-tuning -> head only.
N_FINETUNE_EPOCHS = 50
N_EPOCHS_BEFORE_FINETUNE = 50
N_EPOCHS_POST_FINETUNE = 50

# Batch sizes: MODEL_BATCH_SIZE is used for the trainer's model batches and as
# the default batch size of the token-embedding helpers; HEAD_BATCH_SIZE is
# used for the trainer's head batches.
MODEL_BATCH_SIZE = 26
HEAD_BATCH_SIZE = 200
# Passed to the trainer as `model_loader_type` / `min_samples_from_class`.
MODEL_SAMPLER = 'contrast'
MIN_SAMPLES_FROM_CLASS = 1
# Optimiser settings passed to the trainer as head_lr / head_gamma /
# model_lr / model_gamma.
# NOTE(review): the *_GAMMA values look like LR-scheduler decay factors - confirm.
HEAD_LR = 1e-3
HEAD_GAMMA = .99
MODEL_LR = 2e-5
MODEL_GAMMA = .98
# Passed to the trainer as `beta`; its meaning is defined in custom_classes.
BETA = 0.01

# Validation / checkpoint cadence passed to the trainer.
# NOTE(review): -1 presumably disables periodic validation, and an
# EARLIEST_CHECKPOINT of 1000 exceeds the 150 epochs run in this notebook, so
# presumably no checkpoint is ever written - confirm against TrainerA.
VALIDATE_EVERY = -1
CHECKPOINT_EVERY = 10
EARLIEST_CHECKPOINT = 1000

In [4]:
# Build the tokenizer and the project DataManager for a single language, then
# materialise the three datasets the trainer consumes.
lang = 'en'
tokenizer = AutoTokenizer.from_pretrained(model_name)
datamanager = DataManager(
    tokenizer=tokenizer,
    data_dir=data_path,
    use_dev=DEV,
    # No languages are requested for head evaluation in this notebook.
    languages_for_head_eval=[],
    languages_for_head_train=[lang],
    languages_for_contrastive=[lang],
)
# Used below as the trainer's model_dataset / head_dataset / eval_dataset
# respectively; dataset_contrastive is also the input of every embedding dump.
dataset_contrastive = datamanager.get_contrastive_dataset()
dataset_head_train = datamanager.get_head_train_dataset()
dataset_head_eval = datamanager.get_head_eval_dataset()

In [5]:
# Base encoder straight from the hub checkpoint (no task-specific training yet).
model = AutoModel.from_pretrained(model_name)
# Size of the encoder's word-embedding vectors, used as the head's input width.
# assumes this equals the width of the embeddings the trainer feeds to the head - TODO confirm
EMBEDDING_DIM = model.embeddings.word_embeddings.embedding_dim
# Three-layer MLP classification head producing N_CLASSES logits
# (paired with BCEWithLogitsLoss below).
# NOTE(review): the final nn.Dropout() is applied directly to the output
# logits, so individual class logits are randomly zeroed during training;
# this is unusual - confirm it is intentional.
head = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, 256),
    nn.Dropout(),
    nn.ReLU(),
    nn.Linear(256, 256),
    nn.Dropout(),
    nn.ReLU(),
    nn.Linear(256, N_CLASSES),
    nn.Dropout(),
)
# Trainer wrapping the *untrained* encoder. In this notebook it is only used
# to dump baseline embeddings; no train_* method is called on it before
# `trainer` is rebound to the pretrained checkpoint.
trainer = TrainerA(
    model=model,
    head=head,
    device=device,
    head_loss=nn.BCEWithLogitsLoss(),
    model_loss=WeightedCosineSimilarityLoss(N_CLASSES),
    model_dataset=dataset_contrastive,
    head_dataset=dataset_head_train,
    eval_dataset=dataset_head_eval,        
    n_classes=N_CLASSES,
    model_loader_type=MODEL_SAMPLER,
    train_head_batch_size=HEAD_BATCH_SIZE,
    train_model_batch_size=MODEL_BATCH_SIZE,
    head_lr=HEAD_LR,
    model_lr=MODEL_LR,
    head_gamma=HEAD_GAMMA,
    model_gamma=MODEL_GAMMA,
    beta=BETA,
    min_samples_from_class=MIN_SAMPLES_FROM_CLASS,
    validate_every_n_epochs=VALIDATE_EVERY,
    checkpoint_every_n_epochs=CHECKPOINT_EVERY,
    earliest_checkpoint=EARLIEST_CHECKPOINT
)

In [21]:
@torch.no_grad()
def generate_token_embeddings(trainer, dataset, batch_size=MODEL_BATCH_SIZE):
    """Run the trainer's encoder over ``dataset`` and collect its per-batch outputs.

    Args:
        trainer: Object exposing the encoder as ``trainer.bert``. The encoder
            is switched to eval mode and left that way.
        dataset: Dataset whose collated batches are mappings holding exactly
            three tensors, in the order (input ids, attention mask, labels).
        batch_size: Number of samples per forward pass.

    Returns:
        A CPU tensor: element 0 of the encoder output for every batch,
        concatenated along dim 0.
        NOTE(review): for a Hugging Face encoder, element 0 is presumably the
        last hidden state, i.e. (samples, tokens, hidden) - confirm.

    Raises:
        RuntimeError: from ``torch.cat`` if ``dataset`` is empty or if batches
            differ in their non-batch dimensions (e.g. unequal padding).
    """
    trainer.bert.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size)
    embeddings = []
    for batch in dataloader:
        # Relies on the mapping's value order; the labels are not needed for
        # a forward pass, so they are never moved to the device.
        input_ids, attention_mask, _labels = batch.values()
        model_output = trainer.bert(
            input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        # Offload each batch immediately: per-token outputs for a whole
        # dataset are large, and keeping them all on the accelerator until
        # the final concatenation can exhaust GPU memory.
        embeddings.append(model_output[0].cpu())
    return torch.cat(embeddings)

def write_token_embeddings(trainer, dataset, name, batch_size=MODEL_BATCH_SIZE):
    """Compute token embeddings for ``dataset`` and pickle them to the file ``name``.

    The tensor returned by ``generate_token_embeddings`` is moved to the CPU,
    converted to a NumPy array and written with ``pickle`` (binary mode,
    overwriting any existing file).
    """
    tokens = generate_token_embeddings(trainer, dataset, batch_size)
    with open(name, 'wb') as sink:
        as_numpy = tokens.cpu().numpy()
        pickle.dump(as_numpy, sink)

# Untrained Embedding Generation

In [7]:
# Baseline: per-token outputs of the hub checkpoint before any training in
# this notebook, computed over the contrastive dataset.
write_token_embeddings(trainer, dataset_contrastive, 'token_embeddings_untrained.pickle')

In [8]:
# Embeddings of the contrastive dataset from the untrained encoder, as
# computed by the trainer. Only the first tensor of the returned dataset is
# saved, as a NumPy array.
# NOTE(review): presumably compute_embeddings returns a TensorDataset whose
# first tensor holds the embeddings - confirm in custom_classes.
embeddings_untrained_dataset = trainer.compute_embeddings(dataset_contrastive)
with open('embeddings_untrained_en_train_dev.pickle', 'wb') as f:
    pickle.dump(embeddings_untrained_dataset.tensors[0].cpu().numpy(), f)

# Pretrained Embedding Generation

In [9]:
# Credentials must never be committed in a notebook: the Hugging Face token is
# read from the environment instead. When neither variable is set this is
# None, and `from_pretrained` falls back to locally cached credentials (or
# anonymous access).
# SECURITY: the token that used to be hardcoded in this cell has been exposed
# and should be revoked in the Hugging Face account settings.
access_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
# Encoder checkpoint that was already contrastively pretrained; it replaces
# the base model.
model = AutoModel.from_pretrained('lambdasonly/miniLM-L12-v2-WCSL-multilang-pretrained', use_auth_token=access_token)
# Rebind `trainer` to the pretrained encoder. The same `head` instance as
# before is reused; it has not been trained at this point because no train_*
# method was called on the previous trainer.
trainer = TrainerA(
    model=model,
    head=head,
    device=device,
    head_loss=nn.BCEWithLogitsLoss(),
    model_loss=WeightedCosineSimilarityLoss(N_CLASSES),
    model_dataset=dataset_contrastive,
    head_dataset=dataset_head_train,
    eval_dataset=dataset_head_eval,
    n_classes=N_CLASSES,
    model_loader_type=MODEL_SAMPLER,
    train_head_batch_size=HEAD_BATCH_SIZE,
    train_model_batch_size=MODEL_BATCH_SIZE,
    head_lr=HEAD_LR,
    model_lr=MODEL_LR,
    head_gamma=HEAD_GAMMA,
    model_gamma=MODEL_GAMMA,
    beta=BETA,
    min_samples_from_class=MIN_SAMPLES_FROM_CLASS,
    validate_every_n_epochs=VALIDATE_EVERY,
    checkpoint_every_n_epochs=CHECKPOINT_EVERY,
    earliest_checkpoint=EARLIEST_CHECKPOINT
)

In [10]:
# Per-token outputs of the pretrained checkpoint, before the fine-tuning done
# later in this notebook.
write_token_embeddings(trainer, dataset_contrastive, 'token_embeddings_pretrained.pickle')

In [11]:
# Embeddings of the contrastive dataset from the pretrained encoder; the
# first tensor of the trainer's result is saved as a NumPy array.
embeddings_pretrained_dataset = trainer.compute_embeddings(dataset_contrastive)
with open('embeddings_pretrained_en_train_dev.pickle', 'wb') as f:
    pickle.dump(embeddings_pretrained_dataset.tensors[0].cpu().numpy(), f)

# Trained Embedding Generation

In [12]:
# Three-phase schedule on the pretrained encoder:
#   1. train the head alone,
#   2. joint training (NOTE(review): presumably encoder + head together - confirm in TrainerA),
#   3. train the head alone again.
trainer.train_head(N_EPOCHS_BEFORE_FINETUNE)
trainer.train_joint(N_FINETUNE_EPOCHS)
trainer.train_head(N_EPOCHS_POST_FINETUNE)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

In [13]:
# Sanity check
dataset_sanity = datamanager._get_single_named_dataset(lang, dev=True)
dataset_sanity = datamanager._preprocess_head_dataset(dataset_sanity)
embeddings = trainer.compute_embeddings(dataset_sanity)
predictions = trainer.predict(embeddings.tensors[0], 'cpu')
f1 = f1_score(dataset_sanity['labels'], predictions, average='micro')
print('  ', lang, ': ', f1)

   en :  0.9988109393579072


In [22]:
# Per-token outputs of the encoder after the training schedule above has run.
write_token_embeddings(trainer, dataset_contrastive, 'token_embeddings_trained.pickle')

In [15]:
# Embeddings of the contrastive dataset from the trained encoder; the first
# tensor of the trainer's result is saved as a NumPy array.
embeddings_trained_dataset = trainer.compute_embeddings(dataset_contrastive)
with open('embeddings_trained_en_train_dev.pickle', 'wb') as f:
    pickle.dump(embeddings_trained_dataset.tensors[0].cpu().numpy(), f)