In [1]:
# Attach Google Drive to this Colab runtime so files stored there can be read
# and written through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Root of the project checkout on the mounted Drive. Keep the trailing slash:
# later cells build data/checkpoint paths by plain string concatenation.
root = '/content/drive/MyDrive/fourthbrain-capstone/nlp4ifchallenge/'
import sys
# Put the checkout on the import path so `nlp4ifchallenge` can be imported.
# The membership guard keeps re-runs of this cell from stacking duplicate
# entries onto sys.path.
if root not in sys.path:
  sys.path.append(root)

In [None]:
!pip install transformers

In [5]:
from typing import List, Tuple, Callable, TypeVar, Any, overload, Dict
from typing import Optional as Maybe

import torch
from torch import Tensor, LongTensor
from torch import load, cat, stack, save, no_grad, manual_seed
from torch.cuda import empty_cache

from nlp4ifchallenge import types
from nlp4ifchallenge.scripts import train_bert, train_aggregator
from nlp4ifchallenge.models import bert, aggregation

from transformers.utils import logging
logging.set_verbosity("CRITICAL")

from math import ceil

In [None]:
# Example input: this tweet is expected to be classified as containing misinformation.
example_text = (
    "President Trump's comments about the coronavirus death rate were 100% correct. "
    "The media falsely claimed he was spreading misinformation. "
    "They falsely reported that his comments weren't in line with top health officials. "
    "That was 100% fake news."
)
tweets = [types.Tweet(0, example_text)]

# Retrain the 8 individual BERT models and save them as checkpoints

In [None]:
# Identifiers of the individual BERT variants to fine-tune, one run per name.
names = [
    'vinai-covid', 'vinai-tweet', 'cardiffnlp-tweet', 'cardiffnlp-hate',
    'del-covid', 'cardiffnlp-irony', 'cardiffnlp-offensive', 'cardiffnlp-emotion',
]

# Run configuration. These stay as notebook-level variables because later
# cells read them (device, batch_size, ignore_nan, ...).
device = "cuda"
train_path = f'{root}data/english/covid19_disinfo_binary_english_train_old.tsv'
dev_path = f'{root}data/english/covid19_disinfo_binary_english_dev_input.tsv'
test_path = ""  # presumably "no test set" — confirm in train_bert.main
batch_size = 16
early_stopping = 0
num_epochs = 3
save_path = f'{root}data/checkpoints'
with_class_weights = False
ignore_nan = False

# Keyword arguments shared by every training run; only `name` changes.
bert_train_kwargs = dict(
    device=device,
    print_log=True,
    train_path=train_path,
    dev_path=dev_path,
    test_path=test_path,
    batch_size=batch_size,
    early_stopping=early_stopping,
    num_epochs=num_epochs,
    save_path=save_path,
    with_class_weights=with_class_weights,
    ignore_nan=ignore_nan,
)

for name in names:
  train_bert.main(name=name, **bert_train_kwargs)

### Test the individual models on the example tweet

In [None]:
# Sanity check: restore each fine-tuned checkpoint and print its prediction
# for the example tweet.
load_target = torch.device(device)
for name in names:
  checkpoint_file = f'{root}data/checkpoints/{name}-english/model.p'
  model = bert.make_model(name=name, ignore_nan=ignore_nan)
  checkpoint = torch.load(checkpoint_file, map_location=load_target)
  # Only the model weights are restored here. NOTE(review): the checkpoint may
  # also hold 'optimizer_state_dict', 'epoch' and 'loss' entries — confirm
  # against train_bert before relying on them.
  model.load_state_dict(checkpoint['model_state_dict'])
  model.eval()
  print(name, model.predict(tweets))

# Retrain the ensemble model: the version that aggregates all 8 individual BERT models

In [None]:
# Settings for training the aggregator that sits on top of the individual
# BERT models. They stay as notebook-level variables because the next cell
# reuses several of them (batch_size, device, model_dir, hidden_size, dropout).
# NOTE(review): model_names is the *number* of models rendered as a string,
# whereas get_scores is later given the list of names — confirm this is the
# format train_aggregator.main expects.
model_names = str(len(names))
train_path = f'{root}data/english/covid19_disinfo_binary_english_train_old.tsv'
dev_path = f'{root}data/english/covid19_disinfo_binary_english_dev_input.tsv'
test_path = ""
device = "cuda"
batch_size = 16
dropout = 0.25
num_epochs = 3
model_dir = f'{root}data/checkpoints'
hidden_size = 12
lr = 1e-2
wd = 1e-2
load_stored = False

aggregator_train_kwargs = dict(
    model_names=model_names,
    train_path=train_path,
    dev_path=dev_path,
    test_path=test_path,
    device=device,
    batch_size=batch_size,
    dropout=dropout,
    num_epochs=num_epochs,
    model_dir=model_dir,
    hidden_size=hidden_size,
    lr=lr,
    wd=wd,
    load_stored=load_stored,
    print_log=True,
)
train_aggregator.main(**aggregator_train_kwargs)

### Test the ensemble model on the example tweet

In [None]:
# Score the example tweet with every individual model. A single dataset goes
# in, so exactly one tensor of scores is unpacked from the result.
[test_inputs] = train_aggregator.get_scores(
    model_names=names,
    datasets=[tweets],
    batch_size=batch_size,
    device=device,
    model_dir=model_dir,
    data_tag="english")
# votes = train_aggregator.aggregate_votes(test_inputs)
# votes

# Rebuild the aggregator with the hyper-parameters it was trained with. The
# number of input models is derived from `names` rather than hard-coded so
# it cannot drift from the list passed to get_scores above.
aggregator = train_aggregator.MetaClassifier(
    num_models=len(names), hidden_size=hidden_size, dropout=dropout).to(device)
# map_location lets the checkpoint load on whichever device is configured,
# not only on the device it was saved from.
aggregator.load_state_dict(
    torch.load(model_dir + '/aggregator-english/model.p',
               map_location=torch.device(device)))
# Inference mode: the module is built with dropout, which must be switched
# off so predictions are deterministic (the BERT test cell does the same).
aggregator.eval()
outs = aggregator.predict(test_inputs.to(device))
outs

# Excerpt of the FastAPI code: use this to debug API results

In [None]:
# NOTE: every line of this cell is intentionally commented out. It is an
# excerpt of the FastAPI code kept for debugging API results; uncomment the
# whole cell to run it. Unlike the training cells above, it targets the CPU
# and keeps all models loaded in a MODELS dict instead of reloading them.
# MODELS_DIR = root+'data/checkpoints/'
# device = "cpu"
# batch_size = 16
# ignore_nan = False
# hidden_size = 12
# dropout = 0.25

# MODELS = {'vinai-covid': None,
#           'vinai-tweet': None,
#           'cardiffnlp-tweet': None,
#           'cardiffnlp-hate': None,
#           'del-covid': None,
#           'cardiffnlp-irony': None,
#           'cardiffnlp-offensive': None,
#           'cardiffnlp-emotion': None}

# for name in MODELS:
#     model = bert.make_model(name=name, ignore_nan=ignore_nan)
#     checkpoint = torch.load(MODELS_DIR+name+'-english/model.p', map_location=torch.device(device))
#     model.load_state_dict(checkpoint['model_state_dict'])
#     model.eval()
#     MODELS[name] = model


# def get_scores(model_names: List[str], datasets: List[List[types.Tweet]], batch_size: int, device: str,
#                model_dir: str, data_tag: str) -> List[Tensor]:
#     """
#        :returns num_dataset tensors of shape B x M x Q
#     """
#     outs = []
#     for name in model_names:
#         this_model_outs = []
#         model = MODELS[name]
#         for dataset in datasets:
#             this_dataset_outs = []
#             nbatches = ceil(len(dataset) / batch_size)
#             for batch_idx in range(nbatches):
#                 start, end = batch_idx * batch_size, min(len(dataset), (batch_idx + 1) * batch_size)
#                 this_dataset_outs.append(model.predict_scores(dataset[start:end]).cpu())
#             this_model_outs.append(cat(this_dataset_outs, dim=0))
#         outs.append(this_model_outs)
#         empty_cache()
#     return [stack(x, dim=1) for x in zip(*outs)]


# tweets = ["President Trump's comments about the coronavirus death rate were 100% correct. The media falsely claimed he was spreading misinformation. They falsely reported that his comments weren't in line with top health officials. That was 100% fake news."]

# ins = []
# for i, t in enumerate(tweets):
#     ins.append(types.Tweet(i, t))
# # ins = [types.Tweet(0, tweets)]

# predictions = {}
# for name in MODELS:
#     model = MODELS[name]
#     predictions[name] = model.predict(ins)

# [test_inputs] = get_scores(model_names=MODELS.keys(), datasets=[ins],
#                 batch_size=batch_size, device=device, model_dir=MODELS_DIR, data_tag="english")
# aggregator = train_aggregator.MetaClassifier(num_models=8, hidden_size=hidden_size, dropout=dropout).to(device)
# aggregator.load_state_dict(torch.load(MODELS_DIR+'/aggregator-english/model.p', map_location=torch.device(device)))
# predictions['aggregator'] = aggregator.predict(test_inputs.to(device))
# predictions