## Installs and Mounting Google Drive

In [8]:
# Install the Hugging Face `transformers` package, which later cells import.
!pip install transformers

# Mount Google Drive at /gdrive; the data and model paths used by later cells
# all point into this mount.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


## Tweakable Parameters

In [9]:
# Torch device on which dataset tensors and models are placed.
device = 'cpu'

# Dataset variant; used to build file names such as `<dataset>_train.txt`.
dataset = 'news'

# Root of the NELA-GT-2020 data on the mounted Drive, and the sub-directory
# holding the per-source JSON files. Both end with '/' because later cells
# build paths by plain string concatenation.
nela_path = '/gdrive/MyDrive/ECE692:MisinformationProject/data/nela-gt-2020/'
dataset_path = 'news/'

# Hugging Face checkpoint name used to load the tokenizer.
bert_pretrained_model = 'bert-base-uncased'


## Collect Files

In [10]:
import os

# Keep only the per-source `<source>.json` files. Later cells recover the
# source name with `filename[:-5]`, so anything that is not a `.json` file
# would produce a bogus source name. Filtering (instead of `list.remove`)
# also keeps this cell from raising when a label file is not present yet.
_label_files = {'labels.csv', 'labels.json'}
filenames = sorted(
    name
    for name in os.listdir(nela_path + dataset_path)
    if name.endswith('.json') and name not in _label_files
)

## Convert Label CSV to JSON dict


In [11]:
%%script echo Skipping this cell.

import csv
import json

# Build a {source name: integer label} mapping from labels.csv and store it
# as labels.json next to the data.
labels = {}
with open(nela_path + dataset_path + 'labels.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # NOTE(review): 'newsweek' is deliberately left out of the mapping;
        # the reason is not recorded in this notebook -- confirm and document.
        if row['source'] == 'newsweek':
            continue
        labels[row['source']] = int(row['label'])

with open(nela_path + dataset_path + 'labels.json', 'w') as f:
    json.dump(labels, f)

Skipping this cell.


## Load Labels

In [12]:
import json

# Load the {source name: label} mapping written by the conversion cell.
with open(nela_path + dataset_path + 'labels.json', 'r') as f:
    labels = json.load(f)

# Source names are the file names without their 5-character '.json' suffix.
# Build the set once instead of once per label.
source_names = {filename[:-5] for filename in filenames}
unlabeled_source_count = sum(filename[:-5] not in labels for filename in filenames)
orphan_label_count = sum(label not in source_names for label in labels)

print(f"Out of {len(filenames)} sources, {unlabeled_source_count} of them do not have labels.")
print(f"Out of {len(labels)} labels, {orphan_label_count} of them do not have any source data.")

Out of 520 sources, 187 of them do not have labels.
Out of 334 labels, 1 of them do not have any source data.


## Determine \# of Samples in Dataset



In [13]:
%%script echo Skipping this cell.

import json
from tqdm.notebook import tqdm
import sys

# Count the usable examples: articles with non-empty content that belong to a
# labelled source. (A leftover debug `print` + `sys.exit(1)` used to abort the
# loop on its first iteration, so nothing was ever counted.)
count = 0
for filename in tqdm(filenames, desc="Determining Number of Samples"):
    # Sources without a label are not part of the dataset.
    if filename[:-5] not in labels:
        continue

    with open(nela_path + dataset_path + filename) as f:
        data = json.load(f)

    count += sum(1 for sample in data if sample["content"])

print(f"Dataset: {dataset} has {count} examples.")

Skipping this cell.


## Rewrite JSON Files: Add Sentence Counts and Replace Content with Its Cleaned Version

In [14]:
import json
from tqdm.notebook import tqdm
import sys
import spacy
from spacy.lang.en import English
import re

# Compiled once instead of on every call. Raw strings keep `\s` from being
# parsed as an (invalid) string escape sequence.
_ARTIFACT_PATTERN = re.compile(
    "|".join([
        r"\s+@",                 # Transformation artifact
        r"@\s+",                 # Transformation artifact
        r"\n",
        r"\t"
    ])
)


def clean_content(content):
    """Return `content` as a single line with '@' artifacts removed.

    A lone newline or tab becomes one space. An '@' together with the
    whitespace directly before or after it is deleted, so "foo @-@ bar"
    becomes "foo-bar".

    Args:
        content: Raw article text.

    Returns:
        The cleaned text.
    """
    def repl(match):
        # Only a bare newline/tab is turned into a space; the '@' artifact
        # alternatives are dropped entirely.
        return ' ' if match.group(0) in {'\n', '\t'} else ''

    return _ARTIFACT_PATTERN.sub(repl, content)

# nlp = spacy.load('en')
nlp = English()  # just the language with no model

# Add the rule-based sentence splitter. spaCy 3 changed `add_pipe` to take
# the registered component name (passing a component object raises there),
# while spaCy 2 requires the object returned by `create_pipe`.
if int(spacy.__version__.split(".")[0]) >= 3:
    sentencizer = nlp.add_pipe("sentencizer")
else:
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

# Clean every non-empty article of 123.json in place, record how many
# sentences the cleaned text has, and write the file back.
target_file = nela_path + dataset_path + '123.json'

with open(target_file) as f:
    data = json.load(f)

count = 0
for sample in data:
    # Articles without content are left untouched and are not counted.
    if sample["content"]:
        cleaned = clean_content(sample['content'])
        sentence_total = sum(1 for _ in nlp(cleaned).sents)

        sample['sentence_length'] = sentence_total
        sample['content'] = cleaned
        count += 1

with open(target_file, 'w') as f:
    json.dump(data, f)

print(f"Dataset: {dataset} has {count} examples.")

Dataset: news has 852 examples.


## Load, Clean, and Write Text

In [15]:
%%script echo Skipping this cell.

import re
import json
import torch
from tqdm.notebook import tqdm

import math

train_split = 0.8
valid_split = 0.2

# Compare with a tolerance: many fraction pairs that sum to 1 on paper
# (e.g. 0.7 + 0.2 + 0.1) do not do so exactly in floating point.
assert math.isclose(train_split + valid_split, 1.0)

# Number of usable examples per dataset, as reported by the counting cell.
dataset_counts = {
    "news": 1_272_211,
    "covid": None,
    "election": None
}

total_count = dataset_counts[dataset]
if total_count is None:
    raise ValueError(
        f"No example count recorded for dataset `{dataset}`; "
        "run the counting cell and add it to `dataset_counts`."
    )

# Fixed seed so the same examples land in the same split on every run.
torch.manual_seed(0)

train_count = int(train_split * total_count)
valid_count = total_count - train_count
train_indices, valid_indices = torch.split(
    torch.randperm(total_count),
    [train_count, valid_count]
)

# Sets give O(1) membership tests while streaming through the examples.
train_indices = set(train_indices.tolist())
valid_indices = set(valid_indices.tolist())


# Compiled once instead of on every call. Raw strings keep `\s` from being
# parsed as an (invalid) string escape sequence.
_ARTIFACT_PATTERN = re.compile(
    "|".join([
        r"\s+@",                 # Transformation artifact
        r"@\s+",                 # Transformation artifact
        r"\n",
        r"\t"
    ])
)


def clean_content(content):
    """Return `content` as a single line with '@' artifacts removed.

    A lone newline or tab becomes one space. An '@' together with the
    whitespace directly before or after it is deleted, so "foo @-@ bar"
    becomes "foo-bar".

    Args:
        content: Raw article text.

    Returns:
        The cleaned text.
    """
    def repl(match):
        # Only a bare newline/tab is turned into a space; the '@' artifact
        # alternatives are dropped entirely.
        return ' ' if match.group(0) in {'\n', '\t'} else ''

    return _ARTIFACT_PATTERN.sub(repl, content)

# Stream every labelled, non-empty article into the train or validation text
# file as one "<label> <cleaned text>" line. The `with` block closes both
# files even if a source file fails to load or a label lookup raises.
with open(nela_path + f"{dataset}_train.txt", "w+") as train_file, \
        open(nela_path + f"{dataset}_valid.txt", "w+") as valid_file:
    # Running example number; it must advance exactly like the counting cell
    # so it lines up with the pre-drawn train/valid index sets.
    index = 0
    for filename in tqdm(filenames, desc="Processing Filenames"):
        if filename[:-5] not in labels:
            continue

        with open(nela_path + dataset_path + filename) as f:
            data = json.load(f)

        for sample in data:
            if not sample["content"]:
                continue

            cleaned_content = clean_content(sample["content"])
            # NOTE(review): assumes sample['source'] matches the file name
            # stem checked above; otherwise this lookup raises KeyError.
            line = f"{labels[sample['source']]} {cleaned_content}\n"
            if index in train_indices:
                train_file.write(line)
            elif index in valid_indices:
                valid_file.write(line)

            index += 1

Skipping this cell.


## Save File Offsets For Faster Data Loading

In [16]:
%%script echo Skipping this cell.

import json
from itertools import count
from tqdm.notebook import tqdm

# For each split, record where every line of the text file starts so that a
# single example can later be read with one seek.
for subset in tqdm(["train", "valid"], desc="Processing subsets"):
    text_path = nela_path + f"{dataset}_{subset}.txt"
    offsets_path = nela_path + f"{dataset}_{subset}_offsets.json"

    offsets = {}
    with open(text_path, 'r') as f:
        for line_number in tqdm(count(), desc=f"Processing {subset}", leave=False):
            # The position must be taken before the line is consumed.
            position = f.tell()
            if f.readline() == '':
                break  # end of file

            offsets[line_number] = position

    # JSON turns the integer line numbers into string keys.
    with open(offsets_path, 'w+') as f:
        json.dump(offsets, f)

Skipping this cell.


## Copy Remote Files to Local VM

In [17]:
from shutil import copyfile
from tqdm.notebook import tqdm
# Text and offset files of both splits, copied from Drive into the working
# directory of the VM.
files = [
    f'{dataset}_train.txt',
    f'{dataset}_train_offsets.json',
    f'{dataset}_valid.txt',
    f'{dataset}_valid_offsets.json'
]

for name in tqdm(files, desc="Copying Remote Files"):
    remote_path = nela_path + name
    local_path = './' + name
    copyfile(remote_path, local_path)

HBox(children=(FloatProgress(value=0.0, description='Copying Remote Files', max=4.0, style=ProgressStyle(descr…




## Dataset Code

In [18]:

import json
from dataclasses import dataclass
from transformers import BertTokenizerFast
import torch

# Shared tokenizer for the checkpoint named in `bert_pretrained_model`; used
# by NelaGT2020.__getitem__ and by the preview cell further down.
tokenizer = BertTokenizerFast.from_pretrained(bert_pretrained_model)

@dataclass
class TokenizedText:
    """Slotted record describing one article and its label.

    Not referenced by the rest of the visible notebook.
    """
    __slots__ = ('id', 'source', 'title', 'content', 'label')

    id: str
    source: str
    title: str
    content: str
    label: int

class Dataset(torch.utils.data.dataset.Dataset):
    """Project-level base class; currently adds nothing to torch's Dataset."""
    pass

class NelaGT(Dataset):
    """Common base for NELA-GT datasets; currently empty."""
    pass

class NelaGT2020(NelaGT):
    """Map-style dataset over one split of the preprocessed NELA-GT-2020 text.

    Each example is one line ``"<label> <text>"`` of ``<dataset>_<subset>.txt``.
    A companion ``<dataset>_<subset>_offsets.json`` maps the (string) line
    number to the position of that line, so an example is read with one seek.
    """

    # Files are read from the working directory (the local copies). The
    # originals live on Drive under
    # '/gdrive/MyDrive/ECE692:MisinformationProject/data/nela-gt-2020/'.
    _nela_path = './'
    _valid_datasets = ['news']
    _valid_subsets = ['train', 'valid']

    def __init__(self, dataset, device, subset):
        """Open the offsets index of one split.

        Args:
            dataset: Dataset name; must be in `_valid_datasets`.
            device: Torch device the returned tensors are moved to.
            subset: Split name; must be in `_valid_subsets`.

        Raises:
            ValueError: If `dataset` or `subset` is not a known name.
        """
        if dataset not in self._valid_datasets:
            raise ValueError(
                f"`{dataset}` is not a valid dataset. "
                f"Valid datasets are: {self._valid_datasets}"
            )

        if subset not in self._valid_subsets:
            raise ValueError(
                f"`{subset}` is not a valid subset. "
                f"Valid subsets are: {self._valid_subsets}"
            )

        # Side effect kept from the original: constructing a dataset resets
        # torch's global RNG seed.
        torch.manual_seed(0)

        self._device = device

        self._filename = self._nela_path + f'{dataset}_{subset}.txt'

        with open(self._nela_path + f'{dataset}_{subset}_offsets.json', 'r') as f:
            self._offsets = json.load(f)

    def __len__(self):
        """Return the number of examples (one per recorded offset)."""
        return len(self._offsets)

    def __getitem__(self, index):
        """Read, parse and tokenize example `index`.

        Returns:
            A pair ``(input_ids, label)``: the token ids produced by the
            module-level `tokenizer` (padded to the maximum length and
            truncated, with a leading batch dimension of 1) and a
            one-element label tensor, both on the configured device.

        Raises:
            KeyError: If `index` has no recorded offset.
        """
        # The offsets file has string keys because it was stored as JSON.
        offset = self._offsets[str(index)]
        with open(self._filename, 'r') as f:
            # Seeking from the start of the file is the default.
            f.seek(offset)
            line = f.readline()

        # Split at the first space rather than at a fixed column so labels
        # with more than one character are parsed correctly as well.
        label_text, _, text = line.partition(' ')
        label = int(label_text)

        return (
            tokenizer(
                text, padding='max_length', truncation=True, return_tensors="pt"
            )['input_ids'].to(self._device),
            torch.as_tensor([label], device=self._device)
        )

#    def _shuffle(self):
#        index_splits = torch.split(
#            torch.randperm(len(self._offsets)), self._batch_size
#        )
#        self._index_batches = [
#            split.tolist()
#            for split in index_splits
#        ]

#    def __iter__(self):
#        self._batch_index = 0
#        self._shuffle()
#        return self

#    def __next__(self):
#        if self._batch_index >= len(self._index_batches):
#            raise StopIteration

#        labels = []
#        lines = []
#        indices = self._index_batches[self._batch_index]
#        with open(self._filename, 'r') as f:
#            for index in indices:
#                f.seek(self._offsets[str(index)], os.SEEK_SET)
#                line = f.readline()
#                labels.append(
#                    int(line[0])
#                )
#                lines.append(line[2:])

#        self._batch_index += 1

#        return (
#            tokenizer(
#                lines, padding=True, truncation=True, return_tensors="pt"
#            )['input_ids'].to(self._device),
#            torch.as_tensor(labels, device=self._device)
#        )

# Build both splits; each one reads `<dataset>_<subset>.txt` and its offsets
# file from the working directory.
train_ds = NelaGT2020(dataset, device, 'train')
valid_ds = NelaGT2020(dataset, device, 'valid')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




## Access the tokens and labels from the dataset

In [19]:
# Show the first example of each split: tensor shape, token ids, label, and
# the ids mapped back to word pieces.
for preview_ds in (train_ds, valid_ds):
    for i in range(len(preview_ds)):
        tokens, label = preview_ds[i]
        print(tokens.size())
        print(tokens)
        print(label)
        print(tokenizer.convert_ids_to_tokens(tokens[0]))
        # Only the first example is inspected.
        break

torch.Size([1, 512])
tensor([[  101,  2062, 21887, 23350,  5843,  7698, 25748,  2651,  1010,  2023,
          2051,  1999, 15842,  1010,  2167,  3792,  1010,  2004, 13337,  2013,
          1996,  2177,  1523,  2000,  3099,  6060,  6201,  1521,  1055,  2994,
          1011,  2188,  2344,  4214,  2009,  1523,  2019, 20454,  2058, 16416,
          2818,  2008,  1055,  2235,  5661,  1010,  1524,  2429,  2000,  1996,
          2739,  1004,  9718,  1012,  2012,  2560,  2028,  1996,  2344,  1516,
          1037,  4868,  1011,  2095,  1011,  2214,  2450,  1010,  2464,  1999,
          1996,  2678,  2917,  1010,  2040,  2409,  2157,  2000, 21614, 21365,
          1010,  1524,  2004,  2016,  2001,  2579,  2185,  1012,  3422,  1024,
         15842,  2003,  1037,  1523,  2512,  1011,  6827,  4023,  1524,  1024,
          2012,  1996,  2051,  1997,  2167,  3792,  1521,  1055,  2165,  3466,
          1999,  2397,  2233,  1010,  2009,  2018,  2525,  2363,  2062,  2084,
          3263,  1010,  2199, 1

## Run through BERT (Currently crashes because of OOM errors)

In [20]:
from transformers import BertForSequenceClassification, BertForMultipleChoice, Trainer, TrainingArguments, EarlyStoppingCallback, TrainerCallback, BertConfig

# Three output classes on top of the configured checkpoint.
config = BertConfig.from_pretrained(bert_pretrained_model)
config.num_labels = 3
# Load the pretrained encoder weights. Constructing the model from `config`
# alone only fixes the architecture and leaves every weight randomly
# initialised, so fine-tuning would start from scratch.
bert = BertForSequenceClassification.from_pretrained(bert_pretrained_model, config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [21]:
from torch.nn import functional as F
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments, EarlyStoppingCallback, TrainerCallback

'''print(ds)
for tokens, labels in ds:
  outputs = bert(tokens)
  loss = F.cross_entropy(outputs.logits, labels)
  print(loss)

'''
# Validation losses recorded by LossCallback, one entry per evaluation pass.
# Defined here because the callback appends to it and no other visible cell
# creates it.
val_loss = []

class LossCallback(TrainerCallback):
  """Trainer callback that appends each evaluation loss to `val_loss`."""

  def on_evaluate(self, args, state, control, metrics, **kwargs):
    # `metrics` is the dictionary produced by the evaluation pass.
    val_loss.append(metrics['eval_loss'])

def collator(samples, pad_token_id=None):
    """Stack dataset items into one model-ready batch.

    Args:
        samples: Sequence of ``(input_ids, label)`` pairs as returned by the
            dataset; every `input_ids` has a leading batch dimension of 1 and
            every `label` holds one element.
        pad_token_id: Id of the padding token. Defaults to the padding id of
            the module-level `tokenizer`.

    Returns:
        A dict with `input_ids`, `attention_mask` and `labels`. The mask is 0
        on padding positions so the model does not attend to the padding that
        fills each sequence up to the maximum length. Code that calls the
        model directly should pass an equivalent mask.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    tokens, labels = zip(*samples)
    input_ids = torch.cat(tokens)
    return {
        'input_ids': input_ids,
        'attention_mask': (input_ids != pad_token_id).long(),
        'labels': torch.cat(labels)
    }


# Hyper-parameters for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory for checkpoints
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end = True,   # restore the best checkpoint when training ends
    evaluation_strategy='epoch',     # evaluate once per epoch
    dataloader_pin_memory=False            # NOTE(review): presumably off because the dataset already moves tensors to `device` -- confirm
)

# NOTE(review): with 3 epochs and one evaluation per epoch, a patience of 4
# looks like it can never trigger early stopping -- confirm the intended value.
trainer = Trainer(
    data_collator=collator,              # batches (input_ids, label) pairs from the dataset
    model=bert,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_ds,    # training dataset
    eval_dataset=valid_ds,     # evaluation dataset
    callbacks=[EarlyStoppingCallback(early_stopping_patience = 4)]
)
# Training is currently disabled; uncomment to run it.
#trainer.train()

In [22]:
import os
import torch
import random
from transformers import BertForSequenceClassification

directory = '/gdrive/MyDrive/ECE692:MisinformationProject/models/results_512/'

# Draw the evaluation subset once, from a seeded generator, so every
# checkpoint is scored on the same examples and the numbers are comparable
# across checkpoints and across runs. The size is capped so a validation set
# with fewer than 50000 examples does not make the sampling fail.
subset_indices = random.Random(0).sample(
    range(len(valid_ds)), min(50000, len(valid_ds))
)
subset = torch.utils.data.Subset(valid_ds, subset_indices)
valid_loader = torch.utils.data.DataLoader(subset, batch_size=1)
total_steps = len(subset_indices)

# The loss module is stateless; build it once instead of once per step.
criterion = torch.nn.CrossEntropyLoss()

for i, filename in enumerate(['checkpoint-16954', 'checkpoint-33908', 'checkpoint-50862', 'checkpoint-67816', 'checkpoint-84770']): #os.listdir(directory):
    file_path = directory+filename
    model = BertForSequenceClassification.from_pretrained(file_path)
    model = model.to(device)
    model.eval()

    print('Epoch', i+1, ':')
    with torch.no_grad():
        correct = 0.0
        loss = 0.0
        for step, (X, y) in enumerate(valid_loader, start=1):
            # The loader adds a batch dimension on top of the dataset's own
            # leading dimension; X[0] / y[0] strip it again.
            outputs = model(X[0])
            logits = outputs[0]

            # loss (running sum; reported as a mean below)
            loss += criterion(logits, y[0]).item()

            # accuracy
            pred = torch.argmax(logits)
            correct += float(pred.item() == y[0].item())

            # Report every 10000 steps and once more on the final step.
            if step % 10000 == 0 or step == total_steps:
                print('Step', step, 'Loss:', loss/step, 'Accuracy:', correct/step)

TypeError: ignored

In [None]:
import os
import torch
from transformers import BertForSequenceClassification

# The Trainer above is configured with output_dir='./results', so that is
# where its checkpoints are looked up.
directory = './results'

# Evaluate checkpoints in training order; a raw directory listing has no
# guaranteed order and may hold entries that are not checkpoints.
checkpoint_names = sorted(
    (
        name for name in os.listdir(directory)
        if name.startswith('checkpoint-') and name.rsplit('-', 1)[1].isdigit()
    ),
    key=lambda name: int(name.rsplit('-', 1)[1]),
)

valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=1)
total_steps = len(valid_loader)

# The loss module is stateless; build it once instead of once per step.
criterion = torch.nn.CrossEntropyLoss()

for i, filename in enumerate(checkpoint_names):
    # Join with a separator; plain concatenation produced paths such as
    # '/resultscheckpoint-...'.
    file_path = os.path.join(directory, filename)
    model = BertForSequenceClassification.from_pretrained(file_path)
    model = model.to(device)
    model.eval()

    print('Epoch', i+1, ':')
    with torch.no_grad():
        correct = 0.0
        loss = 0.0
        for step, (X, y) in enumerate(valid_loader, start=1):
            # The loader adds a batch dimension on top of the dataset's own
            # leading dimension; X[0] / y[0] strip it again.
            outputs = model(X[0])
            logits = outputs[0]

            # loss (running sum, so the report is a mean rather than the loss
            # of whichever example came last)
            loss += criterion(logits, y[0]).item()

            # accuracy
            pred = torch.argmax(logits)
            correct += float(pred.item() == y[0].item())

            # Report every 10000 steps and once more on the final step.
            if step % 10000 == 0 or step == total_steps:
                print('Step', step, 'Loss:', loss/step, 'Accuracy:', correct/step)