In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tabulate import tabulate

!pip install datasets

from tqdm.notebook import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig



In [None]:
# Load the train / validation / test splits. The files are tab-separated and
# read with header=None, so columns are addressed by integer position (0..13).
df_train = pd.read_csv("/content/train.tsv", sep="\t", header=None)
df_valid = pd.read_csv("/content/valid.tsv", sep="\t", header=None)
df_test = pd.read_csv("/content/test.tsv", sep="\t", header=None)

In [None]:
# Preview the first rows of the raw training split to inspect the column layout.
df_train.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [None]:
from datasets import Dataset
from datasets import DatasetDict

def preprocessing_fn1(df):
  """Turn a raw split into a binary-labelled text dataset.

  The frame is expected to use integer column labels (it is read with
  ``header=None``). Columns used here:
    1  -> verdict string; "true", "mostly-true" and "half-true" map to
          label 1, every other value maps to label 0
    2  -> the statement text
    3  -> subject(s), 4 -> speaker, 5 -> job, 6 -> state,
    7  -> affiliation, 13 -> context
  Column 0 (id) and columns 8-12 (counts) are ignored.

  The metadata fields are joined with single spaces in the order
  subject, speaker, job, state, affiliation, context and placed in front of
  the statement. Missing values become empty strings, so an empty field
  still contributes its separating space.

  Args:
    df: pandas DataFrame holding one raw split. It is not modified.

  Returns:
    A Hugging Face ``datasets.Dataset`` with the features ``label`` (int)
    and ``sentence`` (str), one row per input row.
  """
  # Imported locally on purpose: the notebook also imports
  # torch.utils.data.Dataset under the bare name `Dataset`, so relying on the
  # global name makes this function break depending on cell execution order.
  from datasets import Dataset as HFDataset

  truthful_verdicts = {"true", "mostly-true", "half-true"}
  labels = [1 if verdict in truthful_verdicts else 0 for verdict in df[1]]

  # Subject comes from column 3 (column 4 is the speaker). Selecting a copy of
  # only the needed columns keeps the caller's frame untouched.
  text_columns = [3, 4, 5, 6, 7, 13, 2]
  fields = df[text_columns].fillna('').astype(str)

  # Positional iteration: does not assume the frame has a 0..n-1 index.
  sentences = [' '.join(row) for row in fields.itertuples(index=False, name=None)]

  return HFDataset.from_dict({'label': labels, 'sentence': sentences})


In [None]:
# Convert each raw split into a dataset with 'label' and 'sentence' features.
dataset_train = preprocessing_fn1(df_train)
dataset_test = preprocessing_fn1(df_test)
dataset_valid = preprocessing_fn1(df_valid)

In [None]:
# Sanity check: show the features and row count of every split.
print(dataset_train)
print(dataset_test)
print(dataset_valid)

Dataset({
    features: ['label', 'sentence'],
    num_rows: 10240
})
Dataset({
    features: ['label', 'sentence'],
    num_rows: 1267
})
Dataset({
    features: ['label', 'sentence'],
    num_rows: 1284
})


In [None]:
from torch.utils.data import Dataset

# Load the pretrained roberta-base tokenizer and a sequence-classification
# model with two output labels. The classifier head weights are newly
# initialized (see the load warning), so the model must be fine-tuned before
# its predictions mean anything.
# NOTE(review): do_lower_case is presumably ignored by the RoBERTa tokenizer;
# the actual lowercasing in this notebook happens in preprocess() — confirm.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case = True)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import re

def preprocess(text):
    """Normalize one piece of text before tokenization.

    The text is lowercased, then @mentions, #hashtags and http(s) URLs are
    removed, and finally every comma is dropped. Whitespace around removed
    tokens is left as is. An empty string is returned unchanged.
    """
    if text == '':
        return ''

    cleaned = text.lower()
    # Applied in order: mentions, hashtags, URLs.
    for pattern in (r'@[A-Za-z0-9_]+', r'#[A-Za-z0-9_]+', r'https?:\/\/\S*'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.replace(',', '')


In [None]:
def create_sentence_embeddings(sentences):
    """Tokenize sentences into a tensor of token ids.

    Each sentence is cleaned with preprocess() and encoded with the
    module-level tokenizer: special tokens are added, and every sequence is
    truncated or padded to exactly 200 tokens. No attention mask is produced.

    Args:
        sentences: iterable of strings.

    Returns:
        A torch tensor built from the per-sentence lists of token ids, one
        row of 200 ids per sentence.
    """
    token_id_rows = [
        tokenizer.encode_plus(
            preprocess(sentence),
            add_special_tokens=True,
            max_length=200,
            padding="max_length",
            return_attention_mask=False,
            truncation=True,
        )['input_ids']
        for sentence in sentences
    ]
    return torch.tensor(token_id_rows)

In [None]:
# Report whether PyTorch can see a CUDA device (prints True or False).
print(torch.cuda.is_available())

True


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model's parameters to the selected device.
model = model.to(device)

In [None]:
from tqdm import tqdm_notebook
from torch.utils.data import TensorDataset
from sklearn.metrics import f1_score, balanced_accuracy_score

# Cross-entropy loss applied to the model's output and the integer labels.
loss_function = nn.CrossEntropyLoss()
# NOTE(review): in the logged run below, balanced accuracy stays near 0.5
# with this learning rate — TODO confirm 1e-6 is intended.
learning_rate = 1e-6
# The optimizer is bound to the parameters of the global `model`; train()
# reads this global optimizer rather than building its own.
optimizer = torch.optim.RAdam(params =  model.parameters(), lr=learning_rate, weight_decay = 1e-5)


def _evaluate_on_loader(model, loader, device):
  """Return (accuracy in %, F1, balanced accuracy) of `model` over `loader`.

  The pass runs in eval mode with gradient tracking disabled; the model's
  previous train/eval mode is restored afterwards. Returns NaN for all three
  metrics when the loader yields no examples.
  """
  was_training = model.training
  model.eval()
  predictions, targets = [], []
  try:
    with torch.no_grad():
      for sent, label in loader:
        logits = model(sent.to(device))[0]
        predictions.extend(logits.argmax(dim=1).cpu().tolist())
        targets.extend(label.tolist())
  finally:
    model.train(was_training)

  if not targets:
    nan = float('nan')
    return nan, nan, nan

  correct = sum(1 for p, t in zip(predictions, targets) if p == t)
  accuracy = 100.00 * correct / len(targets)
  f1 = f1_score(targets, predictions)
  balanced_acc = balanced_accuracy_score(targets, predictions)
  return accuracy, f1, balanced_acc


def train(model, dataset_train, dataset_valid, batch_size = 32, max_epochs = 10, eval_every = 100):
  """Fine-tune `model` and periodically report validation metrics.

  Uses the module-level `optimizer` and `loss_function`; the optimizer must
  therefore have been built from the parameters of the model passed in.

  NOTE(review): no attention_mask is passed to the model, so padded
  positions are attended to (the library warns about this in the logged
  run). This is left as is because the evaluation function in this notebook
  feeds the model the same way; change both together.

  Args:
    model: sequence-classification model whose first output is the logits.
    dataset_train: dataset with 'sentence' and 'label' columns for training.
    dataset_valid: dataset with the same columns, used for validation.
    batch_size: number of examples per batch for both loaders.
    max_epochs: number of passes over the training data.
    eval_every: run validation every this many training iterations
      (iteration 0 of each epoch included).

  Returns:
    The same model object, after training.
  """
  # create_sentence_embeddings already returns a tensor; wrapping it in
  # torch.tensor() again only copies it and triggers a UserWarning.
  input_ids_train = create_sentence_embeddings(dataset_train['sentence'])
  input_ids_valid = create_sentence_embeddings(dataset_valid['sentence'])

  labels_train = torch.tensor(dataset_train['label'], dtype=torch.long)
  labels_valid = torch.tensor(dataset_valid['label'], dtype=torch.long)

  train_dataset = TensorDataset(input_ids_train, labels_train)
  valid_dataset = TensorDataset(input_ids_valid, labels_valid)

  # Shuffle training batches each epoch; validation order does not matter.
  train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
  valid_loader = DataLoader(valid_dataset, batch_size = batch_size)

  # Follow the model's own device instead of assuming it lives on the GPU.
  device = next(model.parameters()).device

  model.train()
  for epoch in tqdm(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(train_loader):
      sent = sent.to(device)
      label = label.to(device)

      optimizer.zero_grad()
      output = model(sent)[0]
      loss = loss_function(output, label)
      loss.backward()
      optimizer.step()

      if i % eval_every == 0:
        # Validation must not run with dropout active or build autograd
        # graphs; the helper switches to eval mode and back.
        accuracy, f1, balanced_acc = _evaluate_on_loader(model, valid_loader, device)
        print('Iteration: {}. Loss: {}. Accuracy: {}%. F1 Score: {}. BACC: {}'.format(i, loss.item(), accuracy, f1, balanced_acc))
  return model


In [None]:
import torch
# Ask PyTorch to release its cached, currently unused GPU memory before training.
torch.cuda.empty_cache()

In [None]:
# Fine-tune on the training split, validating on the validation split.
# train() returns the model object it was given.
trained_model = train(model, dataset_train, dataset_valid)

  input_ids_train = torch.tensor(input_ids_train)
  labels_train = torch.tensor(labels_train)
  input_ids_test = torch.tensor(input_ids_dev)
  labels_test = torch.tensor(labels_dev)
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(max_epochs)):


  0%|          | 0/10 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


EPOCH -- 0
Iteration: 0. Loss: 0.695770263671875. Accuracy: 53.7490134175217%. F1 Score: 0.6846071044133477. BACC: 0.4860653122008297%
Iteration: 100. Loss: 0.6748214364051819. Accuracy: 55.722178374112076%. F1 Score: 0.709476954945624. BACC: 0.4986792185228522%
Iteration: 200. Loss: 0.6749122738838196. Accuracy: 56.195737963693766%. F1 Score: 0.7192716236722307. BACC: 0.4988033187958728%
Iteration: 300. Loss: 0.695040225982666. Accuracy: 56.353591160220994%. F1 Score: 0.7202832574607991. BACC: 0.5004077580399249%
EPOCH -- 1
Iteration: 0. Loss: 0.6602306365966797. Accuracy: 56.59037095501184%. F1 Score: 0.7213779128672746. BACC: 0.5031202354359465%
Iteration: 100. Loss: 0.6713263988494873. Accuracy: 56.59037095501184%. F1 Score: 0.7196738022426096. BACC: 0.504343509555721%
Iteration: 200. Loss: 0.6390042304992676. Accuracy: 56.353591160220994%. F1 Score: 0.720565942395149. BACC: 0.5002038790199624%
Iteration: 300. Loss: 0.696172833442688. Accuracy: 56.51144435674822%. F1 Score: 0.71670

In [None]:
def save_model(model, filename="roberta_misinformation_10epochs.pth"):
    """Save the model's weights (its state_dict) to disk.

    Args:
        model: a torch module; only its state_dict is written, not the
            module object itself.
        filename: destination path. Defaults to the name previously
            hard-coded here, so existing calls keep writing the same file.
    """
    torch.save(model.state_dict(), filename)

# Persist the fine-tuned weights so they can be reloaded without retraining.
save_model(trained_model)

In [None]:
def test(trained_model, data, batch_size = 32):
  """Evaluate `trained_model` on `data` and print accuracy, F1 and balanced accuracy.

  NOTE(review): no attention_mask is passed to the model, so padded
  positions are attended to. This is left as is because the training
  function in this notebook feeds the model the same way; change both
  together.

  Args:
    trained_model: sequence-classification model whose first output is the
      logits. This is the model that gets evaluated.
    data: dataset with 'sentence' and 'label' columns.
    batch_size: number of examples per evaluation batch.

  Returns:
    None. Results are printed; accuracy is a percentage, while F1 and
    balanced accuracy are fractions in [0, 1].
  """
  # create_sentence_embeddings already returns a tensor; no re-wrapping needed.
  input_ids = create_sentence_embeddings(data['sentence'])
  labels = torch.tensor(data['label'], dtype=torch.long)

  loader = DataLoader(TensorDataset(input_ids, labels), batch_size=batch_size)

  # Evaluate the model that was passed in, on whatever device it lives on.
  device = next(trained_model.parameters()).device
  trained_model.eval()

  all_predicted = []
  all_labels = []

  with torch.no_grad():
    for sent, label in loader:
      # Batches are already (batch, seq_len); squeezing would drop the batch
      # dimension whenever a batch happens to contain a single example.
      output = trained_model(sent.to(device))[0]
      all_predicted.extend(output.argmax(dim=1).cpu().tolist())
      all_labels.extend(label.tolist())

  if not all_labels:
    print('Test results - no examples to evaluate.')
    return

  correct = sum(1 for p, t in zip(all_predicted, all_labels) if p == t)
  accuracy = 100.00 * correct / len(all_labels)
  f1 = f1_score(all_labels, all_predicted)
  balanced_acc = balanced_accuracy_score(all_labels, all_predicted)

  # Balanced accuracy is a fraction, so it is printed without a percent sign.
  print('Test results - Accuracy: {}%. F1 Score: {}. BACC: {}'.format(accuracy, f1, balanced_acc))

In [None]:
# Report final metrics on the held-out test split.
test(trained_model, dataset_test)