In [1]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!nvidia-smi

Mon Jun 28 02:30:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    45W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
df_news = pd.read_csv('/content/drive/My Drive/Files/covid-news.csv',encoding='iso-8859-1')

In [4]:
df_multi = pd.read_csv('/content/drive/My Drive/Files/articles.csv',encoding='iso-8859-1')

In [5]:
df_news.dtypes

Title    object
Label      bool
dtype: object

In [6]:
df_multi.dtypes

Unnamed: 0      int64
PreviewURL     object
ReportURL      object
Date           object
Source         object
Title          object
Label          object
Explanation    object
Country        object
dtype: object

In [7]:
df_multi.columns

Index(['Unnamed: 0', 'PreviewURL', 'ReportURL', 'Date', 'Source', 'Title',
       'Label', 'Explanation', 'Country'],
      dtype='object')

In [8]:
df_multi.head()

Unnamed: 0.1,Unnamed: 0,PreviewURL,ReportURL,Date,Source,Title,Label,Explanation,Country
0,0,https://www.poynter.org/?ifcn_misinformation=c...,https://facta.news/notizia-falsa/2021/03/01/le...,2021-03-01,Facta,Covid-19 masks are dangerous and the obligatio...,False,Covid-19 face masks are safe and there is no c...,Italy
1,1,https://www.poynter.org/?ifcn_misinformation=t...,https://www.newtral.es/bulo-protesta-rumania-m...,2021-03-01,Newtral.es,Thousands of people demonstrate in Romania aga...,False,The photograph corresponds to protests held in...,"Spain, Romaina"
2,2,https://www.poynter.org/?ifcn_misinformation=t...,https://chequeado.com/verificacionfb/es-falso-...,2021-02-28,Chequeado,"The Argentine Vice President, Cristina FernÃ¡n...",False,One of the images corresponds to the flu vacci...,Argentina
3,3,https://www.poynter.org/?ifcn_misinformation=t...,https://chequeado.com/verificacionfb/es-falso-...,2021-02-26,Chequeado,The Province of Buenos Aires (Argentina) has a...,False,The misinformation arose from a tweet by the d...,Argentina
4,4,https://www.poynter.org/?ifcn_misinformation=p...,https://facta.news/notizia-falsa/2021/02/26/qu...,2021-02-26,Facta,"Pfizer, Moderna and Astrazeneca are handing ou...",False,The flyers are fake,Italy


In [9]:
df_multi.Label.value_counts()

FALSE              6146
False              3437
Misleading          787
MISLEADING          427
No evidence         272
                   ... 
MANIPULATED           1
True but              1
Partly true           1
missing context       1
Fake news             1
Name: Label, Length: 65, dtype: int64

In [10]:
df_multi.Label = df_multi.Label.str.upper()

In [11]:
df_multi.Label.value_counts()[:5]

FALSE              9590
MISLEADING         1288
NO EVIDENCE         276
PARTIALLY FALSE     259
MOSTLY FALSE        166
Name: Label, dtype: int64

In [12]:
df_multi_filtered = df_multi[(df_multi.Label == 'FALSE') | (df_multi.Label == 'MISLEADING')].copy()

In [13]:
df_multi_filtered = df_multi_filtered[['Title','Label']].copy()

In [14]:
df_multi_filtered

Unnamed: 0,Title,Label
0,Covid-19 masks are dangerous and the obligatio...,FALSE
1,Thousands of people demonstrate in Romania aga...,FALSE
2,"The Argentine Vice President, Cristina FernÃ¡n...",FALSE
3,The Province of Buenos Aires (Argentina) has a...,FALSE
4,"Pfizer, Moderna and Astrazeneca are handing ou...",FALSE
...,...,...
11898,The coronavirus was created in a lab and paten...,FALSE
11899,A Chinese market caused the new coronavirus (v...,MISLEADING
11901,Stores and supermarkets in Veracruz (Mexico) w...,FALSE
11902,"A chain message circulated on Tuesday, Jan. 14...",FALSE


In [15]:
df_news.Label = df_news.Label.astype(str)

In [16]:
df_news.Label = df_news.Label.str.upper()

In [17]:
df_final = pd.concat([df_news,df_multi_filtered]).reset_index()

In [18]:
df_final.dtypes

index     int64
Title    object
Label    object
dtype: object

In [19]:
df_final.drop(['index'],inplace=True,axis=1)

In [20]:
df_final.Label = df_final.Label.str.upper()

In [21]:
df_final.Label.value_counts()

TRUE          27119
FALSE         21218
MISLEADING     1288
Name: Label, dtype: int64

In [22]:
!pip install --quiet transformers

In [23]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [24]:
df_final.Label.value_counts()

TRUE          27119
FALSE         21218
MISLEADING     1288
Name: Label, dtype: int64

In [25]:
label2id = {'FALSE':0,'MISLEADING':1,'TRUE':2}
id2label = {v:k for k, v in label2id.items()}

In [26]:
label2id

{'FALSE': 0, 'MISLEADING': 1, 'TRUE': 2}

In [27]:
from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

MAX_LEN = 64


def convert_examples_to_features(tweets, labels):
  input_ids = [
      bert_tokenizer.encode(tweet, add_special_tokens=True) for tweet in tweets
  ]

  input_ids = pad_sequences(
      input_ids,
      maxlen=MAX_LEN,
      dtype="long", 
      value=bert_tokenizer.pad_token_id,
      padding="post",
      truncating="post"
  )

  input_ids = torch.tensor(input_ids)
  attention_masks = (input_ids > 0).int()
  labels = torch.tensor([label2id[label] for label in labels])

  return TensorDataset(input_ids, attention_masks, labels)

In [28]:
dataset = convert_examples_to_features(df_final.Title, list(df_final.Label))

In [29]:
len(dataset.tensors[2])

49625

In [30]:
len(dataset.tensors[1])

49625

In [31]:
from sklearn.model_selection import train_test_split

train_data, val_data, train_labels, val_labels = train_test_split(
    dataset,
    list(df_final.Label), 
    random_state=1234,
    test_size=0.2
)

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}")

Train size: 39700, Validation size: 9925


In [32]:
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [33]:
from transformers import AutoConfig

bert_config = AutoConfig.from_pretrained(
    "bert-base-uncased"
)
print(bert_config.num_labels)

2


In [34]:
id2label

{0: 'FALSE', 1: 'MISLEADING', 2: 'TRUE'}

In [35]:
label2id

{'FALSE': 0, 'MISLEADING': 1, 'TRUE': 2}

In [36]:
from transformers import AutoConfig, AutoModel

bert_sequential_config = AutoConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

In [37]:
bert_sequential_model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path="bert-base-uncased",
            config=bert_sequential_config,
        )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [38]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

print(f"Moving model to device: {device}")
bert_sequential_model = bert_sequential_model.to(device)

Moving model to device: cuda


In [39]:
from torch.utils.data import (
    DataLoader,
    TensorDataset,
    RandomSampler,
    SequentialSampler,
)

BATCH_SZ = 64

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    sampler=train_sampler,
    batch_size=BATCH_SZ
)

val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    dataset=val_data,
    sampler=val_sampler,
    batch_size=BATCH_SZ
)

In [40]:
bert_sequential_model.parameters()

<generator object Module.parameters at 0x7fe4a3ec8ed0>

In [41]:
from torch.optim import SGD
from torch.optim import Adadelta
from tqdm import tqdm

# define a learning rate
LR=5e-4
optimizer = SGD(bert_sequential_model.parameters(), lr=LR)
optimizer_ad = Adadelta(bert_sequential_model.parameters(), lr=LR)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

EPOCHS = 10
loss = []

for epoch in range(EPOCHS):
    batch_loss = 0
    # The model is in training model now; while in evaluation mode,
    # we change this to .eval()
    bert_sequential_model.train()

    for batch in tqdm(train_dataloader):
        # move the input data to device
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        # pass the input to the model
        outputs = bert_sequential_model(
            input_ids, 
            attention_mask=attention_mask, 
            labels=labels
        )
        
        # set model gradients to 0, so that optmizer won't accumulate
        # them over subsequent training iterations
        optimizer.zero_grad()
        loss = outputs[0]

        # obtain loss, and backprop
        batch_loss += loss.item()
        loss.backward()
        #clip gradient norms to avoid any exploding gradient problems
        # torch.nn.utils.clip_grad_norm_(bert_sequential_model.parameters(), 1.0)
        optimizer.step()

    epoch_train_loss = batch_loss / len(train_dataloader)  
    print(f"epoch: {epoch+1}, train_loss: {epoch_train_loss}")
    
    # At the end of each epoch, we will also run the model 
    # on the validation dataset
    val_loss, val_accuracy = 0, 0
    true_labels, predictions = [], []

    for val_batch in val_dataloader:
        val_batch = tuple(t.to(device) for t in val_batch)
        input_ids, attention_mask, labels = val_batch
        
        with torch.no_grad():        
            outputs = bert_sequential_model(
              input_ids, 
              attention_mask=attention_mask, 
              labels=labels
            )
        
        val_loss += loss.item()
        
        # convert predictions and gold labels to numpy arrays so that
        # we can compute evaluation metrics like accuracy and f1
        label_ids = labels.to('cpu').numpy()
        preds = outputs[1].detach().cpu().numpy()
        preds = np.argmax(preds, axis=1)
        true_labels.extend(label_ids)
        predictions.extend(preds)
      
    acc = f1_score(y_true=true_labels, y_pred=predictions, average='micro')
    f1 = f1_score(y_true=true_labels, y_pred=predictions, average='macro')

    print(f"epoch: {epoch+1} val loss: {val_loss}, accuracy:{acc}, f1:{f1}")

100%|██████████| 621/621 [03:39<00:00,  2.83it/s]


epoch: 1, train_loss: 0.5008085014931438


  0%|          | 0/621 [00:00<?, ?it/s]

epoch: 1 val loss: 86.40164995193481, accuracy:0.9082115869017632, f1:0.612499639799983


100%|██████████| 621/621 [03:39<00:00,  2.83it/s]


epoch: 2, train_loss: 0.24846626309476996


  0%|          | 0/621 [00:00<?, ?it/s]

epoch: 2 val loss: 8.120770812034607, accuracy:0.9303778337531486, f1:0.6277418034680554


  7%|▋         | 44/621 [00:15<03:24,  2.83it/s]