# Load the JSON files

In [1]:
import os
import warnings
warnings.simplefilter('ignore')

DATA_PATH = '/content/drive/MyDrive/DATASETS/NELA/'
NELA_2018_SITE_SPLIT = 'nela_gt_2018_site_split/'

if os.path.isdir(NELA_2018_SITE_SPLIT):
  !rm -r {NELA_2018_SITE_SPLIT}
!cp -r {DATA_PATH + NELA_2018_SITE_SPLIT} {NELA_2018_SITE_SPLIT}

In [2]:
import pandas as pd
import json
import numpy as np

def jsonl_to_df(file_path):
    with open(file_path) as f:
        lines = f.read().splitlines()

    df_inter = pd.DataFrame(lines)
    df_inter.columns = ['json_element']

    df_inter['json_element'].apply(json.loads)

    return pd.json_normalize(df_inter['json_element'].apply(json.loads))

In [3]:
train_df = jsonl_to_df(NELA_2018_SITE_SPLIT + 'train.jsonl')
train_df['split'] = 'train'
dev_df = jsonl_to_df(NELA_2018_SITE_SPLIT + 'dev.jsonl')
dev_df['split'] = 'dev'
test_df = jsonl_to_df(NELA_2018_SITE_SPLIT + 'test.jsonl')
test_df['split'] = 'test'

df = pd.concat([train_df, dev_df, test_df])
df.sample(5)

Unnamed: 0,content,title,date,source,label,split
29907,"Stock prices took a sharp plunge Tuesday, for ...",Dow Falls 950 Points Or 4 Percent In 2 Days,2018-11-20,npr,1,train
1898,No matter how he ends up ever turning it into ...,Hundreds Of Justice Dept Members Team Up To Sa...,2018-05-06,bipartisanreport,0,test
12123,Ukraine's parliament is to decide whether to b...,Protests in Kiev after Russia seizes Ukraine s...,2018-11-26,bbc,1,dev
37324,WASHINGTON The Defense Department said Thursd...,Pentagon delays Trumps veterans parade until a...,2018-08-17,chicagosuntimes,1,test
58632,My favorite part of the Daily Beast article is...,You Won039t Have This Gang of Migrant Stone-Th...,2018-11-29,frontpagemagazine,0,train


In [4]:
table = pd.pivot_table(df, values='title', index=['split'], columns=['label'], aggfunc='count', margins='all')
table

label,0,1,All
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dev,20294,20294,40588
test,19410,19410,38820
train,35302,35302,70604
All,75006,75006,150012


In [5]:
df.fillna('', inplace=True)

In [6]:
df['label'] = df['label'].astype(float)

# Logistic Regression (Title only)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
from sklearn.linear_model import LogisticRegression
from scipy import sparse

vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words= 'english',ngram_range=(3,6),dtype=np.float32)

In [None]:
X_train_vect = sparse.hstack([vect_word.fit_transform(X_train), vect_char.fit_transform(X_train)])
X_test_vect = sparse.hstack([vect_word.fit_transform(X_test), vect_char.fit_transform(X_test)])

In [None]:
lr = LogisticRegression(C=1, max_iter=500, random_state = 42)
lr.fit(X_train_vect, y_train)

LogisticRegression(C=1, max_iter=500, random_state=42)

In [None]:
y_pred = lr.predict(X_test_vect)

In [None]:
print('\nConfusion matrix\n',confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))


Confusion matrix
 [[ 1725 17685]
 [ 1408 18002]]
              precision    recall  f1-score   support

           0       0.55      0.09      0.15     19410
           1       0.50      0.93      0.65     19410

    accuracy                           0.51     38820
   macro avg       0.53      0.51      0.40     38820
weighted avg       0.53      0.51      0.40     38820



# BERT (Title Only)

In [7]:
!pip install transformers --q
!pip install datasets --q
!pip install pytorch-lightning --q
!pip install torchmetrics --q

In [8]:
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from torchmetrics import Accuracy, F1
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.metrics import classification_report, confusion_matrix

pl.seed_everything(42)

Global seed set to 42


42

In [9]:
MODEL_NAME = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
class reliableNews(Dataset):
  def __init__(self, data, tokenizer, max_token_len = 512):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    data_row = self.data.iloc[index]
    title = data_row.title
    labels = data_row.label

    encoding = self.tokenizer.encode_plus(
        title,
        add_special_tokens=True,
        max_length = self.max_token_len,
        return_token_type_ids = False,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return dict(
        title = title,
        input_ids = encoding['input_ids'].flatten(),
        attention_mask = encoding['attention_mask'].flatten(),
        labels = torch.tensor(labels, dtype = torch.float32)
    )

In [11]:
class reliableNewsDataModule(pl.LightningDataModule):
  def __init__(self, train_df, val_df, test_df, tokenizer, batch_size = 16, max_token_len = 512):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.val_df = val_df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    self.train_dataset = reliableNews(
        self.train_df,
        self.tokenizer,
        self.max_token_len
    )

    self.val_dataset = reliableNews(
        self.val_df,
        self.tokenizer,
        self.max_token_len
    )

    self.test_dataset = reliableNews(
        self.test_df,
        self.tokenizer,
        self.max_token_len
    )

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True
    )
  
  def val_dataloader(self):
    return DataLoader(
        self.val_dataset,
        batch_size = self.batch_size,
    )

  def test_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = self.batch_size,
    )

In [12]:
N_EPOCHS = 10
BATCH_SIZE = 8

data_module = reliableNewsDataModule(
    df[df['split']=='train'].sample(5000),
    df[df['split']=='dev'].sample(1000),
    df[df['split']=='test'].sample(1000),
    tokenizer,
    batch_size = BATCH_SIZE,
    max_token_len = 512
)

In [13]:
class reliableNewsClassifier(pl.LightningModule):
  def __init__(self, n_classes, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(MODEL_NAME, return_dict = True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCEWithLogitsLoss()
    
  def forward(self, input_ids, attention_mask, labels = None):
    x = self.bert(input_ids, attention_mask=attention_mask)
    x = self.classifier(x.pooler_output)
    x = torch.sigmoid(x)
    loss = 0

    if labels is None:
      loss = self.criterion(x, labels)
    return loss, x 

  def training_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']

    loss, ouputs = self(input_ids, attention_mask, labels)
    self.log('train_loss', loss, prog_bar=True, logger=True)

    return {'loss': loss, 'predicitons': outputs, 'labels': labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    
    return loss

  def training_epoch_end(self, outputs):
    pass

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = self.n_warmup_steps,
        num_training_steps = self.n_training_steps
    )

    return dict(
        optimizer = optimizer,
        lr_scheduler = dict(
            scheduler = scheduler,
            interval = 'step'
        )
    )

In [14]:
steps_per_epoch = 5000//BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(624, 3120)

In [15]:
model = reliableNewsClassifier(
    n_classes = 1,
    n_warmup_steps = warmup_steps,
    n_training_steps = total_training_steps
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

logger = TensorBoardLogger("lightning_logs", name="reliable-news")

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3)

In [17]:
trainer = pl.Trainer(
  logger=logger,
  checkpoint_callback=checkpoint_callback,
  callbacks=[early_stopping_callback],
  max_epochs=N_EPOCHS,
  gpus=0,
  progress_bar_refresh_rate=30
)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
trainer.fit(model, data_module)


  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 108 M 
1 | classifier | Linear            | 769   
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.244   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


Training: 0it [00:00, ?it/s]