In [2]:
!pip install transformers --q
!pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html --q
!pip install --upgrade --force-reinstall gdown --q
!pip install wandb -q
!pip install boto3 --q

Collecting boto3
  Downloading boto3-1.20.26-py3-none-any.whl (131 kB)
     |████████████████████████████████| 131 kB 8.1 MB/s            
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
     |████████████████████████████████| 79 kB 11.5 MB/s            
[?25hCollecting botocore<1.24.0,>=1.23.26
  Downloading botocore-1.23.26-py3-none-any.whl (8.5 MB)
     |████████████████████████████████| 8.5 MB 66.4 MB/s            
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.20.26 botocore-1.23.26 jmespath-0.10.0 s3transfer-0.5.0


# Download the dataset from Google Drive

In [None]:

# import gdown

# https://drive.google.com/drive/folders/11mRvsHAkggFEJvG4axH4mmWI6FHMQp7X?usp=sharing

!gdown --folder https://drive.google.com/drive/folders/11mRvsHAkggFEJvG4axH4mmWI6FHMQp7X?usp=sharing --quiet

In [None]:
# Google Cloud Notebook
!pip install --upgrade --force-reinstall gdown --q
NELA_2018_SITE_SPLIT = 'nela_gt_2018_site_split/'

!gdown --folder https://drive.google.com/drive/folders/11mRvsHAkggFEJvG4axH4mmWI6FHMQp7X?usp=sharing -O {NELA_2018_SITE_SPLIT} --quiet

# Load the JSONL files into a DataFrame

In [None]:
import os
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import json
import numpy as np

def jsonl_to_df(file_path):
    '''
    Convert a JSON Lines file to a pandas DataFrame.

    Each non-blank line of the file is parsed as one JSON document and the
    resulting records are flattened with `pd.json_normalize`, so nested
    objects become dotted column names (e.g. `meta.source`).

    Parameters
    ----------
    file_path : str or path-like
        Location of the `.jsonl` file (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per JSON record; an empty DataFrame when the file holds no
        records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    '''
    with open(file_path, encoding='utf-8') as f:
        # Iterate the file line by line instead of `read().splitlines()`:
        # splitlines() also breaks on characters such as U+2028 that may
        # legally appear inside a JSON string. Blank lines are skipped, and
        # every line is parsed exactly once.
        records = [json.loads(line) for line in f if line.strip()]

    return pd.json_normalize(records)

def load_dataset(file_path):
    '''
    Load all dataset splits into one DataFrame.
    Use this for visualization and EDA.

    Reads `train.jsonl`, `val.jsonl` and `test.jsonl` from `file_path`, tags
    every row with the split it came from in a `split` column, concatenates
    the three frames and replaces missing values with empty strings.

    Parameters
    ----------
    file_path : str
        Directory that contains the three `.jsonl` files.

    Returns
    -------
    pandas.DataFrame
        All rows of the three splits; each split keeps its own row index.
    '''
    frames = []
    for split in ('train', 'val', 'test'):
        # Each split is read from its own file (the test split comes from
        # test.jsonl, not from a second copy of train.jsonl).
        split_df = jsonl_to_df(os.path.join(file_path, f'{split}.jsonl'))
        split_df['split'] = split
        frames.append(split_df)

    return pd.concat(frames).fillna('')

In [None]:
# Directory that load_dataset reads train.jsonl / val.jsonl / test.jsonl from.
NELA_2018_SITE_SPLIT = 'data/nela_gt_2018_site_split/'
df = load_dataset(NELA_2018_SITE_SPLIT)
# Fixed seed so the previewed rows are identical on every Restart & Run All.
df.sample(5, random_state=42)

In [None]:
# Count rows per split (index) and label (columns), plus 'All' totals.
# `margins` is a boolean switch; the caption of the totals row/column is set
# by `margins_name` (default 'All'), so a string passed to `margins` is only
# interpreted as truthy.
table = pd.pivot_table(df, values='title', index=['split'], columns=['label'], aggfunc='count', margins=True)
table

# Tokenizer

In [None]:
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification, BertModel, AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import classification_report, confusion_matrix

from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid", palette="pastel")

_ = torch.manual_seed(42)

In [None]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
MODEL_NAME = 'distilbert-base-cased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def get_token_length(text):
    '''
    Return how many tokens the module-level `tokenizer` produces for `text`.

    Special tokens are included in the count. No `max_length`, padding or
    truncation is requested, so the encoding is not shortened here.
    '''
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Dimension 1 of the returned input_ids tensor is the sequence length.
    token_ids = encoded['input_ids']
    return token_ids.size(1)

In [None]:
# Work on an explicit copy of the training rows: assigning new columns to a
# boolean-mask slice of `df` is chained assignment (SettingWithCopyWarning)
# and leaves it ambiguous whether `df` itself is modified.
train_df = df[df['split'] == 'train'].copy()
# Register `progress_apply` on pandas objects so tokenization shows a progress bar.
tqdm.pandas()
train_df['title_token'] = train_df['title'].progress_apply(get_token_length)
train_df['content_token'] = train_df['content'].progress_apply(get_token_length)

In [None]:
# Horizontal box plot of title token counts, one box per label value.
ax = sns.boxplot(x='title_token', y='label', data=train_df, orient='h')
ax.set_title('Token distribution (titles)')
# NOTE(review): assumes tick 0 is the 'unreliable' class and tick 1 the
# 'reliable' class — confirm against the dataset's label encoding.
ax.set_yticks([0, 1])
_ = ax.set_yticklabels(['unreliable', 'reliable'], rotation='vertical')

In [None]:
# Horizontal box plot of content token counts, one box per label value;
# outlier points are hidden (showfliers=False).
ax = sns.boxplot(x='content_token', y='label', data=train_df, orient='h', showfliers=False)
ax.set_title('Token distribution (content)')
# NOTE(review): assumes tick 0 is the 'unreliable' class and tick 1 the
# 'reliable' class — confirm against the dataset's label encoding.
ax.set_yticks([0, 1])
_ = ax.set_yticklabels(['unreliable', 'reliable'], rotation='vertical')

# Truncate and pad to a fixed token length

In [None]:
class ReliableNewsDataset(Dataset):
    '''
    Dataset class that tokenizes one news article per item.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are accessed positionally via `.iloc`; each row must provide
        `title` and `label`, and `content` when `title_only` is False.
    tokenizer : object
        Anything exposing `encode_plus(text, ...)` that returns a mapping with
        `input_ids` and `attention_mask` tensors.
    max_token_len : int
        Length every encoding is padded / truncated to.
    title_only : bool
        When True only the title is encoded; otherwise the title and content
        are joined with ' [SEP] ' before encoding.
    '''
    def __init__(self, data, tokenizer, max_token_len = 128, title_only=True):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        self.title_only = title_only

    @staticmethod
    def _as_text(value):
        '''Return `value` as a string, mapping missing values (None / NaN) to the empty string.'''
        if isinstance(value, str):
            return value
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    def __len__(self):
        '''Number of rows in the wrapped frame.'''
        return len(self.data)

    def __getitem__(self, index):
        '''
        Encode the row at position `index`.

        Returns a dict with the raw `text`, the flattened `input_ids` and
        `attention_mask` tensors, and the row's label as a float32 tensor
        under `labels`.
        '''
        data_row = self.data.iloc[index]

        # Missing fields are treated as empty text so that frames which were
        # not passed through fillna('') do not break string concatenation.
        text = self._as_text(data_row['title'])
        if not self.title_only:
            text = text + ' [SEP] ' + self._as_text(data_row['content'])

        labels = data_row['label']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length = self.max_token_len,
            return_token_type_ids = False,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        return dict(
            text = text,
            # encode_plus is asked for 'pt' tensors; flatten() makes them 1-D per item.
            input_ids = encoding['input_ids'].flatten(),
            attention_mask = encoding['attention_mask'].flatten(),
            labels = torch.tensor(labels, dtype=torch.float32)
        )

def create_reliable_news_dataloader(file_path, tokenizer, max_len=128, batch_size=8, shuffle=False, sample = None, title_only=False, random_state=None):
    '''
    Build a DataLoader over the articles stored in one JSON Lines file.

    Parameters
    ----------
    file_path : str
        Path of the `.jsonl` file to load.
    tokenizer : object
        Tokenizer handed to `ReliableNewsDataset`.
    max_len : int
        Token length each item is padded / truncated to.
    batch_size : int
        Number of items per batch.
    shuffle : bool
        Whether the DataLoader shuffles the items.
    sample : int, optional
        When truthy, only this many randomly chosen rows are kept.
    title_only : bool
        Encode only the title instead of title plus content.
    random_state : int, optional
        Seed forwarded to `DataFrame.sample` so the subset can be reproduced;
        the default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    torch.utils.data.DataLoader
    '''
    df = jsonl_to_df(file_path)

    # Load only a partial dataset
    if sample:
        df = df.sample(sample, random_state=random_state)

    ds = ReliableNewsDataset(df, tokenizer, max_token_len = max_len, title_only=title_only)
    return DataLoader(ds, batch_size = batch_size, shuffle=shuffle)

In [None]:
# Smoke test: build a loader over 16 randomly chosen training rows and pull one batch.
# The path is derived from NELA_2018_SITE_SPLIT so the dataset location is
# configured in a single place.
train_data_loader = create_reliable_news_dataloader(
    os.path.join(NELA_2018_SITE_SPLIT, 'train.jsonl'),
    tokenizer,
    max_len=512,
    batch_size=8,
    shuffle=True,
    sample=16,
)
sample = next(iter(train_data_loader))
sample.keys()

In [None]:
# Load the encoder from the same checkpoint as the tokenizer (MODEL_NAME)
# so the two cannot drift apart if the checkpoint is changed.
model = DistilBertModel.from_pretrained(MODEL_NAME)

In [None]:
# Forward pass for inspection only: autograd is disabled so no computation
# graph is built or kept in memory for the batch.
with torch.no_grad():
    output = model(input_ids=sample['input_ids'], attention_mask=sample['attention_mask'])

In [None]:
# Keep the encoder's last hidden state and show the size of the slice taken
# at sequence position 0 of every batch item.
hidden_state = output.last_hidden_state
first_position = hidden_state[:, 0]
first_position.size()

In [None]:
# Print the hidden size recorded in the loaded model's configuration.
print(model.config.hidden_size)