In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Loading Data

In [4]:
ds = pd.read_csv('/content/drive/MyDrive/datasets/code_jam_may25/data/final_labels_SG2.csv', sep=';')

In [5]:
ds

Unnamed: 0,text,news_link,outlet,topic,type,label_bias,label_opinion,biased_words
0,"""Orange Is the New Black"" star Yael Stone is r...",https://www.foxnews.com/entertainment/australi...,Fox News,environment,right,Non-biased,Entirely factual,[]
1,"""We have one beautiful law,"" Trump recently sa...",https://www.alternet.org/2020/06/law-and-order...,Alternet,gun control,left,Biased,Somewhat factual but also opinionated,"['bizarre', 'characteristically']"
2,"...immigrants as criminals and eugenics, all o...",https://www.nbcnews.com/news/latino/after-step...,MSNBC,white-nationalism,left,Biased,Expresses writer’s opinion,"['criminals', 'fringe', 'extreme']"
3,...we sounded the alarm in the early months of...,https://www.alternet.org/2019/07/fox-news-has-...,Alternet,white-nationalism,left,Biased,Somewhat factual but also opinionated,[]
4,[Black Lives Matter] is essentially a non-fals...,http://feedproxy.google.com/~r/breitbart/~3/-v...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,['cult']
...,...,...,...,...,...,...,...,...
3669,You’ve heard of Jim Crow and Southern Segregat...,http://feedproxy.google.com/~r/breitbart/~3/ei...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,['ALL']
3670,Young female athletes’ dreams and accomplishme...,http://feedproxy.google.com/~r/breitbart/~3/eW...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,"['dashed', '""identify""']"
3671,"Young white men, reacting to social and educat...",https://thefederalist.com/2016/05/23/how-anti-...,Federalist,white-nationalism,right,Biased,Expresses writer’s opinion,"['evil', 'white']"
3672,Young women taking part in high school and col...,,Breitbart,sport,right,Biased,Somewhat factual but also opinionated,"['dashed', '""identify""']"


In [6]:
# Nan values
ds.isna().sum()

Unnamed: 0,0
text,0
news_link,32
outlet,0
topic,0
type,1000
label_bias,0
label_opinion,0
biased_words,0


In [7]:
ds.iloc[0][0]

  ds.iloc[0][0]


'"Orange Is the New Black" star Yael Stone is renouncing her U.S. green card to return to her native Australia in order to fight climate change.'

In [8]:
ds['text']

Unnamed: 0,text
0,"""Orange Is the New Black"" star Yael Stone is r..."
1,"""We have one beautiful law,"" Trump recently sa..."
2,"...immigrants as criminals and eugenics, all o..."
3,...we sounded the alarm in the early months of...
4,[Black Lives Matter] is essentially a non-fals...
...,...
3669,You’ve heard of Jim Crow and Southern Segregat...
3670,Young female athletes’ dreams and accomplishme...
3671,"Young white men, reacting to social and educat..."
3672,Young women taking part in high school and col...


# Useful functions

In [9]:
from transformers import BertTokenizer, BertModel
import torch
import math
import numpy as np
from tqdm import tqdm

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def BERT_text_to_embeddings(texts, max_length=512, batch_size=100, force_device=None, disable_progress_bar=False):
    # Tokenize the texts
    ids_list = []
    attention_mask_list = []

    # Tokenize each text and create the input_ids and attention masks
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,          # Add [CLS] and [SEP]
            max_length=max_length,            # Pad or truncate text to max length
            padding='max_length',             # Pad to max length
            return_attention_mask=True,       # Create attention mask
            return_tensors='pt',              # Return PyTorch tensors
            truncation=True
        )
        ids_list.append(encoded['input_ids'].squeeze().tolist())
        attention_mask_list.append(encoded['attention_mask'].squeeze().tolist())

    # Set device
    if force_device is not None:
        device = torch.device(force_device)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)

    if not disable_progress_bar:
        print(f'Using the {device} device.')

    # Getting embeddings in batches
    embeddings = []

    for i in tqdm(range(math.ceil(len(ids_list) / batch_size)), disable=disable_progress_bar):
        ids_batch = torch.LongTensor(ids_list[batch_size * i:batch_size * (i + 1)]).to(device)
        attention_mask_batch = torch.LongTensor(attention_mask_list[batch_size * i:batch_size * (i + 1)]).to(device)

        with torch.no_grad():
            model.eval()
            # Extracting the hidden state of the [CLS] token for each batch
            batch_embeddings = model(input_ids=ids_batch, attention_mask=attention_mask_batch)

        # We only take the embedding corresponding to the [CLS] token (first token)
        embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())

    return np.concatenate(embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
def train_val_test_split (df, rstate = 42, shuffle=True, stratify=None):
    if stratify != None:
        strat=df[stratify]
    else:
        strat = None
    train_set, test_set = train_test_split(df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)

    if stratify!=None:
        strat=test_set[stratify]
    else:
        strat = None
    val_set, test_set=train_test_split(test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)

    return (train_set, val_set, test_set)

# Data Preprocessing

In [11]:
# Manually encoding label_bias column
ds['label_bias'] = ds['label_bias'].apply(lambda x: 0 if x == 'Biased' else 1)

In [12]:
ds['label_bias'].value_counts()

Unnamed: 0_level_0,count
label_bias,Unnamed: 1_level_1
1,1864
0,1810


In [13]:
# Only keeping input and output
df = ds[['text', 'label_bias']]

In [14]:
# Splitting in train and test
train_set, val_set, test_set = train_val_test_split(df, stratify='label_bias')
xtrain, ytrain = train_set['text'], train_set['label_bias']
xval, yval = val_set['text'], val_set['label_bias']
xtest, ytest = test_set['text'], test_set['label_bias']

# Model

In [None]:
embeddings_train = BERT_text_to_embeddings(xtrain)
embeddings_val = BERT_text_to_embeddings(xval)
embeddings_test = BERT_text_to_embeddings(xtest)

Using the cuda device.


 39%|███▉      | 9/23 [00:25<00:38,  2.77s/it]