In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 73.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 81.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [3]:
import time
import random
import os

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import DebertaTokenizer, TFDebertaForSequenceClassification
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# These auto classes load the right type of tokenizer and model based on a model name
from transformers import AutoTokenizer, TFAutoModel

In [4]:
# Set seeds: TF also uses Python's random module and the NumPy library, so
# these must be fixed as well.
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
# only affects processes launched after this point, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(0)
# '1' asks TensorFlow for deterministic op implementations; the previous value
# '0' switched them off, which contradicted the purpose of this cell.
# NOTE(review): recent TF releases expose the same switch as
# tf.config.experimental.enable_op_determinism(); deterministic kernels can be
# slower and a few ops may not support them - confirm on the target runtime.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [5]:
def read_media_cloud_data(path, label):
    """Load a Media Cloud CSV export and tag every row with the same label.

    Args:
        path: Location of the CSV file to read.
        label: Value written into the 'Label_bias' column of every row.

    Returns:
        A DataFrame holding the file's contents plus a 'Label_bias' column,
        with the 'title' column renamed to 'sentence'.
    """
    labelled = pd.read_csv(path).assign(Label_bias=label)
    return labelled.rename(columns={'title': 'sentence'})

# Read in the two datasets; per the file names, one export holds headlines
# treated as biased (label 1) and the other headlines treated as neutral (label 0).
PATH_biased = "/content/drive/MyDrive/Colab Notebooks/data/news_headlines_usa_biased.csv"
PATH_neutral = "/content/drive/MyDrive/Colab Notebooks/data/news_headlines_usa_neutral.csv"
df_biased = read_media_cloud_data(PATH_biased, 1)
df_neutral = read_media_cloud_data(PATH_neutral, 0)

# Seed passed explicitly to every sampling step below, so the partition no
# longer depends on the state of the global NumPy generator (i.e. on which
# cells ran before this one, or how often this cell was re-run).
SPLIT_SEED = 0

# Combine them into a single frame with a fresh 0..n-1 index, then shuffle rows.
df_distant = pd.concat([df_biased, df_neutral], axis=0, ignore_index=True)
df_distant = shuffle(df_distant, random_state=SPLIT_SEED)

# Train-test split: 80% train, 20% test.
df_distant_train, df_distant_test = train_test_split(
    df_distant, test_size=0.2, random_state=SPLIT_SEED
)

# Optional smoke test of the pipeline on 10% of the data - uncomment to use:
# df_distant, exclude = train_test_split(df_distant, test_size=0.90, random_state=SPLIT_SEED)
# df_distant_train, df_distant_test = train_test_split(df_distant, test_size=0.2, random_state=SPLIT_SEED)

In [6]:
def preprocess(df, model_name):
    """Tokenize a pandas DataFrame of sentences and wrap it in a TensorFlow dataset.

    Args:
        df: DataFrame with a 'sentence' column (the texts) and a 'Label_bias'
            column (the targets). The frame is not modified.
        model_name: Which tokenizer to use: 'bert', 'roberta' or 'deberta'.

    Returns:
        A tf.data.Dataset of (features, label) pairs, where features is the
        dict of tokenizer outputs for one sentence and label is the matching
        'Label_bias' value.

    Raises:
        ValueError: If model_name is not one of the supported names.
        KeyError: If df lacks the 'Label_bias' or 'sentence' column.
    """
    supported = ('bert', 'roberta', 'deberta')
    # Fail fast with a clear message; previously an unknown name surfaced later
    # as an UnboundLocalError on `tokenizer`.
    if model_name not in supported:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {supported}"
        )

    # Plain column reads leave the caller's frame untouched, so no copy is needed.
    target = df['Label_bias']
    sentence = df['sentence']

    if model_name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif model_name == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:  # 'deberta' - the only remaining supported name
        tokenizer = DebertaTokenizer.from_pretrained("kamalkraj/deberta-base")

    encodings = tokenizer(
        sentence.tolist(),
        add_special_tokens=True,  # add [CLS], [SEP]
        truncation=True,  # cut off at the maximum length the tokenizer knows for its model
        padding=True,  # add [PAD] tokens up to the longest sentence in this call
        return_attention_mask=True,  # add attention mask to not focus on pad tokens
    )

    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(encodings), target.tolist())
    )
    return dataset

In [7]:
def train_model(train_df, test_df, model_name, batch=24):
    """Fine-tune a pretrained sequence classifier on the headline data and save its weights.

    Args:
        train_df: DataFrame of training rows, in the format preprocess() expects.
        test_df: DataFrame of held-out rows. It is passed to Keras as
            validation data, so early stopping monitors the loss on this set.
        model_name: 'bert', 'roberta' or 'deberta'.
        batch: Batch size used for both datasets.

    Returns:
        The Keras History object produced by fit().

    Raises:
        ValueError: If model_name is not one of the supported names.
    """
    supported = ('bert', 'roberta', 'deberta')
    # Validate before the (slow) tokenization; previously an unknown name only
    # failed at compile time with an UnboundLocalError on `clf`.
    if model_name not in supported:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {supported}"
        )

    # pandas -> tensorflow
    train_distant_dataset = preprocess(train_df, model_name)
    test_distant_dataset = preprocess(test_df, model_name)

    # batch and randomize (only the training data is shuffled)
    BUFFER_SIZE = 10000
    BATCH_SIZE = batch

    train_distant_dataset = (
        train_distant_dataset
        .shuffle(BUFFER_SIZE)
        .batch(BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )
    test_distant_dataset = test_distant_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

    tf.keras.backend.clear_session()

    # after 3 epochs without improvement, stop training and keep the best weights
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

    if model_name == 'bert':
        clf = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    elif model_name == 'roberta':
        clf = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
    else:  # 'deberta' - the only remaining supported name
        clf = TFDebertaForSequenceClassification.from_pretrained("kamalkraj/deberta-base")

    # The classification heads return raw logits, one per class (two by default),
    # while the labels are integer class ids. Sparse categorical cross-entropy on
    # logits matches that; 'binary_crossentropy' expects probabilities of a single
    # output and therefore optimized (and scored accuracy on) the wrong quantity.
    clf.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    history = clf.fit(train_distant_dataset, epochs=5, validation_data=test_distant_dataset, callbacks=[callback])
    clf.save_weights(f'/content/drive/MyDrive/Colab Notebooks/weights/{model_name}_final_checkpoint_news_headlines_USA')
    return history

In [8]:
# fine-tune the DeBERTa classifier with a batch size of 128
train_model(
    train_df=df_distant_train,
    test_df=df_distant_test,
    model_name='deberta',
    batch=128,
)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/744 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Downloading:   0%|          | 0.00/555M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDebertaForSequenceClassification.

Some layers of TFDebertaForSequenceClassification were not initialized from the model checkpoint at kamalkraj/deberta-base and are newly initialized: ['classifier', 'cls_dropout', 'pooler']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
# fine-tune the BERT classifier with a batch size of 128
train_model(
    train_df=df_distant_train,
    test_df=df_distant_test,
    model_name='bert',
    batch=128,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# fine-tune the RoBERTa classifier with a batch size of 128
train_model(
    train_df=df_distant_train,
    test_df=df_distant_test,
    model_name='roberta',
    batch=128,
)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
