In [1]:
import time
import random
import os

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import DebertaTokenizer, TFDebertaForSequenceClassification
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# These auto classes load the right type of tokenizer and model based on a model name
from transformers import AutoTokenizer, TFAutoModel

2022-10-29 10:04:18.362953: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-29 10:04:18.563329: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-29 10:04:19.274604: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/jstil/miniconda3/envs/tf/lib/
2022-10-29 10:04:19.276950: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin

In [2]:
# set seed, TF uses python ramdom and numpy library, so these must also be fixed
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)
os.environ['PYTHONHASHSEED']=str(0)
os.environ['TF_DETERMINISTIC_OPS'] = '0'

In [3]:
def read_media_cloud_data(path, label):
    """Read in data downloaded from media cloud and assign a label to all rows"""
    df = pd.read_csv(path)
    df['Label_bias'] = label
    df = df.rename({'title': 'sentence'}, axis=1)
    return df

# read in two datasets
PATH_biased = "data/news_headlines_usa_biased.csv"
PATH_neutral = "data/news_headlines_usa_neutral.csv"
df_biased = read_media_cloud_data(PATH_biased, 1)
df_neutral = read_media_cloud_data(PATH_neutral, 0)

# combine them
df_distant = pd.concat([df_biased,df_neutral], axis=0, ignore_index=1)
df_distant = shuffle(df_distant)

# train-test split
df_distant_train, df_distant_test = train_test_split(df_distant, test_size=0.2)

# # test pipeline set
# df_distant, exclude = train_test_split(df_distant, test_size=0.95)
# df_distant_train, df_distant_test = train_test_split(df_distant, test_size=0.2)

In [4]:
def preprocess(df, model_name):
    """convert a pandas dataframe into a tensorflow dataset"""
    df2 = df.copy(deep=False)
    target = df2.pop('Label_bias')
    sentence = df2.pop('sentence')

    if model_name=='bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif model_name=='roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    elif model_name=='deberta':
        tokenizer = DebertaTokenizer.from_pretrained("kamalkraj/deberta-base")

    train_encodings = tokenizer(
                        sentence.tolist(),                      
                        add_special_tokens = True, # add [CLS], [SEP]
                        truncation = True, # cut off at max length of the text that can go to BERT
                        padding = True, # add [PAD] tokens
                        return_attention_mask = True, # add attention mask to not focus on pad tokens
              )
    
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(train_encodings), 
         target.tolist()))
    return dataset

In [5]:
def train_model(train_df, test_df, model_name):
    # pandas -> tensorflow
    train_distant_dataset = preprocess(train_df, model_name)
    test_distant_dataset = preprocess(test_df, model_name)

    # batch and randomize
    BUFFER_SIZE = 10000
    BATCH_SIZE = 24

    train_distant_dataset = train_distant_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    test_distant_dataset = test_distant_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
    
    tf.keras.backend.clear_session()
    
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True) # after 3 epochs without improvement, stop training
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    
    if model_name=='bert':
        clf = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    elif model_name=='roberta':
        clf = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
    elif model_name == 'deberta':
        clf = TFDebertaForSequenceClassification.from_pretrained("kamalkraj/deberta-base")

    clf.compile(optimizer=optimizer, loss='binary_crossentropy', metrics='accuracy') 
    history = clf.fit(train_distant_dataset, epochs=5, validation_data = test_distant_dataset, callbacks=[callback])
    trained_layer = clf.get_layer(index=0).get_weights()
    clf.save_weights(f'./checkpoints/{model_name}_final_checkpoint_news_headlines_USA')

In [6]:
# train bert
train_model(df_distant_train, df_distant_test, 'bert')

2022-10-29 10:04:55.844302: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-10-29 10:04:55.883151: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-10-29 10:04:55.883719: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-10-29 10:04:55.884642: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

Epoch 1/5


2022-10-29 10:05:55.772237: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 28015456 exceeds 10% of free system memory.


Epoch 2/5


In [7]:
# train roberta
train_model(df_distant_train, df_distant_test, 'roberta')

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5


In [None]:
# train deberta
train_model(df_distant_train, df_distant_test, 'deberta')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
All model checkpoint layers were used when initializing TFDebertaForSequenceClassification.

Some layers of TFDebertaForSequenceClassification were not initialized from the model checkpoint at kamalkraj/deberta-base and are newly initialized: ['cls_dropout', 'pooler', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Epoch 2/5
Epoch 3/5
