# Transformer model

ref: https://huggingface.co/course/chapter2/2?fw=pt

## Make Dataset

In [1]:
import pandas as pd
import numpy as np
import datasets
from datasets import Dataset, DatasetDict

In [2]:
data_df = pd.read_pickle('../../../dataset/FakeNewsNet/data/FakeNewsNet.pkl')
data_df['label_raw'] = data_df['label']
data_df['label'] = data_df['label'].map({'false': 0, 'true': 1})
data_df.head(10)

Unnamed: 0,text,processed_text,label,label_raw
0,On Air with Ryan Seacrest is offering you a ch...,"[air, ryan, seacrest, offer, chance, win, nigh...",0,False
1,‘American Idol’ final: How to vote for the sea...,"[american, idol, final, vote, season, winner, ...",0,False
2,@ScottDisick @KrisJenner @khloekardashian — LA...,"[latest, art, shame, revenge, prank, banksy, s...",0,False
3,@foquinha Youngblood - 5 Seconds of Summer \nO...,"[youngblood, seconds, summer, little, mix, del...",0,False
4,Kylie Jenner ‘Open’ To Reconciliation With Tyg...,"[kylie, jenner, open, reconciliation, tyga, pr...",0,False
5,@Khalais1 @ibpqueen @IstantheBadGuy @_luluomar...,"[yes, studio, album, album, consistent, let, k...",0,False
6,@realDonaldTrump Says the Jesuit of his brothe...,"[say, jesuit, brother, entrench, rome, payday,...",0,False
7,Kim Kardashian Recalls “Tough Conversation” Wi...,"[kim, kardashian, recalls, tough, conversation...",0,False
8,"RT @rihanna: RT @RyanSeacrest: ""Nobody really ...","[rt, rt, care, miserable, happy, cynthia, nelm]",0,False
9,Portia de Rossi: Ellen Divorce Rumors Make Us ...,"[portia, de, rossi, ellen, divorce, rumors, fe...",0,False


In [3]:
# Train test split
from sklearn.model_selection import train_test_split


def get_shape(X, y):
    return {
        "shape": X.shape,
        "true": np.count_nonzero(y),
        "false": len(y) - np.count_nonzero(y),
    }


X_train, X_test, y_train, y_test = train_test_split(
    data_df["text"], data_df["label"], test_size=0.2, random_state=2023
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=2023
)

print("train", get_shape(X_train, y_train))
print("val", get_shape(X_val, y_val))
print("test", get_shape(X_test, y_test))


train {'shape': (875639,), 'true': 596727, 'false': 278912}
val {'shape': (218910,), 'true': 149047, 'false': 69863}
test {'shape': (273638,), 'true': 186595, 'false': 87043}


In [4]:
data_ds = DatasetDict()

data_ds['train'] = Dataset.from_pandas(pd.concat([X_train, y_train], axis=1))
data_ds['validation'] = Dataset.from_pandas(pd.concat([X_val, y_val], axis=1))
data_ds['test'] = Dataset.from_pandas(pd.concat([X_test, y_test], axis=1))

data_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 875639
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 218910
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 273638
    })
})

In [2]:
liar_df = pd.read_pickle("../../../dataset/LIAR/data/master.pkl")
liar_df['text'] = liar_df['statement']
liar_df['label_raw'] = liar_df['label']
liar_df['label'] = liar_df['label_raw'].apply(lambda x: 1 if x else 0)

# Train test split
from sklearn.model_selection import train_test_split


def get_shape(X, y):
    return {
        "shape": X.shape,
        "true": np.count_nonzero(y),
        "false": len(y) - np.count_nonzero(y),
    }


X_liar_train, X_liar_test, y_liar_train, y_liar_test = train_test_split(
    liar_df["text"], liar_df["label"], test_size=0.2, random_state=2023
)
X_liar_train, X_liar_val, y_liar_train, y_liar_val = train_test_split(
    X_liar_train, y_liar_train, test_size=0.2, random_state=2023
)

print("train", get_shape(X_liar_train, y_liar_train))
print("val", get_shape(X_liar_val, y_liar_val))
print("test", get_shape(X_liar_test, y_liar_test))

train {'shape': (8185,), 'true': 4560, 'false': 3625}
val {'shape': (2047,), 'true': 1149, 'false': 898}
test {'shape': (2559,), 'true': 1425, 'false': 1134}


In [3]:
liar_df.head()

Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,statement_processed,text,label_raw
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,say Annies List political group support trimes...,Says the Annies List political group supports ...,False
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,decline coal start start natural gas take star...,When did the decline of coal start? It started...,True
2,324.json,1,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Hillary Clinton agree John McCain vote George ...,"Hillary Clinton agrees with John McCain ""by vo...",True
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,health care reform legislation likely mandate ...,Health care reform legislation is likely to ma...,False
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,economic turnaround start end term,The economic turnaround started at the end of ...,True


In [4]:
liar_ds = DatasetDict()

liar_ds['train'] = Dataset.from_pandas(pd.concat([X_liar_train, y_liar_train], axis=1))
liar_ds['validation'] = Dataset.from_pandas(pd.concat([X_liar_val, y_liar_val], axis=1))
liar_ds['test'] = Dataset.from_pandas(pd.concat([X_liar_test, y_liar_test], axis=1))

liar_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 8185
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2047
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2559
    })
})

## Fine Tune

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
tokenized_data = tokenizer(liar_ds['train']["text"], return_tensors="np", padding=True, truncation=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
tokenized_data = dict(tokenized_data)

labels = np.array(liar_ds['train']["label"])  # Label is already an array of 0 and 1

print("tokenized_data", tokenized_data['input_ids'].shape)
print("labels", labels.shape)

2023-02-02 03:15:55.041676: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-02 03:15:56.141236: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64
2023-02-02 03:15:56.141311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64


tokenized_data (8185, 512)
labels (8185,)


In [6]:
val_labels = np.array(liar_ds['validation']["label"])
val_tokenized_data = tokenizer(liar_ds['validation']["text"], return_tensors="np", padding=True, truncation=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
val_tokenized_data = dict(val_tokenized_data)

In [7]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Load and compile our model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5), loss="binary_crossentropy", metrics=["accuracy"])

model.summary()

2023-02-02 03:16:01.486136: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-02 03:16:01.518755: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-02 03:16:01.518975: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-02 03:16:01.519720: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, 

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [8]:
import os
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    TensorBoard,
    CSVLogger,
)

earlyStopping = EarlyStopping(monitor="val_loss", patience=3, verbose=0, mode="min")
modelCheckpoint = ModelCheckpoint(
    "./model/model_{epoch:02d}_{val_loss:.2f}.hdf5", verbose=1, save_freq=1000
)
tensorboard = TensorBoard(
    log_dir="./liar_logs",
    histogram_freq=1,
    write_graph=True,
    update_freq="epoch",
    embeddings_freq=1,
)
csvLogger = CSVLogger("./liar_training.log.csv", separator=",", append=False)

callbacks = [earlyStopping, modelCheckpoint, tensorboard, csvLogger]


In [11]:
model.fit(
    x=tokenized_data,
    y=labels,
    validation_data=(val_tokenized_data, val_labels),
    callbacks=callbacks,
    epochs=10000,
    batch_size=4
)


Epoch 1/10000
  26/2047 [..............................] - ETA: 6:02 - loss: 4.1821 - accuracy: 0.5000

KeyboardInterrupt: 