### Basic Transformer Model Implementation
Adapted from the Hugging Face sequence classification guide:
    https://huggingface.co/docs/transformers/tasks/sequence_classification

In [61]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import create_optimizer
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
from transformers import pipeline
from tqdm import tqdm

In [58]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\Ken\.cache\huggingface\token
Login successful


In [None]:
# !pip install transformers datasets evaluate

In [50]:
!pip install tensorflow



In [None]:
# !pip3 install --upgrade tensorflow-gpu --user

In [23]:
imdb = load_dataset("imdb")

Found cached dataset imdb (C:/Users/Ken/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [28]:
imdb["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [29]:
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [31]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [32]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook-level ``tokenizer``.

    ``truncation=True`` is forwarded to the tokenizer; its output is returned as-is.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [33]:
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [40]:
tokenized_imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [51]:
# BELOW IS FOR PYTORCH

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [34]:
# BELOW IS FOR TENSORFLOW

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [52]:
accuracy = evaluate.load("accuracy")

In [53]:
def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the notebook-level ``accuracy`` metric.

    The predicted class for each row is the argmax of ``predictions`` along axis 1.
    Whatever ``accuracy.compute`` returns is passed back unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [54]:
# Class-index -> human-readable label, and the inverse lookup derived from it
# so the two mappings cannot drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}

In [55]:
# BELOW IS FOR PYTORCH

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [62]:
# Fine-tuning configuration: optimisation hyperparameters, per-epoch
# evaluation/checkpointing, and upload of the result to the Hugging Face Hub.
training_args = TrainingArguments(
    output_dir="transformer_1_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Wire the model, tokenized splits, padding collator and accuracy metric together.
# NOTE(review): eval_dataset is the IMDB "test" split and load_best_model_at_end=True
# picks a checkpoint from those evaluations, so the test split also drives model
# selection — confirm this is intended or hold out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# tqdm(trainer.train())
trainer.train()

C:\Users\Ken\Documents\UChicago\CAPP_30255-Advanced_Machine_Learning_and_NLP\Final_Project\toxic-language-filter\ken\my_awesome_model is already a clone of https://huggingface.co/kenkliesner/my_awesome_model. Make sure you pull the latest changes with `repo.git_pull()`.


KeyboardInterrupt: 

In [None]:
trainer.push_to_hub()

# ABOVE IS FOR PYTORCH

In [41]:
# from transformers import create_optimizer
# import tensorflow as tf

In [43]:
# print(tf.__version__)

2.12.0


In [None]:
# !pip install tensorflow-gpu

# https://stackoverflow.com/questions/70624869/tfbertforsequenceclassification-requires-the-tensorflow-library-but-it-was-not-f

In [None]:
# BELOW IS FOR TENSORFLOW

# batch_size = 16
# num_epochs = 5
# batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
# total_train_steps = int(batches_per_epoch * num_epochs)
# optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
# model = TFAutoModelForSequenceClassification.from_pretrained(
#     "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
# )

In [None]:
# tf_train_set = model.prepare_tf_dataset(
#     tokenized_imdb["train"],
#     shuffle=True,
#     batch_size=16,
#     collate_fn=data_collator,
# )

# tf_validation_set = model.prepare_tf_dataset(
#     tokenized_imdb["test"],
#     shuffle=False,
#     batch_size=16,
#     collate_fn=data_collator,
# )

In [None]:
# model.compile(optimizer=optimizer)

In [None]:
# metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
# push_to_hub_callback = PushToHubCallback(
#     output_dir="my_awesome_model",
#     tokenizer=tokenizer,
# )

In [None]:
# callbacks = [metric_callback, push_to_hub_callback]

In [None]:
# model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)

# ABOVE IS FOR TENSORFLOW

In [None]:
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

In [None]:
classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model")
classifier(text)

In [None]:
# BELOW IS FOR PYTORCH

tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
inputs = tokenizer(text, return_tensors="pt")

In [None]:
# torch is used below but was only imported in a later cell, so a fresh
# "Restart & Run All" failed here with NameError. Import it where it is needed.
import torch

model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
# Inference only: run the forward pass without gradient tracking.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

# ABOVE IS FOR PYTORCH

In [None]:
# BELOW IS FOR TENSORFLOW

# tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
# inputs = tokenizer(text, return_tensors="tf")

In [None]:
# model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
# logits = model(**inputs).logits

In [None]:
# predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
# model.config.id2label[predicted_class_id]

# ABOVE IS FOR TENSORFLOW

In [None]:
balanced_data = pd.read_csv("./data/balanced_data.csv")

In [None]:
balanced_data.head()

In [None]:
balanced_data.iloc[2]

In [None]:
balanced_data["comment_text"][3]

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

def balance_data(dataset, label):
    '''
    Return a shuffled copy of ``dataset`` with as many ``label == 1`` rows
    as ``label == 0`` rows.

    Whichever of the two classes is larger is randomly downsampled
    (``random_state=12``) to the size of the smaller one; the rows are then
    shuffled (``random_state=42``) and re-indexed from 0.

    dataset: Pandas dataframe containing the column ``label``
    label: name of a column whose values are 1 (positive) / 0 (negative)
    '''
    pos_data = dataset[dataset[label] == 1]
    neg_data = dataset[dataset[label] == 0]

    # Downsample the larger class. Sampling the negatives unconditionally raises
    # ValueError ("cannot take a larger sample than population") whenever the
    # negatives are the minority.
    if len(neg_data) >= len(pos_data):
        neg_data = neg_data.sample(n=len(pos_data), random_state=12)
    else:
        pos_data = pos_data.sample(n=len(neg_data), random_state=12)

    df = pd.concat([neg_data, pos_data])
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    return df


class CustomDataset(Dataset):
    '''
    Dataset over a Pandas dataframe that yields ``(text, label)`` pairs.
    '''
    def __init__(self, data, transform=None, text_col="comment_text", label_col="toxic"):
        '''
        data: Pandas dataframe holding the text and label columns
        transform: optional callable ``(text, label) -> (text, label)`` applied
            to every sample before it is returned
        text_col: name of the column containing the text
        label_col: name of the column containing the label
        '''
        self.data = data
        self.transform = transform
        self.text_col = text_col
        self.label_col = label_col

    def __len__(self):
        '''Number of rows in the underlying dataframe.'''
        return len(self.data)

    def __getitem__(self, index):
        '''Return the ``(text, label)`` pair of the row at position ``index``.'''
        # Positional lookup, so dataframes with a non-contiguous index
        # (e.g. after train_test_split) are handled.
        sample = self.data.iloc[index]
        # Extract the relevant columns from the row
        text = sample[self.text_col]
        label = sample[self.label_col]
        # Apply any transformations to the data
        if self.transform:
            text, label = self.transform(text, label)
        return text, label

In [None]:
# split data: train 70%, validation ~20%, test ~10%
# Two-stage split: first hold out 30% of the rows, then cut that remainder
# into validation and test. 0.33 of the 30% is ~9.9% of all rows for test,
# leaving ~20.1% for validation. random_state=1 keeps both splits reproducible.
train_data, rest_data = train_test_split(balanced_data, test_size=0.3, random_state=1)
val_data, test_data = train_test_split(rest_data, test_size=0.33, random_state=1)

In [None]:
# load data into customized datasets
# Wrap each dataframe split in a CustomDataset (no transform applied).
train_set, val_set, test_set = (
    CustomDataset(split_frame)
    for split_frame in (train_data, val_data, test_data)
)

# Report the size of each split.
print("train:", len(train_set), "validation:", len(val_set), "test", len(test_set))