In [None]:
!pip install datasets
!pip install tensorflow
!pip install transformers

In [None]:
import os
from getpass import getpass

import numpy as np
import tensorflow as tf
from datasets import load_dataset
from google.colab import drive
from huggingface_hub import HfApi
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import (AutoTokenizer, PushToHubCallback, TFAutoModelForSequenceClassification)

# Mount Google Drive; both artifact locations below live under MyDrive/output.
drive.mount('/content/drive')

# File the ModelCheckpoint callback writes weights to.
checkpoint_path = "/content/drive/MyDrive/output/checkpoint.ckpt"

# Directory for Hub-related output; created up front (parents included) and
# left untouched when it already exists.
output_dir = "/content/drive/MyDrive/output/toxic_model_hub"
os.makedirs(output_dir, exist_ok=True)

# Never store an access token in the notebook itself: anyone who can read the
# file can use it. Reuse a token that is already exported in the environment,
# otherwise ask for one interactively (getpass does not echo the input).
# NOTE(review): the token that used to be hardcoded on this line must be treated
# as leaked and revoked in the Hugging Face account settings.
if not os.environ.get("HUGGING_FACE_HUB_TOKEN"):
    os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass("Hugging Face access token: ")

# Label columns read from each record; list order fixes each label's index.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Lookups in both directions between a label name and its position in `labels`
# (handed to the model as id2label / label2id).
index_to_label = dict(enumerate(labels))
label_to_index = {name: position for position, name in index_to_label.items()}

# Tokenizer for the same checkpoint name that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

# Read train.csv (relative to the working directory) with the generic CSV loader;
# the result is indexed by split name further down ("train").
dataset = load_dataset(
    "csv",
    data_files="train.csv",
)

def preprocess_data(record):
    """Tokenize a batch of comments and attach one multi-hot label row per comment.

    Intended for ``Dataset.map(..., batched=True)``: each value in ``record``
    is a list holding one entry per example in the batch.

    Args:
        record: Batch mapping containing ``"comment_text"`` and one column for
            every name in the module-level ``labels`` list.

    Returns:
        The tokenizer output for the comments with an added ``"labels"`` entry:
        a list of rows, each holding ``len(labels)`` floats in ``labels`` order.

    Raises:
        KeyError: If ``"comment_text"`` or one of the label columns is missing.
    """
    comments = record["comment_text"]

    # Fixed-length encoding: pad short comments and truncate long ones so that
    # every row has the same number of tokens.
    encoded = tokenizer(comments, padding="max_length", truncation=True)

    # Stack the label columns as (num_labels, batch) and transpose to
    # (batch, num_labels); float dtype so the rows come out as Python floats.
    multi_hot = np.array([record[name] for name in labels], dtype=float).T

    encoded["labels"] = multi_hot.tolist()
    return encoded

# Load bert-base-uncased for sequence classification with one output per entry
# in `labels`, and store the name <-> index lookups in the model configuration.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    id2label=index_to_label,
    label2id=label_to_index,
    problem_type="multi_label_classification",
)

# Run preprocess_data over the dataset in batches. The raw text, the id column
# and the individual label columns are dropped afterwards, leaving the
# tokenizer fields plus the combined "labels" column.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=labels + ["id", "comment_text"],
)

# Turn the "train" split into a shuffled tf.data pipeline of 32-example batches.
tf_dataset = model.prepare_tf_dataset(
    encoded_dataset["train"],
    tokenizer=tokenizer,
    shuffle=True,
    batch_size=32,
)

# Linear decay (power=1.0) of the learning rate from 3e-5 down to 0 over
# 10000 optimizer steps.
lr_schedule = PolynomialDecay(
    initial_learning_rate=3e-5,
    end_learning_rate=0.0,
    decay_steps=10000,
    power=1.0,
)

# Write the model weights (not the full model) to checkpoint_path during
# training, logging each save.
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# The classification head returns raw logits, not probabilities. The string
# alias "binary_crossentropy" builds the loss with from_logits=False, which
# would treat those logits as probabilities and yield a wrong loss, so the
# loss is constructed explicitly and applies the sigmoid itself.
model.compile(
    optimizer=Adam(learning_rate=lr_schedule),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
)

# Single pass over the training data; weights are checkpointed via ckpt_callback.
model.fit(tf_dataset, epochs=1, callbacks=[ckpt_callback])

In [None]:
# Interactive login through the Hugging Face CLI; the cells that follow push
# the model and the tokenizer to the Hub.
!huggingface-cli login

In [13]:
# Upload the trained model to the Hub repository named "toxic-classifier".
model.push_to_hub("toxic-classifier")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

tf_model.h5:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [16]:
# Upload the tokenizer to the same "toxic-classifier" repository as the model.
tokenizer.push_to_hub("toxic-classifier")

CommitInfo(commit_url='https://huggingface.co/jjderz/toxic-classifier/commit/9a9498e9e8c197182767ece2df960a02cc8de9d0', commit_message='Upload tokenizer', commit_description='', oid='9a9498e9e8c197182767ece2df960a02cc8de9d0', pr_url=None, pr_revision=None, pr_num=None)