In [4]:
!pip install -q transformers datasets evaluate accelerate scikit-learn

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# Read both CSV splits with pandas, then wrap each frame in a Hugging Face
# `Dataset` so it can be tokenized with `.map()` and passed to the Trainer.
# NOTE(review): the split named "val" is built from test.csv - confirm that a
# separate held-out set exists for the final evaluation.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

train_dataset = Dataset.from_pandas(df=train_df)
val_dataset = Dataset.from_pandas(df=test_df)

# Tokenizer
# Checkpoint identifier; the same name is reused further down to load the model.
model_name = "distilbert-base-uncased"
# `from_pretrained` loads the tokenizer files published for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def tokenize_function(examples):
    """Tokenize the ``cleaned_text`` column of a batch of examples.

    Args:
        examples: Mapping with a ``"cleaned_text"`` entry holding the text(s)
            to encode (a batch of rows when called via ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["cleaned_text"]
    # truncation=True cuts over-long texts; padding="max_length" pads the rest
    # so every encoded example ends up with the same length.
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

print("Tokenizing data")
# batched=True hands groups of rows to tokenize_function in a single call
train_tokenized = train_dataset.map(function=tokenize_function, batched=True)
val_tokenized = val_dataset.map(function=tokenize_function, batched=True)

# Metrics reported for now: accuracy (share of predictions that are correct)
# and F1 (balance between precision and recall).
accuracy_metric, f1_metric = map(evaluate.load, ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Turn raw evaluation output into a dict of metric scores.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        The accuracy result dict merged with the F1 result dict.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {}
    # Same merge order as before: accuracy first, then F1.
    for metric in (accuracy_metric, f1_metric):
        scores.update(metric.compute(predictions=predicted_classes, references=labels))
    return scores

# Load the pre-trained DistilBERT checkpoint with a classification head.
# - num_labels=2: two classes, hate (1) or not hate (0).
# - AutoModel* resolves the concrete model class from the checkpoint name, so
#   switching checkpoints later only means changing `model_name`.
# - *ForSequenceClassification gives the whole sequence one score (is the
#   sentence itself hateful?), not each individual word; labelling every word
#   separately would use *ForTokenClassification instead.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

#model training parameters
training_args = TrainingArguments(
    output_dir="./results",          #checkpoints are written here
    num_train_epochs=2,              #loop through the data 2 times
    per_device_train_batch_size=16,  #learn 16 tweets at a time
    per_device_eval_batch_size=64,   #evaluation batch size per device
    eval_strategy="epoch",           #test after every loop
    save_strategy="epoch",           #save model after every loop
    load_best_model_at_end=True,     #reload the best checkpoint when training ends
    #Pick "best" by F1 instead of the default (validation loss): a checkpoint
    #can have the lowest loss while scoring almost zero F1 on the hate class,
    #and that checkpoint would otherwise be the one reloaded and saved.
    metric_for_best_model="f1",      #key returned by compute_metrics
    greater_is_better=True,          #higher F1 is better
)

# Bundle the model, the training configuration, the tokenized splits and the
# metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
)

# Run fine-tuning
print("Starting training")
trainer.train()

# Write the model and then the tokenizer into the same folder so the pair can
# be downloaded and reloaded together later.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./hate_speech_model")
print("Model saved to folder")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing data


Map:   0%|          | 0/19824 [00:00<?, ? examples/s]

Map:   0%|          | 0/4957 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhuudahmad[0m ([33mhuudahmad-n-a[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1777,0.157404,0.941497,0.006849
2,0.1463,0.159779,0.947347,0.408163


Model saved to folder


In [5]:
import shutil
from google.colab import files

# Pack the contents of the saved-model folder into hate_speech_model.zip
shutil.make_archive(
    base_name='hate_speech_model',
    format='zip',
    root_dir='./hate_speech_model',
)

# Trigger a browser download of the archive from the Colab runtime
files.download('hate_speech_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
import torch
# Report whether PyTorch can use a CUDA GPU in this runtime; the boolean result
# is shown as the cell output.
torch.cuda.is_available()

True