In [None]:
#imports
from datasets import load_dataset, load_metric, list_metrics
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from scipy.special import softmax
import numpy as np 
import pandas as pd

In [None]:
#pre-process tweet data
target_labels = ['not_cyberbullying', 'age', 'ethnicity', 'gender', 'religion', 'other_cyberbullying']

dataset_df = pd.read_csv("data/cyberbullying_tweets/cyberbullying_tweets.csv")
dataset_df.rename(columns = {'cyberbullying_type':'label', 'tweet_text':'text'}, inplace = True)
dataset_df['label'] = dataset_df['label'].replace({'not_cyberbullying':0, 'age':1, 'ethnicity':2, 'gender':3, 'religion':4, 'other_cyberbullying':5})
dataset_df.to_csv("data/cyberbullying_tweets/cyberbullying_tweets.csv", index=False)
dataset_df

In [47]:
#load bullying speech dataset and store it
dataset = load_dataset("data/cyberbullying_tweets")

input_labels = np.unique(np.array(dataset['train']['label']))
label_count = len(input_labels)
print("Data set structure:", dataset, "\nLabels:", input_labels)

Using custom data configuration cyberbullying_tweets-6558229128e7786d
Reusing dataset csv (/Users/haydenprescott/.cache/huggingface/datasets/csv/cyberbullying_tweets-6558229128e7786d/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/1 [00:00<?, ?it/s]

Data set structure: DatasetDict({
    train: Dataset({
        features: ['tweet_text', 'cyberbullying_type'],
        num_rows: 47692
    })
}) 
Labels: ['age' 'ethnicity' 'gender' 'not_cyberbullying' 'other_cyberbullying'
 'religion']


In [48]:
# set the base model to a bert-base-cased
model_name = "bert-base-cased"

In [49]:
# get the tokenizer from the model and store it
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /Users/haydenprescott/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/

In [50]:
# define the tokenize function that the tokenizer will be used for
def tokenize(samples):
    return tokenizer(samples['text'], padding="max_length", truncation=True)

In [51]:
#tokenize the data and store it as a new variable, then determine the number of unique labels that will be used to classify the data during training and evaluation
dataset_tokens = dataset.map(tokenize, batched=True)

  0%|          | 0/48 [00:00<?, ?ba/s]

In [52]:
# randomize the tokenized data, then split it up into a training set and an evaluation set
shuffle = dataset_tokens['train'].shuffle(seed=42)

train_count = int(shuffle.num_rows * 0.9) 

dataset_train = shuffle.select(range(0, train_count))
dataset_eval = shuffle.select(range(train_count, shuffle.num_rows))
print("Eval:", dataset_eval, "\nTrain:", dataset_train)

Eval: Dataset({
    features: ['tweet_text', 'cyberbullying_type', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4770
}) 
Train: Dataset({
    features: ['tweet_text', 'cyberbullying_type', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 42922
})


In [53]:
# create an instance of the base model that will be trained
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_count)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /Users/haydenprescott/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_a

In [54]:
# set the characteristics of how the model will be trained (ex: the similarity of the output to the ground truth will be evaluated every epoch)
training_args = TrainingArguments(output_dir="bullying_model", evaluation_strategy="epoch")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [55]:
# load the accuracy metric
acc_metric = load_metric("accuracy")

In [56]:
# define a fuction that will evaluate the accuracy of the model's output
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return acc_metric.compute(predictions=predictions, references=labels)

In [57]:
# create an instance of the trainer class
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    compute_metrics=compute_metrics,
)

In [None]:
# train the model!
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: source_data, text, id. If source_data, text, id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9758
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3660


In [None]:
ft_model_checkpoint = "bullying_model/checkpoint-3500"

In [None]:
ft_model = AutoModelForSequenceClassification.from_pretrained(ft_model_checkpoint)

In [None]:
sample_text = "I like cats."

In [None]:
sample_tokens = tokenizer(sample_text, return_tensors="pt", padding="max_length", truncation=True)
sample_out = ft_model(**sample_tokens)
scores = sample_out[0][0].detach().numpy()
scores = softmax(scores)

labeled_scores = list(tuple(zip([target_labels[idx] for idx in input_labels], scores)))
labeled_scores.sort(key=lambda y: y[1], reverse=True)
print(labeled_scores)