In [1]:
import torch 
from collections.abc import Iterable
from datasets import load_dataset, list_datasets

# Model and tokenizer from 🤗 Transformers
from transformers import AutoModelForSequenceClassification, \
    BertForSequenceClassification, BertTokenizerFast, AutoModel, AutoTokenizer

# Code you will write for this assignment
from train_model import init_model, preprocess_dataset, init_trainer
from test_model import init_tester

In [2]:
hate_speech = load_dataset("hate_speech_offensive")

# Peel off two held-out splits from "train", one after the other: first 10% of
# the data becomes "test", then 10% of what remains becomes "val". Both passes
# use the same fixed seed so the partition is reproducible.
for held_out in ("test", "val"):
    split = hate_speech["train"].train_test_split(.1, seed=3463)
    hate_speech["train"] = split["train"]
    hate_speech[held_out] = split["test"]

Found cached dataset hate_speech_offensive (/Users/junzeli/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /Users/junzeli/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-3a9bf3a1049559c7.arrow and /Users/junzeli/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-69dfaebdf31e0aae.arrow
Loading cached split indices for dataset at /Users/junzeli/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-b2a34171e8c0b921.arrow and /Users/junzeli/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-38b528465638c7b4.arrow


In [3]:
# Load the tokenizer published with the "vinai/bertweet-base" checkpoint; it is
# passed to preprocess_dataset for every split.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Run every split through preprocess_dataset with the shared tokenizer,
# replacing each split with its preprocessed version.
for split_name in ("train", "val", "test"):
    hate_speech[split_name] = preprocess_dataset(hate_speech[split_name], tokenizer)

# Visualize the preprocessed dataset: for each column of the first two
# validation rows, print the column name, its container type and a preview in
# which sliceable values are cut down to their first 20 elements.
for k, v in hate_speech["val"][:2].items():
    preview = []
    for item in v[:5]:
        if isinstance(item, Iterable):
            preview.append(item[:20])
        else:
            preview.append(item)
    print("{}:\n{}\n{}\n".format(k, type(v), preview))

Map:   0%|          | 0/20073 [00:00<?, ? examples/s]

Map:   0%|          | 0/20073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2231 [00:00<?, ? examples/s]

Map:   0%|          | 0/2231 [00:00<?, ? examples/s]

Map:   0%|          | 0/2479 [00:00<?, ? examples/s]

Map:   0%|          | 0/2479 [00:00<?, ? examples/s]

count:
<class 'list'>
[3, 3]

hate_speech_count:
<class 'list'>
[0, 0]

offensive_language_count:
<class 'list'>
[3, 3]

neither_count:
<class 'list'>
[0, 0]

class:
<class 'list'>
[1, 1]

tweet:
<class 'list'>
['@CinWicked Did you j', 'Fuck bitches get mon']

class_converted:
<class 'list'>
[1, 1]

input_ids:
<class 'list'>
[[0, 5238, 29442, 22972, 689, 14, 45, 329, 122, 4256, 9893, 3866, 11, 1287, 50239, 41407, 50239, 12, 2, 1], [0, 1004, 1179, 51, 325, 4056, 43027, 515, 12, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

token_type_ids:
<class 'list'>
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

attention_mask:
<class 'list'>
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

labels:
<class 'list'>
[1, 1]



In [6]:
# Build the trainer for the "vinai/bertweet-base" checkpoint from the train and
# validation splits.
# NOTE(review): the log printed by this call warns "You are using a model of
# type roberta to instantiate a model of type bert" — presumably the model is
# loaded through a BERT-specific class inside train_model.py; confirm that this
# is intended.
trainer = init_trainer("vinai/bertweet-base", hate_speech["train"], hate_speech["val"])

loading configuration file config.json from cache at /Users/junzeli/.cache/huggingface/hub/models--vinai--bertweet-base/snapshots/118ab1d567653bec16bbb081eafb6f8942f72108/config.json
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Model config BertConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertweetTokenizer",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "

In [5]:
import pickle

def read_result(file):
    """Load and return the Python object pickled in ``file``.

    Args:
        file: Path (``str`` or path-like) of a file written with
            ``pickle.dump``.

    Returns:
        The object that was stored in the file.

    Raises:
        OSError: If the file cannot be opened for reading.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code. Only call this on
    # result files produced by a trusted run, never on downloaded data.
    with open(file, 'rb') as f:
        # Local name deliberately is not `object`, which would shadow the builtin.
        result = pickle.load(f)
    return result

In [15]:
# Load the pickled results stored in 'test_results.p'.
# NOTE(review): presumably this file was written by the tester from
# test_model.py — confirm. Element 0 is what gets argmax-ed into predicted
# labels further down.
obj = read_result('test_results.p')

In [37]:
import numpy as np

# Pull the tweets and their labels out of the test split as NumPy arrays so
# that they can be filtered with boolean masks.
test_split = hate_speech['test']
texts = np.array(test_split['tweet'])
true_labels = np.array(test_split['labels'])
len(true_labels)

2479

In [38]:
# Take the argmax over axis 1 of the first element of the loaded results to get
# one predicted class index per row.
# NOTE(review): assumes obj[0] is a (num_examples, num_classes) score array — TODO confirm.
pred_labels = obj[0].argmax(axis=1)
# Number of predictions; it must equal len(true_labels) for the two arrays to
# be combined in a boolean mask.
len(pred_labels)

2479

In [40]:
# Show the test tweets whose true label is 0 but whose predicted label is 1.
# NOTE(review): what labels 0 and 1 stand for depends on the class conversion
# done in preprocess_dataset — confirm before interpreting these errors.
texts[(true_labels == 0) & (pred_labels == 1)]

array(['Up early then a bitch driving to denton omg can I move already',
       '"@nohands_24: "@20ToLife_: "@nohands_24: "@DejaaLeeann_: @B_Coleman2 swears I\'m #1 man" easily" easily" behind Austin" nip?',
       "@TheIronPony You rebel. Next you'll be ripping mattress tags off. It's a slippery slope.",
       'This bitch @Lvl_7_Eevee buys sparkling water.',
       "lmaoooo RT @PhillyTheBoss: Everything on BET is trash. RT @MrCOOK_: B-Rad RT @fkinLIVE: White Mike was on BET, he can't be trash",
       'Sixten just shoved like 8 Oreos down his throat',
       "&#8220;@CommonManKFAN: And that's why mock drafts are a joke my friends.....&#8221;Anyone???",
       'RT @briangaar: Old white people complaining about government tyranny is like the Yankees being upset that players make too much money',
       'Are poor little guinea pig died :(',
       'Finally got my medicine and crackers!',
       'Joba Chamberlain looks like a homeless guy that won a contest. #Yankees',
       "@eGoTheIco