## Data preprocessing

### HASOC

In [135]:
import pandas as pd
import re
import os
import emoji

In [136]:
dirname = os.path.abspath('')
train_filename = os.path.join(dirname, "data\\HateSpeechRecoginition\\HASOC_german_dataset\\german_dataset.tsv")
test_filename = os.path.join(dirname, 'data\\HateSpeechRecoginition\\HASOC_german_dataset\\hasoc_de_test_gold.tsv')


train_raw_data = pd.read_csv(train_filename, sep='\t')
test_raw_data = pd.read_csv(test_filename, sep='\t')

print(len(train_raw_data))
print(len(test_raw_data))

train_raw_data.loc[[0]]

3819
850


Unnamed: 0,text_id,text,task_1,task_2
0,hasoc_de_1,Frank Rennicke – Ich bin stolz https://t.co/Cm...,NOT,NONE


In [137]:
# remove all usernames
def clean_tweet(tweet):
    tweet = re.sub("@[^\s]+",'USER',tweet)
    tweet = re.sub("http[^\s]+", "\n", tweet) #Remove http links
    tweet = re.sub("www[^\s]+", "\n", tweet) #Remove http links
    tweet = emoji.replace_emoji(tweet, '') #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    
    return tweet

train_raw_data['text'] = train_raw_data['text'].map(lambda x: clean_tweet(x))
test_raw_data['text'] = test_raw_data['text'].map(lambda x: clean_tweet(x))

# Binary labeling
train_raw_data['task_1'] = train_raw_data['task_1'].str.replace("NOT","0")
test_raw_data['task_1'] = train_raw_data['task_1'].str.replace("NOT","0")
train_raw_data['task_1'] = train_raw_data['task_1'].str.replace("HOF","1")
test_raw_data['task_1'] = train_raw_data['task_1'].str.replace("HOF","1")

# Create a validation dataset
from sklearn.model_selection import train_test_split

train, validation = train_test_split(train_raw_data, test_size=0.05, random_state=42)

# Save
train.to_csv('data/HateSpeechRecoginition/HASOC_train.csv')
test_raw_data.to_csv('data/HateSpeechRecoginition/HASOC_test.csv')
validation.to_csv('data/HateSpeechRecoginition/HASOC_validation.csv')

print(len(validation))

del train_raw_data
del test_raw_data
del validation
del train

191


In [139]:
# Open data in a suitable format for pytorch

import datasets
from datasets import load_dataset, load_from_disk
dataset = load_dataset('data/HateSpeechRecoginition', data_files={'train': 'HASOC_train.csv', 'test': 'HASOC_test.csv', 'validation': 'HASOC_validation.csv'})
dataset

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text_id', 'text', 'task_1', 'task_2'],
        num_rows: 3628
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text_id', 'text', 'task_1', 'task_2'],
        num_rows: 850
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'text_id', 'text', 'task_1', 'task_2'],
        num_rows: 191
    })
})

In [140]:
raw_train_ds = dataset["train"]
raw_test_ds = dataset["test"]
raw_val_ds = dataset["validation"]

## Training

In [141]:
from transformers import AutoTokenizer

checkpoint = 'bert-base-german-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets["train"][0]

Map:   0%|          | 0/3628 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

{'Unnamed: 0': 2304,
 'text_id': 'hasoc_de_2305',
 'text': "ef:'Korruption der Herrscher in Afrika: Der Griff in die Staatskasse: Korruption der Herrscher in Afrika: Der Griff in die Staatskasse \n '",
 'task_1': 0,
 'task_2': 'NONE',
 'input_ids': [3,
  454,
  26913,
  26964,
  26979,
  18183,
  21,
  14273,
  50,
  9061,
  26964,
  233,
  11759,
  50,
  30,
  1477,
  9472,
  26964,
  18183,
  21,
  14273,
  50,
  9061,
  26964,
  233,
  11759,
  50,
  30,
  1477,
  9472,
  26979,
  4],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [142]:
tokenized_datasets = tokenized_datasets.remove_columns(["text_id", "text","task_2"])
tokenized_datasets = tokenized_datasets.rename_column("task_1", "labels")

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3628
    })
    test: Dataset({
        features: ['Unnamed: 0', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 850
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 191
    })
})

In [143]:
# Dynamic padding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [144]:
from transformers import TrainingArguments
import torch

torch.cuda.is_available()

True

In [145]:
# Define Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [146]:
from transformers import Trainer
import numpy as np
import evaluate

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments('test-trainer', evaluation_strategy="epoch")

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [147]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.285191,0.921466,0.0
2,0.358900,0.298335,0.921466,0.0
3,0.346800,0.32299,0.879581,0.258065


TrainOutput(global_step=1362, training_loss=0.33593997591217534, metrics={'train_runtime': 341.6152, 'train_samples_per_second': 31.86, 'train_steps_per_second': 3.987, 'total_flos': 370287699316800.0, 'train_loss': 0.33593997591217534, 'epoch': 3.0})

In [148]:
# Save

model.save_pretrained('models/HateSpeechRecognition/model_01')
tokenizer.save_pretrained('models/HateSpeechRecognition/tokenizer_01')

('models/HateSpeechRecognition/tokenizer_01\\tokenizer_config.json',
 'models/HateSpeechRecognition/tokenizer_01\\special_tokens_map.json',
 'models/HateSpeechRecognition/tokenizer_01\\vocab.txt',
 'models/HateSpeechRecognition/tokenizer_01\\added_tokens.json',
 'models/HateSpeechRecognition/tokenizer_01\\tokenizer.json')

## Test

In [152]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

preds = np.argmax(predictions.predictions, axis=-1)

(850, 2) (850,)


In [153]:
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8917647058823529, 'f1': 0.041666666666666664}

In [172]:
print(tokenized_datasets["test"][38])

{'Unnamed: 0': 38, 'labels': 1, 'input_ids': [3, 3011, 5477, 26964, 26979, 1232, 1138, 287, 987, 21846, 26903, 26935, 573, 8174, 8425, 26914, 4847, 21846, 26903, 26935, 9593, 142, 30, 2586, 2579, 16391, 26914, 892, 6378, 18725, 6123, 232, 30, 21846, 26903, 26935, 7408, 7286, 281, 42, 287, 3361, 311, 6690, 142, 380, 8801, 21012, 26914, 2, 4], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [174]:
predictions[1][38]

1

In [154]:
# Custom test

from transformers import pipeline

myPipe = pipeline(model)
myPipe(["Sei still voll idiot"])

AttributeError: 'BertForSequenceClassification' object has no attribute 'startswith'