In [None]:
import csv
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, BertModel
from sklearn.model_selection import train_test_split
import re
import torch.nn as nn
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn import feature_extraction, preprocessing, naive_bayes
from sklearn.model_selection import KFold
import nltk
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import torchvision

In [None]:
# Path to the CSV file with the labelled sentences. The path is relative, so
# it is resolved against the kernel's current working directory.
path = "protest_data.csv"

In [None]:
# Load the CSV into two parallel lists of raw strings:
# data[0] collects the "text" column, data[1] the "2XisjFSSIK" column.
texts, raw_labels = [], []
with open(path, newline='') as csv_handle:
    for record in csv.DictReader(csv_handle):
        texts.append(record["text"])
        raw_labels.append(record["2XisjFSSIK"])
data = (texts, raw_labels)

In [None]:
# Preprocessing for the baseline: normalise mentions and URLs, strip agency
# tags, drop German stopwords, then stem and lemmatize the remaining tokens.
stopwords = nltk.corpus.stopwords.words("german")
# Set copy for O(1) membership tests; `stopwords` itself stays a list.
# NOTE(review): matching is case-sensitive, so capitalised stopwords survive -
# confirm whether that is intended.
stopword_set = set(stopwords)

# Raw strings keep regex escapes such as \( and \. from being interpreted as
# (invalid) Python string escapes. Compiled once, reused for every sentence.
mention_re = re.compile(r'@[a-zA-Z_]*')
symbol_re = re.compile(r'[&+#]')
# Matches http:// as well as https://, and a literal "www." (the dot is escaped).
url_re = re.compile(r'(https?://|www\.)[a-zA-Z./0-9!?_=-]*')
agency_re = re.compile(r'(\((dpa|ots)\)|(dpa|ots|AP))')

# Built once instead of once per sentence.
# NOTE(review): Porter stemming and WordNet lemmatization target English while
# the stopword list is German - confirm this combination is intended.
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

sentence_list = data[0]
cleaned_sentence_list = []
for sentence in sentence_list:
    sentence = mention_re.sub('@tag', sentence)
    sentence = symbol_re.sub('', sentence)
    sentence = url_re.sub('url', sentence)
    sentence = agency_re.sub('', sentence)
    word_list = [word for word in sentence.split() if word not in stopword_set]
    stemmed_list = [stemmer.stem(word) for word in word_list]
    lemmatized_list = [lemmatizer.lemmatize(word) for word in stemmed_list]
    cleaned_sentence_list.append(" ".join(lemmatized_list))

# split into train and test set (no validation data necessary) for baseline
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(cleaned_sentence_list, data[1], test_size=0.2, random_state=35)

In [None]:
# TF-IDF features for the baseline: uni- and bigrams, vocabulary capped at
# 10000 terms. The vectorizer is fitted on the training sentences only and
# then reused for the test sentences; both matrices are converted to dense arrays.
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_base).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test_base).toarray()

In [None]:
# Naive Bayes baseline: fit a multinomial model on the TF-IDF features and
# score it on the baseline test split.
classifier = naive_bayes.MultinomialNB()
classifier.fit(X_train_tfidf, y_train_base)
pred = classifier.predict(X_test_tfidf)

# The baseline labels are the raw strings read from the CSV, so the positive
# class has to be named as the string "1".
accuracy = accuracy_score(y_test_base, pred)
precision = precision_score(y_test_base, pred, pos_label="1")
recall = recall_score(y_test_base, pred, pos_label="1")
f1 = f1_score(y_test_base, pred, pos_label="1")

# Report the scores; they were previously computed but never shown.
print("baseline (accuracy, recall, precision, f1):", accuracy, recall, precision, f1)

In [None]:
# 70/15/15 split: hold out 30 % of the data first, then cut that hold-out in
# half to obtain the validation set and the test set.
X, y = data
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, random_state=35, test_size=0.3
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, random_state=35, test_size=0.5
)

In [None]:
# Load the pretrained tokenizer of the 'bert-base-german-cased' checkpoint.
# AutoTokenizer picks the tokenizer class from the checkpoint name.

tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
# alternatively, bert-base-multilingual-cased

In [None]:
# Tokenize every split to exactly `max_length` tokens and turn the labels into
# tensors.
#
# padding="max_length" pads every sentence up to max_length. The previous
# padding=True only padded to the longest sentence of each call, so the three
# splits could end up with different widths, while the LSTM/CNN heads of
# ProtestClassifier assume a fixed sequence length of 150.
max_length = 150

X_train_t = tokenizer(X_train, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
X_val_t = tokenizer(X_val, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
X_test_t = tokenizer(X_test, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

# Labels arrive as strings from the CSV; convert them to ints before building tensors.
y_train = list(map(int, y_train))
y_val = list(map(int, y_val))
y_test = list(map(int, y_test))
y_train_t = torch.tensor(y_train)
y_val_t = torch.tensor(y_val)
y_test_t = torch.tensor(y_test)

In [None]:
# Wrap each split in a TensorDataset of (input_ids, attention_mask, label) and
# serve it in mini-batches. Training batches are drawn in random order;
# validation and test batches keep the dataset order.
batch_size = 32

train_ids, train_mask = X_train_t["input_ids"], X_train_t["attention_mask"]
train_data = TensorDataset(train_ids, train_mask, y_train_t)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

val_ids, val_mask = X_val_t["input_ids"], X_val_t["attention_mask"]
val_data = TensorDataset(val_ids, val_mask, y_val_t)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

test_ids, test_mask = X_test_t["input_ids"], X_test_t["attention_mask"]
test_data = TensorDataset(test_ids, test_mask, y_test_t)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Fine-tune Huggingface's BertForSequenceClassification for three epochs.
# The model computes the loss itself because the labels are passed in.

model = BertForSequenceClassification.from_pretrained('bert-base-german-cased', return_dict=True)
# alternatively, bert-base-multilingual-cased

optim = AdamW(model.parameters(), lr=1e-5)
model.train()

for e in range(3):
    for inputs, mask, labels in train_dataloader:
        optim.zero_grad()
        outputs = model(inputs, attention_mask=mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        # `loss` is the same tensor object as outputs.loss
        print(loss)

    print("e:", e)

In [None]:
# Evaluate the fine-tuned BertForSequenceClassification on the validation set.

model.eval()

# Collect the predictions for the whole validation set before scoring.
# Averaging per-batch recall/precision/F1 does not give the dataset-level
# score: every batch gets the same weight regardless of its size, and a batch
# without positive predictions contributes a score of 0.
all_labels = []
all_preds = []
with torch.no_grad():  # inference only, no autograd graph needed
    for batch in val_dataloader:
        inputs, mask, labels = batch
        outputs = model(inputs, attention_mask=mask)
        all_preds.append(torch.argmax(outputs.logits, dim=1).cpu())
        all_labels.append(labels.cpu())

all_labels = torch.cat(all_labels).numpy()
all_preds = torch.cat(all_preds).numpy()

tot_acc = accuracy_score(all_labels, all_preds)
tot_rec = recall_score(all_labels, all_preds)
tot_pre = precision_score(all_labels, all_preds)
tot_f1 = f1_score(all_labels, all_preds)
print(tot_acc, tot_rec, tot_pre, tot_f1)

In [None]:
# own model with three options regarding output layers:

class ProtestClassifier(nn.Module):
    """Binary classifier: pretrained German BERT encoder plus one of three heads.

    Heads, selected with ``output_layer``:

    * ``"linear"`` - two-layer MLP on BERT's pooled output.
    * ``"LSTM"``   - LSTM over the token states, followed by a linear layer on
      the flattened LSTM outputs.
    * ``"CNN"``    - Conv1d and max-pooling over the token states, followed by
      a linear layer on the flattened feature map.

    ``forward`` returns one raw logit per sample (shape ``(batch,)``). No
    sigmoid is applied inside the model.

    Args:
        input_size: width of BERT's hidden states (768 in this notebook).
        hidden_size: hidden width of the linear head and of the LSTM.
        output_layer: ``"linear"``, ``"LSTM"`` or ``"CNN"``.
        dropout: dropout probability used in the linear head.
        freeze: ``True`` freezes every BERT parameter, ``False`` trains all of
            them; an integer ``n`` freezes the embeddings and the first ``n``
            encoder layers.
        seq_len: number of tokens per input sequence. The LSTM and CNN heads
            only work for inputs padded/truncated to exactly this length.

    Raises:
        ValueError: if ``output_layer`` is not one of the three known heads.
    """

    def __init__(self, input_size, hidden_size, output_layer, dropout=0.5, freeze=True, seq_len=150):
        super(ProtestClassifier, self).__init__()

        # Fail early; an unknown head would otherwise only surface in forward().
        if output_layer not in ("linear", "LSTM", "CNN"):
            raise ValueError(
                "output_layer must be 'linear', 'LSTM' or 'CNN', got %r" % (output_layer,))

        self.bert = BertModel.from_pretrained('bert-base-german-cased')
        # alternatively, bert-base-multilingual-cased
        self.output_layer = output_layer

        if self.output_layer == "linear":
            self.classifier = nn.Sequential(
                nn.Linear(input_size, hidden_size),
                nn.Dropout(dropout),
                nn.ReLU(),
                nn.Linear(hidden_size, 1))
        elif self.output_layer == "LSTM":
            self.classifier = nn.Sequential(
                nn.LSTM(input_size, hidden_size, batch_first=True))
        else:  # "CNN"
            # The token axis is fed in as the channel axis, so the convolution
            # slides along the hidden dimension. padding=1 keeps its length.
            self.classifier = nn.Sequential(
                nn.Conv1d(seq_len, seq_len, 3, padding=1))

        self.relu = nn.ReLU()
        # Input of the LSTM head's final layer: all LSTM outputs, flattened.
        self.lin2 = nn.Linear(hidden_size * seq_len, 1)
        # Input of the CNN head's final layer: the convolution keeps the
        # length at input_size, MaxPool1d(3, stride=2) shrinks it to
        # (input_size - 3) // 2 + 1 (383 for input_size 768).
        pooled_size = (input_size - 3) // 2 + 1
        self.lin3 = nn.Linear(pooled_size * seq_len, 1)
        self.max = nn.MaxPool1d(3, stride=2)

        self.sigmoid = nn.Sigmoid()

        # freeze all, some, or none of the bert layers.
        # Identity checks are required here: `1 == True` holds in Python, so
        # an equality test would treat freeze=1 as "freeze everything".
        if freeze is True:
            for p in self.bert.parameters():
                p.requires_grad = False
        elif freeze is False:
            for p in self.bert.parameters():
                p.requires_grad = True
        else:
            modules = [self.bert.embeddings, *self.bert.encoder.layer[:freeze]]
            for module in modules:
                for param in module.parameters():
                    param.requires_grad = False

    def forward(self, input_ids, mask):
        """Return one logit per sample for token ids and their attention mask."""
        outputs = self.bert(input_ids=input_ids, attention_mask=mask)
        # Index instead of tuple-unpacking: this works for a plain tuple and
        # for a transformers ModelOutput, whose iteration yields keys.
        last_h, cls = outputs[0], outputs[1]

        if self.output_layer == "LSTM":
            c, (hn, cn) = self.classifier(last_h)
            c = torch.flatten(c, start_dim=1)
            c = self.relu(c)
            c = self.lin2(c)
            c = torch.flatten(c)
        elif self.output_layer == "CNN":
            c = self.classifier(last_h)
            c = self.relu(c)
            c = self.max(c)
            c = torch.flatten(c, start_dim=1)
            c = self.lin3(c)
            c = torch.flatten(c)
        else:  # "linear"
            c = self.classifier(cls)
            c = torch.flatten(c)
        return c

In [None]:
# k-fold cross-validation, to verify that results are representative across folds
# using own models

# set hyperparameters
batch_size = 32

# Use the first GPU when one is available, otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

input_size = 768
hidden_size = 50
output_layer = "linear"

loss_fn = nn.BCEWithLogitsLoss()

# Per-fold scores. They have to be created outside the fold loop; re-creating
# them inside the loop would leave only the last fold in the final averages.
tot_accuracies = []
tot_recalls = []
tot_precisions = []
tot_f1_scores = []

kf = KFold(n_splits=5)
# fold loop
for n, (train_index, test_index) in enumerate(kf.split(X_train_t["input_ids"])):
    # Fold-local names on purpose: the notebook-level train_data /
    # train_dataloader / test_data / test_dataloader / y_train / y_test are
    # used by other cells and must not be overwritten here.
    fold_train_data = TensorDataset(
        X_train_t["input_ids"][train_index],
        X_train_t["attention_mask"][train_index],
        y_train_t[train_index],
    )
    fold_test_data = TensorDataset(
        X_train_t["input_ids"][test_index],
        X_train_t["attention_mask"][test_index],
        y_train_t[test_index],
    )
    fold_train_loader = DataLoader(fold_train_data, batch_size=batch_size)
    fold_test_loader = DataLoader(fold_test_data, batch_size=batch_size)

    # A fresh model and optimizer for every fold.
    model_b = ProtestClassifier(input_size, hidden_size, output_layer, 0.5).to(device)
    optim_b = torch.optim.Adam(model_b.parameters(), lr=0.00001)
    model_b.train()
    # training loop
    for e in range(2):
        for batch in fold_train_loader:
            optim_b.zero_grad()
            inputs, mask, labels = batch
            inputs = inputs.to(device)
            mask = mask.to(device)
            # BCEWithLogitsLoss needs float targets.
            labels = labels.to(device).float()
            pred = model_b(inputs, mask)
            loss = loss_fn(pred, labels)
            loss.backward()
            optim_b.step()

    # evaluation loop: gather the predictions of the whole fold, then score
    # once. Averaging per-batch recall/precision/F1 would not give the
    # fold-level score.
    model_b.eval()
    fold_labels = []
    fold_preds = []
    for batch in fold_test_loader:
        inputs, mask, labels = batch
        inputs = inputs.to(device)
        mask = mask.to(device)

        with torch.no_grad():
            out = model_b(inputs, mask)
        prob = torch.sigmoid(out)
        fold_preds.append((prob > 0.5).long().cpu())
        fold_labels.append(labels)

    fold_labels = torch.cat(fold_labels).numpy()
    fold_preds = torch.cat(fold_preds).numpy()

    # scores of this fold
    tot_acc = accuracy_score(fold_labels, fold_preds)
    tot_rec = recall_score(fold_labels, fold_preds)
    tot_pre = precision_score(fold_labels, fold_preds)
    tot_f1 = f1_score(fold_labels, fold_preds)
    print("fold", n, ":", tot_acc, tot_rec, tot_pre, tot_f1)
    tot_accuracies.append(tot_acc)
    tot_recalls.append(tot_rec)
    tot_precisions.append(tot_pre)
    tot_f1_scores.append(tot_f1)

# calculate overall average scores
print("averages scores across folds:")
print(np.mean(np.array(tot_accuracies)))
print(np.mean(np.array(tot_recalls)))
print(np.mean(np.array(tot_precisions)))
print(np.mean(np.array(tot_f1_scores)))

In [None]:
# helper function for tensor board
def get_num_correct(preds, labels):
    """Count how many predictions in a batch match their labels.

    Args:
        preds: raw model outputs. Either one logit per sample (shape
            ``(batch,)`` or ``(batch, 1)``), where a logit above 0 - i.e. a
            sigmoid probability above 0.5 - counts as class 1, or one score
            per class (shape ``(batch, n_classes)``), where the highest score
            wins.
        labels: tensor of shape ``(batch,)`` with the true class ids.

    Returns:
        Number of correctly classified samples as a Python int.
    """
    # A trailing singleton dimension is just a single logit per sample.
    if preds.dim() > 1 and preds.size(-1) == 1:
        preds = preds.squeeze(-1)

    if preds.dim() <= 1:
        # argmax over dim 0 would return the position of the largest logit in
        # the batch, not a class; threshold the logits instead.
        predicted = preds > 0
    else:
        predicted = preds.argmax(dim=-1)

    return predicted.to(labels.dtype).eq(labels).sum().item()

In [None]:
# training with own models

# Prefer the second GPU, but fall back to the first GPU or the CPU so the cell
# does not crash on machines with fewer devices.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


input_size = 768
hidden_size = 100
output_layer = "CNN"
freeze = False
lr = 0.00001
epochs = 3
dropout = 0.2
bert = "German Bert"
model_b = ProtestClassifier(input_size, hidden_size, output_layer, dropout, freeze).to(device)
optim_b = torch.optim.Adam(model_b.parameters(), lr = lr)
loss_fn = nn.BCEWithLogitsLoss()

model_b.train()
# The comment becomes part of the TensorBoard run name and records the setup.
comment = f' bert = {bert} output_layer = {output_layer} freeze = {freeze} hidden_size = {hidden_size} lr = {lr} epochs = {epochs}'
tb = SummaryWriter(comment=comment)

# make image and graph for tensor board:
# NOTE(review): input_data holds token ids rather than image data - confirm
# that logging it as an image is useful.
input_data, mask, labels = next(iter(train_dataloader))
input_data = input_data.to(device)
mask = mask.to(device)
grid = torchvision.utils.make_grid(input_data)
tb.add_image("input_data", grid)
tb.add_graph(model_b, (input_data, mask))

# training loop
for e in range(epochs):
    # running totals for this epoch
    tot_loss = 0
    tot_correct = 0
    for batch in train_dataloader:
        optim_b.zero_grad()
        inputs, mask, labels = batch
        inputs = inputs.to(device)
        mask = mask.to(device)
        # BCEWithLogitsLoss needs float targets.
        labels = labels.to(device).float()
        pred = model_b(inputs, mask)
        loss = loss_fn(pred, labels)

        tot_loss += loss.item()
        tot_correct += get_num_correct(pred, labels)

        loss.backward()
        optim_b.step()
        print(loss)

    tb.add_scalar("Loss", tot_loss, e)
    tb.add_scalar("training correct", tot_correct, e)
    # Divide by the size of the dataset that was actually iterated, so the
    # accuracy stays right even if another cell rebinds `train_data`.
    tb.add_scalar("training accuracy", tot_correct / len(train_dataloader.dataset), e)

    for name, weight in model_b.named_parameters():
        tb.add_histogram(name, weight, e)

    print("epoch:", e, "total_correct:", tot_correct, "loss:",tot_loss)

In [None]:
# evaluation with own models

model_b.eval()

# Collect the predictions for the whole validation set before scoring.
# Averaging per-batch recall/precision/F1 does not give the dataset-level
# score: every batch gets the same weight regardless of its size, and a batch
# without positive predictions contributes a score of 0.
all_labels = []
all_preds = []
for batch in val_dataloader:
    inputs, mask, labels = batch
    inputs = inputs.to(device)
    mask = mask.to(device)
    with torch.no_grad():
        out = model_b(inputs, mask)
    # The model returns logits; a sigmoid probability above 0.5 means class 1.
    prob = torch.sigmoid(out)
    all_preds.append((prob > 0.5).long().cpu())
    all_labels.append(labels)

all_labels = torch.cat(all_labels).numpy()
all_preds = torch.cat(all_preds).numpy()

# Plain Python floats keep the values easy to log.
tot_acc = float(accuracy_score(all_labels, all_preds))
tot_rec = float(recall_score(all_labels, all_preds))
tot_pre = float(precision_score(all_labels, all_preds))
tot_f1 = float(f1_score(all_labels, all_preds))
print(tot_acc, tot_rec, tot_pre, tot_f1)

# add hyperparameters and results to tensor board
# (tot_loss is the summed training loss of the last epoch of the training cell)
tb.add_hparams(
            {"output_layer": output_layer, "freeze": freeze, "hidden_size": hidden_size, "lr": lr, "epochs": epochs},
            {
                "accuracy": tot_acc,
                "recall": tot_rec,
                "precision": tot_pre,
                "f1-score": tot_f1,
                "loss": tot_loss,
            },
        )

tb.close()