In [1]:
import json
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa

# Maps each annotated text to its list of class ids, one entry per annotator.
# This empty placeholder is replaced by the loadDataset(...) calls further down.
annotations = {}


In [10]:
# Label name -> class id (0 = positive, 1 = neutral, 2 = negative, 3 = hate speech).
# Each inner tuple is one naming scheme found in the annotation files, listed in
# class-id order so that the position of a name is its class id.
classes_map = {
    label_name: class_id
    for vocabulary in (
        ("pozytywny wydźwięk", "neutralny wydźwięk", "negatywny wydźwięk", "mowa nienawiści"),
        ("positive", "neutral", "negative", "hate_speech"),
        ("pozytywne", "neutralne", "negatywne", "mowa nienawisci"),
    )
    for class_id, label_name in enumerate(vocabulary)
}
# Extra short alias for the hate-speech class.
classes_map["nienawisc"] = 3

In [3]:
def loadDataset(path: str, annots: dict[str, list[int | str]] = None):
    with open(path, 'r', encoding='utf-8') as file:
        if annots == None:
            annots = {}
            for line in file:
                line = json.loads(line)
                annots[line["text"]] = [classes_map[line["label"][0]]]
        else:
            for line in file:
                line = json.loads(line)
                annots[line["text"]].append(classes_map[line["label"][0]])
    return annots

In [4]:
def separateConflicts(annots: dict[str, list[int | str]]) -> tuple[list[dict], list[dict]]:
    correct = []
    wrong = []

    for text, labels in annots.items():
        if all(label == labels[0] for label in labels):
            correct.append({"text": text, "label": labels[0]})
        else:
            wrong.append({"text": text, "label": "", "old_labels": str(labels)})
    
    return correct, wrong

In [5]:
def kappaCohenResult(annots: dict[str, list[int | str]]):
    annotators = [[], [], []]
    for annotations in annots.values():
        for annotation, annotator in zip(annotations, annotators):
            annotator.append(annotation)
    
    print('Kappa Cohen:')
    print(f'Klaudia - Michał: {cohen_kappa_score(annotators[0], annotators[1])}')
    print(f'Klaudia - Kajtek: {cohen_kappa_score(annotators[0], annotators[2])}')
    print(f'Kajtek - Michał: {cohen_kappa_score(annotators[2], annotators[1])}')


In [6]:
def kappaFleissResult(annots: dict[str, list[int | str]]):
    results = []
    for annotations in annots.values():
        new_sample = [0, 0, 0, 0]
        for annotation in annotations:
            new_sample[annotation] += 1
        results.append(new_sample)
    print(f'Kappa Fleiss: {fleiss_kappa(results)}')

In [7]:
def write_jsonl(file_path, data):
    """Write ``data`` to ``file_path`` in JSON Lines form.

    Args:
        file_path: Destination file; it is created or overwritten and encoded
            as UTF-8.
        data: Iterable of JSON-serialisable items, written one per line.
            Non-ASCII characters are kept as-is rather than escaped.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )

Pierwsza iteracja

In [11]:
# Load order matters: it fixes each annotator's slot in the per-text label list
# (0 = klaudia, 1 = michal, 2 = kajetan), and kappaCohenResult names the pairs
# by those positions. The first call starts a new mapping; the next two append.
annotations = loadDataset("data/klaudia.jsonl")
annotations = loadDataset("data/michal.jsonl", annotations)
annotations = loadDataset("data/kajetan.jsonl", annotations)

In [36]:
# Pairwise agreement between the three annotators, first round.
kappaCohenResult(annotations)

Kappa Cohen:
Klaudia - Michał: 0.48137755713840324
Klaudia - Kajtek: 0.3391286665649942
Kajtek - Michał: 0.43653533422726887


In [37]:
# Agreement across all annotators at once, first round.
kappaFleissResult(annotations)

Kappa Fleiss: 0.4092986691137362


In [38]:
# correct: texts whose labels all match; wrong: texts with any disagreement.
correct, wrong = separateConflicts(annotations)

In [39]:
# Number of texts on which the annotators disagreed.
len(wrong)

257

In [40]:
# Number of texts with a unanimous label.
len(correct)

243

In [45]:
# Persist the unanimously labelled texts of the first round.
write_jsonl("data/first_round_correct.jsonl", correct)

In [46]:
# Persist the conflicting texts: each record has an empty "label" and the
# annotators' original votes in "old_labels".
# NOTE(review): the merge cell reads this same file back, presumably after the
# empty labels were resolved by hand; re-running this cell would then overwrite
# that work. Confirm before re-executing.
write_jsonl("data/first_round_wrong.jsonl", wrong)

In [7]:
# Start from the records on which all annotators agreed.
with open("data/first_round_correct.jsonl", 'r', encoding='utf-8') as file:
    correct_data = list(map(json.loads, file))

# Append the conflicting records without their bookkeeping "old_labels" field.
# NOTE(review): presumably their "label" was filled in by hand in the file
# before this cell runs; nothing here checks that. Confirm.
with open("data/first_round_wrong.jsonl", 'r', encoding='utf-8') as file:
    for line in file:
        new_sample = json.loads(line)
        del new_sample["old_labels"]
        correct_data.append(new_sample)

# Combined first-round dataset: agreed records first, then the former conflicts.
write_jsonl("data/first_round.jsonl", correct_data)

Druga iteracja

In [12]:
# Second-round files, loaded in the same annotator order as the first round
# (0 = klaudia, 1 = michal, 2 = kajetan) so kappaCohenResult names the pairs
# correctly. The first call starts a new mapping; the next two append to it.
annotations = loadDataset("data/klaudia2.jsonl")
annotations = loadDataset("data/michal2.jsonl", annotations)
annotations = loadDataset("data/kajetan2.jsonl", annotations)

In [13]:
# Pairwise agreement between the three annotators, second round.
kappaCohenResult(annotations)

Kappa Cohen:
Klaudia - Michał: 0.5364647713226205
Klaudia - Kajtek: 0.5660956085883145
Kajtek - Michał: 0.62563985025594


In [14]:
# Agreement across all annotators at once, second round.
kappaFleissResult(annotations)

Kappa Fleiss: 0.575833516771932


In [16]:
# Split the second-round annotations into unanimous texts (correct) and texts
# with any disagreement (wrong), then persist both sets.
correct, wrong = separateConflicts(annotations)
write_jsonl("data/second_round_correct.jsonl", correct)
# Conflicting records are written with an empty "label" plus "old_labels".
write_jsonl("data/second_round_wrong.jsonl", wrong)

In [17]:
# Start from the records on which all annotators agreed.
with open("data/second_round_correct.jsonl", 'r', encoding='utf-8') as file:
    correct_data = list(map(json.loads, file))

# Append the conflicting records without their bookkeeping "old_labels" field.
# NOTE(review): presumably their "label" was filled in by hand in the file
# before this cell runs; nothing here checks that. Confirm.
with open("data/second_round_wrong.jsonl", 'r', encoding='utf-8') as file:
    for line in file:
        new_sample = json.loads(line)
        del new_sample["old_labels"]
        correct_data.append(new_sample)

# Combined second-round dataset: agreed records first, then the former conflicts.
write_jsonl("data/second_round.jsonl", correct_data)