In [2]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import DataLoader, TensorDataset
import json
import pdb
import logging.handlers
import argparse
import os
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from networkx import DiGraph, relabel_nodes, all_pairs_shortest_path_length
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, h_recall_score, h_precision_score, fill_ancestors, multi_labeled
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm
from ordered_set import OrderedSet

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
import json

# 文件路径
file_paths = ['train.json', 'validation.json']

# 读取数据
merged_data = []

for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:  # 指定文件编码
        data = json.load(file)
        merged_data.extend(data)

# 写入新的 JSON 文件
with open('merged_file.json', 'w', encoding='utf-8') as merged_file:  # 指定写入文件编码为 UTF-8
    json.dump(merged_data, merged_file, indent=2, ensure_ascii=False)


In [4]:
file_path = 'merged_file.json'
# Read the JSON file into a list of dictionaries
with open(file_path, 'r', encoding='utf-8') as file:
    data_train = json.load(file)

rows = []
possible_labels = OrderedSet(label for entry in data_train for label in entry['labels'])

for entry in data_train:
    id = entry['id']
    text = entry['text']
    labels = entry['labels']
    row = {}
    row['id'] = id
    row['text'] = text

    for label in possible_labels:
        row[label] = int(label in labels)

    rows.append(row)

df = pd.DataFrame(rows)
data_train = df

# Check and replace NaN values
for i in data_train.index:
    if pd.isna(data_train["text"][i]):
        data_train["text"][i] = ""




In [5]:
file_path = 'dev_subtask1_en.json'
# Read the JSON file into a list of dictionaries
with open(file_path, 'r', encoding='utf-8') as file:
    data_validation = json.load(file)

rows = []
possible_labels = OrderedSet(label for entry in data_validation for label in entry['labels'])

for entry in data_validation:
    id = entry['id']
    text = entry['text']
    labels = entry['labels']
    row = {}
    row['id'] = id
    row['text'] = text

    for label in possible_labels:
        row[label] = int(label in labels)

    rows.append(row)

df = pd.DataFrame(rows)
data_validation = df

# Check and replace NaN values
for i in data_validation.index:
    if pd.isna(data_validation["text"][i]):
        data_validation["text"][i] = ""

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(list(data_train["text"]), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(list(data_validation['text']), truncation=True, padding=True, return_tensors='pt')

In [7]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __getitem__(self, idx):
        return {key: val[idx].to(device) for key, val in self.encodings.items()}, self.labels[idx].to(device)

    def __len__(self):
        return len(self.labels)



In [8]:
labels = sorted([col for col in data_train.columns if col not in ["text", "id"]])

train_dataset = CustomDataset(train_encodings, data_train[labels])
test_dataset = CustomDataset(test_encodings, data_validation[labels])
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(labels)).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
print(labels)

['Appeal to authority', 'Appeal to fear/prejudice', 'Bandwagon', 'Black-and-white Fallacy/Dictatorship', 'Causal Oversimplification', 'Doubt', 'Exaggeration/Minimisation', 'Flag-waving', 'Glittering generalities (Virtue)', 'Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Name calling/Labeling', 'Obfuscation, Intentional vagueness, Confusion', 'Presenting Irrelevant Data (Red Herring)', 'Reductio ad hitlerum', 'Repetition', 'Slogans', 'Smears', 'Thought-terminating cliché', 'Whataboutism']


In [10]:
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader
from networkx import DiGraph
from torch import nn

G = DiGraph()
G.add_edge(ROOT, "Logos")
G.add_edge("Logos", "Repetition")
G.add_edge("Logos", "Obfuscation, Intentional vagueness, Confusion")
G.add_edge("Logos", "Reasoning")
G.add_edge("Logos", "Justification")
G.add_edge('Justification', "Slogans")
G.add_edge('Justification', "Bandwagon")
G.add_edge('Justification', "Appeal to authority")
G.add_edge('Justification', "Flag-waving")
G.add_edge('Justification', "Appeal to fear/prejudice")
G.add_edge('Reasoning', "Simplification")
G.add_edge('Simplification', "Causal Oversimplification")
G.add_edge('Simplification', "Black-and-white Fallacy/Dictatorship")
G.add_edge('Simplification', "Thought-terminating cliché")
G.add_edge('Reasoning', "Distraction")
G.add_edge('Distraction', "Misrepresentation of Someone's Position (Straw Man)")
G.add_edge('Distraction', "Presenting Irrelevant Data (Red Herring)")
G.add_edge('Distraction', "Whataboutism")
G.add_edge(ROOT, "Ethos")
G.add_edge('Ethos', "Appeal to authority")
G.add_edge('Ethos', "Glittering generalities (Virtue)")
G.add_edge('Ethos', "Bandwagon")
G.add_edge('Ethos', "Ad Hominem")
G.add_edge('Ethos', "Transfer")
G.add_edge('Ad Hominem', "Doubt")
G.add_edge('Ad Hominem', "Name calling/Labeling")
G.add_edge('Ad Hominem', "Smears")
G.add_edge('Ad Hominem', "Reductio ad hitlerum")
G.add_edge('Ad Hominem', "Whataboutism")
G.add_edge(ROOT, "Pathos")
G.add_edge('Pathos', "Exaggeration/Minimisation")
G.add_edge('Pathos', "Loaded Language")
G.add_edge('Pathos', "Appeal to (Strong) Emotions")
G.add_edge('Pathos', "Appeal to fear/prejudice")
G.add_edge('Pathos', "Flag-waving")
G.add_edge('Pathos', "Transfer") 

In [11]:
class HierarchicalLoss(nn.Module):
    def __init__(self, alpha=1.0):
        super(HierarchicalLoss, self).__init__()
        self.alpha = alpha

    def forward(self, logits, labels, hierarchy):
        loss = nn.BCEWithLogitsLoss()(logits, labels)

        # Add hierarchical regularization
        for i in range(labels.size(1)):
            for j in range(labels.size(1)):
                if i != j and hierarchy.has_node(i) and hierarchy.has_node(j) and nx.has_path(hierarchy, source=i, target=j):
                    loss += self.alpha * torch.relu(logits[:, i] - logits[:, j])

        return loss

In [21]:
hierarchical_loss = HierarchicalLoss(alpha=0.1)
num_epochs = 15
model.to(device)

thresholds = {'Appeal to authority':0.44, 'Appeal to fear/prejudice':0.29, 'Bandwagon':0.15, 'Black-and-white Fallacy/Dictatorship':0.4, 
             'Causal Oversimplification':0.24, 'Doubt':0.3, 'Exaggeration/Minimisation':0.3, 'Flag-waving':0.35, 'Glittering generalities (Virtue)':0.32, 
             'Loaded Language':0.48, "Misrepresentation of Someone's Position (Straw Man)":0.12, 'Name calling/Labeling':0.45, 
             'Obfuscation, Intentional vagueness, Confusion':0.1, 'Presenting Irrelevant Data (Red Herring)':0.11, 
             'Reductio ad hitlerum':0.14, 'Repetition':0.28, 'Slogans':0.39, 'Smears':0.5, 'Thought-terminating cliché':0.33, 'Whataboutism':0.25}
for epoch in tqdm(range(num_epochs)):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        inputs, labels = batch
        outputs = model(**inputs).logits
        loss = hierarchical_loss(outputs, labels, G)
        loss.backward()
        optimizer.step()
    torch.save(model, "roberta_subtask1_epoch_" + str(epoch) + ".pt")



100%|██████████| 3750/3750 [09:34<00:00,  6.53it/s]
 48%|████▊     | 1792/3750 [06:16<06:51,  4.76it/s]
 33%|███▎      | 1/3 [15:51<31:43, 951.63s/it]


KeyboardInterrupt: 

In [17]:
model.eval()
all_preds = []
all_labels = []
label_name = ['Appeal to authority', 'Appeal to fear/prejudice', 'Bandwagon', 'Black-and-white Fallacy/Dictatorship', 
             'Causal Oversimplification', 'Doubt', 'Exaggeration/Minimisation', 'Flag-waving', 'Glittering generalities (Virtue)', 
             'Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Name calling/Labeling', 
             'Obfuscation, Intentional vagueness, Confusion', 'Presenting Irrelevant Data (Red Herring)', 
             'Reductio ad hitlerum', 'Repetition', 'Slogans', 'Smears', 'Thought-terminating cliché','Whataboutism']
with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        outputs = torch.sigmoid(model(**inputs).logits)
        binary_preds = (outputs.cpu().numpy() > np.array([thresholds[label] for label in label_name])).astype(int)
        all_preds.extend(binary_preds)
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.2f}")

classification_report_str = classification_report(all_labels, all_preds, target_names=data_train.columns[2:])
print("Classification Report:\n", classification_report_str)

Accuracy: 0.18
Classification Report:
                                                      precision    recall  f1-score   support

               Black-and-white Fallacy/Dictatorship       0.85      0.88      0.87       136
                                    Loaded Language       0.00      0.00      0.00        66
                   Glittering generalities (Virtue)       0.00      0.00      0.00        16
                         Thought-terminating cliché       0.00      0.00      0.00        98
                                       Whataboutism       0.00      0.00      0.00        53
                                            Slogans       0.00      0.00      0.00        45
                          Causal Oversimplification       0.00      0.00      0.00        62
                                             Smears       0.81      0.19      0.31        89
                              Name calling/Labeling       0.69      0.15      0.25        71
                              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
output_list = []

for i, entry in data_validation.iterrows():
    entry_id = entry['id']
    text = entry['text']
    predicted_labels = [label for label, pred in zip(possible_labels, all_preds[i]) if pred == 1]

    output_entry = {
        "id": entry_id,
        "text": text,
        "labels": predicted_labels
    }

    output_list.append(output_entry)

# Write the result to a JSON file
output_file_path = 'predictions.json'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    json.dump(output_list, output_file, indent=2)
