In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/uit-nlpp/train.json
/kaggle/input/uit-nlpp/test.json


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification



tokenizer = AutoTokenizer.from_pretrained('Fsoft-AIC/videberta-base')



model = AutoModelForSequenceClassification.from_pretrained('Fsoft-AIC/videberta-base', num_labels=3)





Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/8.49M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/567M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at Fsoft-AIC/videberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# for name, param in model.named_parameters():
#     if 'classifier' not in name:
#         param.requires_grad = False
model = model.cuda()

In [4]:
import re
def preprocess_text(text: str) -> str:    
    text = re.sub(r"['\",\.\?:\-!]", "", text)
    text = text.strip()
    text = " ".join(text.split())
    text = text.lower()
    return text

In [5]:
def claim_fn(row):
    row["context"] = preprocess_text(row["evidence_predict"]) + f" {tokenizer.eos_token} " + preprocess_text(row["claim"])
    return row

In [6]:
import pandas as pd
dataset = pd.read_json("/kaggle/input/uit-nlpp/train.json", orient='index')
dataset = dataset.sample(frac=0.8, random_state=13)
dataset_len = len(dataset)
train_split = int(0.95*dataset_len)
train_dataset = dataset#.iloc[:train_split]
test_dataset = dataset.iloc[train_split:]

In [7]:
train_dataset = train_dataset.apply(claim_fn, axis=1)
test_dataset = test_dataset.apply(claim_fn, axis=1)

In [8]:
CLASSES = {"NEI":1, "SUPPORTED":0, "REFUTED":2}

def preprocess_fn(examples):
    inputs = tokenizer(
        examples["context"], max_length=200,truncation=True, padding='max_length'
    )
    labels = [CLASSES[verdict] for verdict in examples["verdict"]]
    
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels}

In [9]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_dataset)
train_dataset = train_dataset.map(preprocess_fn, batched=True, remove_columns=['context', 'claim', 'evidence_predict', 'domain', 'verdict','__index_level_0__'])

  0%|          | 0/22 [00:00<?, ?ba/s]

In [10]:
test_dataset = Dataset.from_pandas(test_dataset)
test_dataset = test_dataset.map(preprocess_fn, batched=True, remove_columns=['context', 'claim', 'evidence_predict', 'domain', 'verdict','__index_level_0__'])

  0%|          | 0/2 [00:00<?, ?ba/s]

In [11]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1


In [12]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# data_collator = DataCollatorForTokenClassification
training_args = TrainingArguments(per_device_train_batch_size = 32, 
                                  learning_rate = 5e-5, 
                                  output_dir="test_trainer", 
#                                   evaluation_strategy="epoch", 
                                  save_strategy="steps", 
                                  save_total_limit = 2, 
#                                   load_best_model_at_end = True,
                                  num_train_epochs=10)

In [15]:
from torch import nn
from transformers import Trainer
weight = [0.1249804473643047, 0.12699674163554, 1.0]
class ClassificationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.squeeze(), labels.squeeze())
        return (loss, outputs) if return_outputs else loss

In [16]:
trainer = ClassificationTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [None]:
# for name, param in model.named_parameters():
#     param.requires_grad = True

In [None]:
# from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# # data_collator = DataCollatorForTokenClassification
# training_args = TrainingArguments(per_device_train_batch_size = 512, 
#                                   learning_rate = 1e-5, 
#                                   output_dir="test_trainer", 
#                                   evaluation_strategy="epoch", 
#                                   save_strategy="epoch", 
#                                   save_total_limit = 2, 
#                                   load_best_model_at_end = True,
#                                   num_train_epochs=10)
# trainer = ClassificationTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     compute_metrics=compute_metrics,
# )

In [None]:
def preprocess_test_fn(examples):
    inputs = tokenizer(
        examples["context"], max_length=256, truncation=True, padding='max_length'
    )
    
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]}

In [None]:
import pandas as pd
from datasets import Dataset

raw_dataset = pd.read_json("/kaggle/input/uit-nlpp/test.json", orient='index')
dataset = raw_dataset.apply(claim_fn, axis=1)

dataset = Dataset.from_pandas(dataset)
dataset = dataset.map(preprocess_test_fn, batched=True, remove_columns=['context','claim','evidence_predict','__index_level_0__'])


In [None]:
import torch

In [None]:
def data_collator(data):
    for key, value in data.items():
        data[key] = torch.tensor(value).cuda().unsqueeze(0)
    return data

In [None]:
from tqdm import tqdm

In [None]:
label = []
for data in tqdm(dataset):
    data = data_collator(data)
    label_pre = model(**data)['logits'].argmax(dim=-1).item()
    label.append(label_pre)

In [None]:
raw_df = raw_dataset.drop(columns=["context","claim"])

In [None]:
raw_df = raw_df.rename(columns={"evidence_predict":"evidence"})

In [None]:
raw_df["verdict"] = label

In [None]:
import numpy as np

In [None]:
raw_df["evidence"] = np.where(raw_df["verdict"] == 1, "", raw_df["evidence"])
raw_df["verdict"] = np.where(raw_df["verdict"] == 0, "SUPPORTED", np.where(raw_df["verdict"] == 1, "NEI","REFUTED"))

In [None]:
folder_path = "/kaggle/working/"

In [None]:
raw_df.to_json(os.path.join(folder_path, "test.json"),orient="index", force_ascii=False)