In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset ('PATH_TO_DATASET' is a placeholder to be replaced with the real CSV path)
df = pd.read_csv(filepath_or_buffer='PATH_TO_DATASET')

In [None]:
# Partition the dataset into rows labelled 0 and rows labelled 1
df_label_0 = df.loc[df['label'].eq(0)]  # texts with label 0
df_label_1 = df.loc[df['label'].eq(1)]  # texts with label 1

In [None]:
# Downsample the texts with label 0.
# We take as many label-0 texts as there are label-1 texts.
# (The EUREKA authors actually have half as many 0s as 1s.)
num_samples = len(df_label_1)  # number of texts with label 1

# DataFrame.sample raises ValueError when n exceeds the number of available rows
# (sampling without replacement), so never request more label-0 rows than exist.
df_label_0_sampled = df_label_0.sample(
    n=min(num_samples, len(df_label_0)),
    random_state=42,  # fixed seed so the subsample is reproducible
)

In [None]:
# Build the balanced dataset: sampled label-0 rows followed by all label-1 rows,
# with a fresh 0..n-1 index
balanced_df = pd.concat(
    (df_label_0_sampled, df_label_1),
    ignore_index=True,
)

In [None]:
# Sanity check: print the per-class row counts of the balanced dataset
print(
    "Распределение классов в сбалансированном датасете:",
    balanced_df['label'].value_counts(),
    sep="\n",
)

In [None]:
# Split into train and test sets (80/20), stratified by label, with a fixed seed
train_df, test_df = train_test_split(
    balanced_df,
    test_size=0.2,
    random_state=42,
    stratify=balanced_df['label'],
)

In [None]:
# Show the number of rows in the train split, then in the test split
print(len(train_df), len(test_df), sep="\n")

In [None]:
# Install the Python dependencies listed in the EUREKA repository's requirements.txt
!pip install -r https://raw.githubusercontent.com/sedrickkeh/EUREKA/main/requirements.txt

In [None]:
# Clone the EUREKA repository into the current working directory
# (later cells read it from /content/EUREKA)
!git clone https://github.com/sedrickkeh/EUREKA.git

In [None]:
# Load the English train and test splits shipped with the cloned EUREKA repository
eng_train_df = pd.read_csv(filepath_or_buffer='/content/EUREKA/data/train_split.csv')
eng_test_df = pd.read_csv(filepath_or_buffer='/content/EUREKA/data/test_split.csv')

In [None]:
# Display the English test split read from the EUREKA data directory
eng_test_df

In [None]:
# Display the English train split read from the EUREKA data directory
eng_train_df

In [None]:
# Append our own train_df / test_df rows to the corresponding English splits,
# renumbering the index from 0

combined_train_df = pd.concat(
    (eng_train_df, train_df),
    ignore_index=True,
)
combined_test_df = pd.concat(
    (eng_test_df, test_df),
    ignore_index=True,
)

In [None]:
# Display the combined test split (English rows followed by the rows of test_df)
combined_test_df

In [None]:
# Display the combined train split (English rows followed by the rows of train_df)
combined_train_df

In [None]:
# Save the new versions of the datasets (English + Russian) over the EUREKA split files.
# index=False keeps the pandas row index out of the CSVs; with the default (index=True)
# an extra unnamed column would be written into the files that train.py reads.
# NOTE(review): this overwrites the same files that the English splits are read from,
# so re-running the load/concat/save cells without re-cloning the repository would
# append the Russian rows again.

combined_train_df.to_csv('/content/EUREKA/data/train_split.csv', index=False)
combined_test_df.to_csv('/content/EUREKA/data/test_split.csv', index=False)

In [None]:
# Install Hugging Face `datasets` (unpinned) and pin `transformers` to version 4.3.0.
# NOTE(review): `Dataset` is not referenced in any visible cell — confirm whether
# this import is still needed.
!pip install datasets
!pip install transformers==4.3.0
from datasets import Dataset

In [None]:
# Train
# Runs the EUREKA train.py script on the (overwritten) train split, the dev split and
# the test split, and also passes the candidate-replacement and augmentation CSV files.
!python /content/EUREKA/train.py --train_path /content/EUREKA/data/train_split.csv \
                --valid_path /content/EUREKA/data/dev_split.csv \
                --test_path /content/EUREKA/data/test_split.csv \
                --cleaning_path /content/EUREKA/data/candidate_replacements.csv\
                --augmentation_path /content/EUREKA/data/augmentation_substitution_wikimatrix.csv

In [None]:
# Alternative train.py invocation that passes --train_file / --eval_file / --output_dir.
# NOTE(review): these flag names differ from the --train_path / --valid_path / --test_path
# flags used by the other train.py invocation in this notebook — confirm which set the
# script actually accepts and drop the other cell.
!python /content/EUREKA/train.py --train_file /content/EUREKA/data/train_split.csv --eval_file /content/EUREKA/data/dev_split.csv --output_dir /content/EUREKA/output

In [None]:
# Load the fine-tuned classifier and the XLM-RoBERTa tokenizer
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
)

# Hub name of the base model (defined for reference; the loads below use literal names)
model_checkpoint = "FacebookAI/xlm-roberta-large"

# Tokenizer of the base model
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

# Sequence-classification weights from the training output directory
model = XLMRobertaForSequenceClassification.from_pretrained('/content/EUREKA/output/checkpoint-500')

In [None]:
# # Use the best model
# sentences = [
#     "She suddenly passed away yesterday. It is so sad.",
#     "I'd give it 50-50, especially if senior members of the regime die from the disease.",
#     "Norman Cook (aka Fatboy Slim) is back with his latest project The Brighton Port Authority, or The BPA. The liner notes give a funny, fictional account about the music on this disc and how it came to pass.",
#     "Either way he is still the one paying most of the bills (until I finish my PhD), and people are okay with that so long as they view the relationship as monogamous. It could be some socially constructed reciprocity of man being breadwinner and woman homemaker or something more instinctual. I surrender that thought to you social scientists and will head back to the lab and to the simpler world of biochemistry.",
#     "He is already six feet under for 6 years now."
# ]

# inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# inputs = {key: val.to(device) for key, val in inputs.items()}

# with torch.no_grad():
#     outputs = model(**inputs)
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)

# for sentence, prediction in zip(sentences, predictions):
#     print(f"Sentence: {sentence}")
#     print(f"Prediction (class): {prediction.item()}")

In [None]:
# For the analysis, run the model on the same sample that was inspected for the baseline

baseline_sample = pd.read_csv(
    filepath_or_buffer='/content/baseline_predictions_to_manual_check.csv',
)

In [None]:
from sklearn.metrics import f1_score

# sentences = test_df['utterance'].tolist()
# true_labels = test_df['label']

# Use the first 100 rows of the baseline sample for the analysis
sentences = baseline_sample['utterance'].head(100).tolist()
true_labels = baseline_sample['true_label'].head(100)

# Move the model to the GPU when one is available and make sure it is in inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# The tokenizer call has no `batch_size` parameter, so passing it there does not batch
# anything: all sentences would go through the model in a single forward pass.
# Batch explicitly instead to bound memory use.
eval_batch_size = 8
batch_predictions = []

with torch.no_grad():
    for start in range(0, len(sentences), eval_batch_size):
        batch_sentences = sentences[start:start + eval_batch_size]

        # Tokenize one batch and move its tensors to the model's device
        inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Forward pass; the predicted class is the arg-max over the logits
        outputs = model(**inputs)
        logits = outputs.logits
        batch_predictions.append(torch.argmax(logits, dim=-1).cpu())

# Keep `predictions` a 1-D tensor so that later cells can still call `.item()` / `.cpu()` on it
if batch_predictions:
    predictions = torch.cat(batch_predictions)
else:
    predictions = torch.empty(0, dtype=torch.long)

# Compute F1 separately for every class
f1_per_class = f1_score(true_labels, predictions.cpu().numpy(), average=None)

# Print F1 for every class
for i, f1 in enumerate(f1_per_class):
    print(f"Class {i}: F1 = {f1:.4f}")

# Print every sentence with its gold label and the model's prediction
for sentence, true_label, prediction in zip(sentences, true_labels, predictions):
    print(f"Sentence: {sentence}")
    print(f"True label: {true_label}, Prediction: {prediction.item()}")

In [None]:
# Compute the reported score explicitly instead of reusing `f1`, a loop variable that
# leaks from the per-class loop and only holds the score of the last class.
# With 0/1 labels, f1_score's default (binary average, positive label 1) gives that
# same class-1 score.
positive_f1 = f1_score(true_labels, predictions.cpu().numpy())
print(f"F1 Score: {positive_f1:.4f}")

In [None]:
# Save the analysed examples (sentence, gold label, prediction) for manual inspection

import csv

output_file = '/content/EUREKA_results_to_analyse.csv'  # destination path of the CSV

with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Header row first, then one row per example
    writer.writerow(['Sentence', 'True label', 'Prediction'])
    writer.writerows(
        [sentence, true_label, prediction.item()]
        for sentence, true_label, prediction in zip(sentences, true_labels, predictions)
    )

print(f"Results saved to {output_file}")

In [None]:
# Save the checkpoint: zip the checkpoint directory and download the archive
# through the Colab file helper.
# NOTE(review): the zipped directory (/content/output_111/checkpoint-490) differs from
# the checkpoint path the model is loaded from earlier in this notebook
# (/content/EUREKA/output/checkpoint-500) — confirm which checkpoint is intended.

!zip -r eng_trained_model.zip "/content/output_111/checkpoint-490"

from google.colab import files
files.download('eng_trained_model.zip')

Используем обученную сохраненную модель для предсказаний на нашем датасете

In [None]:
import zipfile
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

In [None]:
# Unpack the saved checkpoint archive
zip_path = "PATH_TO_ZIP"  # path to the archive (placeholder)
extract_path = "/content/output_111"  # directory the archive is extracted into

with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)

In [None]:
from transformers import (
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizerFast,
)

# Directory of the extracted checkpoint
checkpoint_dir = "/content/output_111/content/output_111/checkpoint-490"

# Tokenizer comes from the base model on the hub, classifier weights from the checkpoint
tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-large")
model = XLMRobertaForSequenceClassification.from_pretrained(checkpoint_dir)

In [None]:
batch_size = 4

# Choose the device in this cell and move the freshly loaded model onto it: the inputs
# below are sent to `device`, so the model must live there too, and the cell must not
# depend on a `device` variable left over from an earlier part of the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Materialise the sentence list once instead of rebuilding it on every iteration
test_sentences = test_df['utterance'].tolist()
predictions = []

with torch.no_grad():
    for i in range(0, len(test_sentences), batch_size):
        batch = test_sentences[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        # Predicted class per sentence, collected as plain Python ints
        preds = torch.argmax(logits, dim=-1).cpu().tolist()
        predictions.extend(preds)

# Print every test sentence with its predicted class
for sentence, pred in zip(test_sentences, predictions):
    print(f"Sentence: {sentence}")
    print(f"Prediction: {pred}")

In [None]:
# Model evaluation
from sklearn.metrics import f1_score, classification_report

y_true = test_df["label"].tolist()

# `predictions` already holds one predicted class per test sentence, so copy it directly.
y_pred = list(predictions)

# Fail loudly if the predictions do not line up with the test set (e.g. stale
# `predictions` from another run) instead of silently truncating them.
assert len(y_pred) == len(y_true), (
    f"got {len(y_pred)} predictions for {len(y_true)} test rows"
)

# Precision / recall / F1 for each class
print(classification_report(y_true, y_pred, target_names=["No euphemism", "Euphemism"]))