In [None]:
!pip install openpyxl



In [1]:
import pandas as pd
import numpy as np

import torch as t
import torch.nn.functional as f

from tqdm.notebook import trange, tqdm

from bs4 import BeautifulSoup

from transformers import pipeline

# Импорт и обработка данных

---

Импортируем данные, удалим HTML-теги и отфильтруем неинформативные сообщения.

In [2]:
# Load the labelled dataset; the first spreadsheet column is used as the index.
data = pd.read_excel(
    '/content/sample_data/dataset_with_labels.xlsx',
    index_col=0,
)

In [3]:
from bs4 import BeautifulSoup

# Strip HTML tags
def html_to_text(html):
    """Convert an HTML fragment to plain text.

    The markup is parsed with BeautifulSoup's ``html.parser``; text nodes are
    joined with a single space, non-breaking spaces (U+00A0) are replaced by
    ordinary spaces and the result is stripped of surrounding whitespace.

    Args:
        html: Markup to convert. Missing values (``None`` / NaN) yield an
            empty string; other non-string scalars are converted with
            ``str()`` before parsing.

    Returns:
        str: The extracted plain text.
    """
    if not isinstance(html, (str, bytes)):
        # Missing cells (None / NaN) cannot be parsed; treat them as empty text
        # instead of raising inside Series.apply.
        if pd.isna(html):
            return ""
        html = str(html)

    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator=" ")
    text = text.replace("\xa0", " ")
    return text.strip()

# Clean every message; the function is passed directly (no lambda wrapper
# needed) and the column is assigned by name.
data["MessageText"] = data["MessageText"].apply(html_to_text)

In [4]:
import re

# Partially get rid of "noise"
def is_noise(text):
    """Tell whether a message carries no useful information.

    A message is considered noise when it is shorter than three characters,
    when it contains no letters at all (only non-word characters, digits and
    underscores), or when it consists solely of ASCII Latin letters.

    Args:
        text: Message text to check.

    Returns:
        bool: True if the message should be dropped, False otherwise.
    """
    # Too short to be meaningful
    if len(text) < 3:
        return True

    noise_patterns = (
        r"^[\W\d_]+$",   # symbols and digits only
        r"^[a-zA-Z]+$",  # Latin letters only
    )
    return any(re.match(pattern, text) for pattern in noise_patterns)

# Keep only the rows whose message is not classified as noise.
data = data.loc[~data["MessageText"].apply(is_noise)]

# Аугментация данных

---

Используем предобученную модель для перефразирования предложений; каждому перефразированному предложению присваиваем метку класса исходного предложения.

Чтобы перефразированные варианты одного и того же сообщения не оказались одновременно в обучающей и валидационной выборках, разделим данные на обучающую и валидационную части заранее, до аугментации.

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified by label, with a fixed seed.
train_data, valid_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["labels"],
)

In [6]:
# Inspect the class balance of the training split
classes_counts = train_data["labels"].value_counts()
classes_counts

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
1,224
2,127
0,33


In [9]:
# Number of paraphrases to generate per example of each class: the ratio of the
# largest class size to this class size (rounded down), multiplied by 2.
# Values are plain Python ints so they can be passed on as generation arguments.
classes_samples = {
    label: 2 * int(classes_counts.max() // count)
    for label, count in classes_counts.items()
}
classes_samples

{1: 2, 2: 2, 0: 12}

In [10]:
import torch as t
from transformers import pipeline
from tqdm.notebook import trange, tqdm

# Russian T5 paraphraser. Run on the first GPU when CUDA is available,
# otherwise fall back to the CPU (device=-1) so the cell works without a GPU.
paraphrase_model = pipeline(
    "text2text-generation",
    model="cointegrated/rut5-base-paraphraser",
    device=0 if t.cuda.is_available() else -1,
)

def paraphrase_text(text, num_return_sequences=3):
    """Generate sampled paraphrases of ``text`` with the global ``paraphrase_model``.

    Args:
        text: Input handed to the paraphrasing pipeline.
        num_return_sequences: Number of candidates requested from the
            pipeline (default 3).

    Returns:
        list: The ``"generated_text"`` value of every item the pipeline returns.
    """
    generation_kwargs = dict(
        max_length=128,
        num_return_sequences=num_return_sequences,
        truncation=True,
        temperature=0.5,
        do_sample=True,
    )
    candidates = paraphrase_model(text, **generation_kwargs)

    texts = []
    for candidate in candidates:
        texts.append(candidate["generated_text"])
    return texts

# Collect the paraphrases together with the label of the class they came from.
aug_texts = []
aug_labels = []

for class_, return_seqs in classes_samples.items():
    # Select the text column by name so the loop does not depend on column order.
    class_texts = train_data.loc[train_data["labels"] == class_, "MessageText"]
    for text in tqdm(class_texts):
        paraphrases = paraphrase_text(text, int(return_seqs))

        aug_texts.extend(paraphrases)
        # One label per returned paraphrase keeps both columns the same length.
        aug_labels.extend([class_] * len(paraphrases))

        t.cuda.empty_cache()

aug_data = pd.DataFrame({'MessageText': aug_texts, 'labels': aug_labels})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/977M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/828k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0


  0%|          | 0/224 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  0%|          | 0/127 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

In [13]:
# Merge the original and augmented examples and drop exact duplicate rows
train_data = pd.concat([train_data, aug_data], axis=0).drop_duplicates()

# Shuffle the rows; the fixed seed (same value as the train/validation split)
# makes the saved row order reproducible across runs.
train_data = train_data.sample(frac=1, random_state=42)

In [14]:
# Save both splits as Excel workbooks
for frame, path in (
    (train_data, '/content/train_dataset.xlsx'),
    (valid_data, '/content/valid_dataset.xlsx'),
):
    frame.to_excel(path)