The main purpose of this notebook is to back-translate unlabeled comments (Russian → English → Russian) in order to create augmented versions of them.

#  Data


In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the three tab-separated inputs; the first column of each file is the index.
# NOTE(review): the "labled" spelling presumably matches the file names on disk, so it is kept.
train_labeled, comments_labeled, comments_unlabeled = (
    pd.read_csv(csv_path, index_col=0, sep="\t")
    for csv_path in (
        '../Data_rus/labled_train_data.csv',
        '../Data_rus/labled_train_comments.csv',
        '../Data_rus/unlabled_train_comments.csv',
    )
)

In [None]:
# Stack labeled and unlabeled comments into one frame (original index labels are kept).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
whole_comments = pd.concat([comments_labeled, comments_unlabeled])

# Settings

In [None]:
import torch
from torch import nn

# Pick the torch device used by the rest of the notebook.
if not torch.cuda.is_available():
    # No CUDA device: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is available: use it and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Preprocessing

## Delete senseless comments

In [None]:
# Answers that carry no information and are dropped from both frames.
senseless_answers = ["Больше нечего сказать", "Да", "Нет"]

# Missing values are replaced by the placeholder so they are dropped as well.
whole_comments = whole_comments.fillna("Больше нечего сказать")
clean_whole_comments = whole_comments.loc[
    ~whole_comments["comment"].isin(senseless_answers), ["comment"]
]

# train_labeled is not filled above, so missing comments are excluded explicitly:
# a NaN comment would otherwise reach the text preprocessor, which expects str.
train_is_informative = (
    train_labeled["comment"].notna()
    & ~train_labeled["comment"].isin(senseless_answers)
)
# .copy() makes train_comments an independent frame, so that assigning a new
# "comment" column later does not write into a slice of train_labeled.
train_comments = train_labeled.loc[
    train_is_informative, ["comment", "is_aggressive"]
].copy()
train_comments

## Text preprocessor

In [None]:
class Preprocessor:
    """Strip punctuation from texts and mark sentence boundaries with ``[SEP]``.

    Strings listed in ``end_punktuation`` are treated as sentence terminators;
    every other string in ``punktuation`` is replaced by a space.
    """

    def __init__(self,
                 punktuation: list,
                 end_punktuation: list):
        """Store the punctuation sets.

        Args:
            punktuation: all punctuation strings to handle.
            end_punktuation: strings that end a sentence. They are removed
                from the ``punktuation`` set so ``remove_punkt`` keeps them.
        """
        self.end_punkt = set(end_punktuation)
        self.punkt = set(punktuation) - self.end_punkt

    def _blank_punkt(self, text: str) -> str:
        """Replace every non-terminal punctuation string with a space."""
        for punc in self.punkt:
            text = text.replace(punc, " ")
        return text

    def remove_punkt(self, text: str) -> str:
        """Return ``text`` without non-terminal punctuation.

        All whitespace runs (including newlines and tabs) are collapsed to a
        single space and the result has no leading or trailing whitespace.
        """
        return " ".join(self._blank_punkt(text).split())

    def replace_end_punkt(self, text: str) -> str:
        """Split ``text`` on sentence terminators and join with `` [SEP] ``.

        Each sentence is whitespace-normalized, and sentences that are empty
        or whitespace-only are dropped, so the result never contains doubled
        spaces or dangling separators. Returns ``""`` when nothing is left.
        """
        for punc in self.end_punkt:
            text = text.replace(punc, ".")
        sentences = (" ".join(part.split()) for part in text.split("."))
        return " [SEP] ".join(sentence for sentence in sentences if sentence)

    def preprocess(self, text: list) -> list:
        """Preprocess every string in ``text`` and return the results as a list.

        Punctuation is blanked without collapsing whitespace first, so a
        whitespace terminator such as ``"\\n"`` is still present when the
        text is split into sentences.
        """
        return [self.replace_end_punkt(self._blank_punkt(item)) for item in text]

In [None]:
import string
# Extra characters handled as punctuation on top of string.punctuation.
custom_punkt = ["\t", ")", "("]
punktuation = [*string.punctuation, *custom_punkt]

# Characters that mark the end of a sentence.
end_punktuation = ["!", ".", "?", "\n"]

In [None]:
# Shared preprocessor instance built from the punctuation lists defined above.
preprocessor = Preprocessor(punktuation=punktuation,
                            end_punktuation=end_punktuation)

In [None]:
# Run the labeled comments through the preprocessor and show how many came back.
list_comments = train_comments["comment"].tolist()
prep_comments = preprocessor.preprocess(list_comments)

len(prep_comments)

In [None]:
# Swap the raw comment text for its preprocessed version (same row order).
train_comments = train_comments.assign(comment=prep_comments)

In [None]:
# Notebook display: show train_comments with its current "comment" column.
train_comments

In [None]:
# Keep only the rows whose comment is not the empty string, then display the result.
train_comments = train_comments.loc[train_comments["comment"].ne("")]
train_comments

In [None]:
# Persist the cleaned labeled comments; the index is not written to the file.
train_comments.to_csv(path_or_buf="Data/labeled_comments.csv", index=False)

# Back translation

In [None]:
import pandas as pd
# Load the "comment" column of the comments to augment as a plain Python list.
non_empty_comments = pd.read_csv("Data/ori_comments.csv")["comment"].tolist()

In [None]:
!pip install transformers
!pip install sentencepiece
import sentencepiece
from transformers import MarianMTModel, MarianTokenizer

In [None]:
# Russian -> English model and tokenizer (first leg of the back translation).
target_model_name = 'Helsinki-NLP/opus-mt-ru-en'
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name).to(device)

# English -> Russian model and tokenizer (second leg).
en_model_name = 'Helsinki-NLP/opus-mt-en-ru'
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name).to(device)

# Emit an empty line as the cell's only output.
print()

In [None]:
def add_lang(text: str, lang: str) -> str:
    """Prefix ``text`` with a ``>>lang<<`` tag unless ``lang`` is ``"en"``.

    Args:
        text: the text to tag.
        lang: language code placed inside the tag.

    Returns:
        ``text`` unchanged for ``"en"``, otherwise ``">>{lang}<< {text}"``.
    """
    return text if lang == "en" else f">>{lang}<< {text}"


In [None]:
def translate(texts, model, tokenizer, language):
    """Translate a batch of texts with the given model/tokenizer pair.

    Args:
        texts: iterable of strings; every ``[SEP]`` marker is replaced by a
            space before tokenization.
        model: seq2seq model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        language: label of the target language. It is accepted for call-site
            readability only; this function does not use it.

    Returns:
        A list with one translated string per input text (``[]`` for empty
        input).
    """
    clean_texts = [text.replace("[SEP]", " ") for text in texts]
    if not clean_texts:
        # Nothing to translate; avoid calling the model on an empty batch.
        return []

    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch. Padding to the longest text and truncating to the
    # model limit matches that method's defaults. The tensors are moved to the
    # device the model lives on.
    encoded = tokenizer(
        clean_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)

    # Generate the translation token ids, then decode them back into text.
    translated = model.generate(**encoded)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [None]:
def back_translate(texts, source_lang="ru", target_lang="en"):
    """Translate ``texts`` to the pivot language and back again.

    The module-level ``target_model``/``target_tokenizer`` pair handles the
    first pass and ``en_model``/``en_tokenizer`` the return pass, whatever
    language codes are given; the codes are only forwarded to ``translate``
    as its ``language`` argument.

    Args:
        texts: list of strings to augment.
        source_lang: language code of the input texts.
        target_lang: language code of the intermediate translation.

    Returns:
        The list of back-translated strings produced by the second pass.
    """
    # First pass: source -> target language.
    pivot_texts = translate(texts, target_model, target_tokenizer,
                            language=target_lang)
    # Second pass: target -> source language.
    return translate(pivot_texts, en_model, en_tokenizer,
                     language=source_lang)

In [None]:
# Report how many comments were loaded into non_empty_comments.
print(len(non_empty_comments))

In [None]:
# Back-translate every comment, one at a time, and collect the results.
ori_texts = non_empty_comments
aug_texts = []
verbose = 100  # print a progress marker every `verbose` comments
for i, text in enumerate(ori_texts):
    # back_translate returns a list with one string per input text; extend
    # keeps aug_texts a flat list of strings instead of a list of
    # single-element lists.
    aug_texts.extend(back_translate([text], source_lang="ru", target_lang="en"))
    if i % verbose == 0:
        print(i)


In [None]:
# Put the augmented texts in a single "comment" column and save them without the index.
aug_text = pd.DataFrame({"comment": aug_texts})
aug_text.to_csv("Data/aug_comments.csv", index=False)