In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !python -m spacy download ru_core_news_sm

# !pip install nltk

# !pip install textblob

# Базовые библиотеки для обработки естественного языка

- NLTK
- SpaCy
- Gensim
- TextBlob
- Transformers
- Scikit-learn

# Tokenization

## SpaCy

- https://spacy.io/models/

In [None]:
import spacy

def tokenize_text(text, language='en_core_web_sm'):
    """Split ``text`` into token strings with a spaCy pipeline.

    Args:
        text: The string to tokenize.
        language: Either something ``spacy.load`` accepts (for example the
            name of an installed pipeline package) or an already-loaded,
            callable pipeline object. Passing a loaded pipeline lets callers
            load the model once and reuse it instead of paying the load cost
            on every call.

    Returns:
        A list with the ``text`` attribute of every token, in document order.
    """
    # Only hit spacy.load() when we were not handed a ready-to-call pipeline.
    nlp = language if callable(language) else spacy.load(language)
    return [token.text for token in nlp(text)]

# Usage example: tokenize an English sentence with the default pipeline
text = "This is a sample sentence."
tokens = tokenize_text(text)
print(tokens)

In [None]:
# Usage example: the same function with the Russian pipeline selected by name
text = "Привет, как дела?"
tokens = tokenize_text(text, 'ru_core_news_sm')
print(tokens)

## NLTK

- https://www.nltk.org/

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt sentence models. Recent NLTK releases look
# for the "punkt_tab" resource instead of the pickled "punkt" one, so fetch
# both to keep this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize_text(text, language='english'):
    """Tokenize ``text`` into words with NLTK's ``word_tokenize``.

    Note: this definition rebinds the name ``tokenize_text`` that the
    spaCy-based cell earlier in the notebook also defines.

    Args:
        text: The string to tokenize.
        language: Language name forwarded to ``word_tokenize``.

    Returns:
        Whatever ``word_tokenize`` returns for the given text and language.
    """
    return word_tokenize(text, language)

# Usage example: English text with the default language
text = "This is a sample sentence."
tokens = tokenize_text(text)
print(tokens)

In [None]:
# Usage example: Russian text, language passed explicitly
text = "Привет, как дела?"
tokens = tokenize_text(text, 'russian')
print(tokens)

# Lemmatization and stemming

## Lemmatization

In [None]:
def lemmatize_text(text, language='en_core_web_sm'):
    """Return the lemma of every token in ``text`` using a spaCy pipeline.

    Args:
        text: The string to lemmatize.
        language: Either something ``spacy.load`` accepts (for example the
            name of an installed pipeline package) or an already-loaded,
            callable pipeline object. Passing a loaded pipeline avoids
            re-loading the model on every call.

    Returns:
        A list with the ``lemma_`` attribute of every token, in document order.
    """
    # Only hit spacy.load() when we were not handed a ready-to-call pipeline.
    nlp = language if callable(language) else spacy.load(language)
    return [token.lemma_ for token in nlp(text)]

# Usage example: lemmatize an English sentence with the default pipeline
text = "This is a sample sentence."
lemmas = lemmatize_text(text)
print(lemmas)

In [None]:
# Usage example: lemmatize a Russian sentence with the Russian pipeline
text = "Привет, как дела?"
lemmas = lemmatize_text(text, 'ru_core_news_sm')
print(lemmas)

## Stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer

def stem_text(text, language='english'):
    """Tokenize ``text`` and stem every token with a Snowball stemmer.

    Args:
        text: The string to process.
        language: Language name used for the ``SnowballStemmer`` and, when a
            matching tokenizer model is available, for ``nltk.word_tokenize``
            as well.

    Returns:
        A list of stemmed tokens, in the order the tokenizer produced them.
    """
    stemmer = SnowballStemmer(language)
    try:
        # Tokenize with the same language the stemmer was built for, instead
        # of silently using the tokenizer's English default.
        tokens = nltk.word_tokenize(text, language=language)
    except LookupError:
        # No tokenizer resource under this language name: fall back to the
        # tokenizer's default language, which is what this function always
        # used before.
        tokens = nltk.word_tokenize(text)
    return [stemmer.stem(token) for token in tokens]

# Usage example: stem an English sentence.
# The result holds stems, not lemmas, so it is named like the Russian example.
text = "This is a sample sentence."
stemmed_tokens = stem_text(text)
print(stemmed_tokens)

In [None]:
# Usage example: stem a Russian sentence with the Russian Snowball stemmer
text = "Привет, как дела?"
stemmed_tokens = stem_text(text, 'russian')
print(stemmed_tokens)

# Stop words

In [None]:
from nltk.corpus import stopwords

def remove_stopwords(text, language='english'):
    """Drop stop words from whitespace-separated ``text``.

    Words are compared case-insensitively and with leading/trailing ASCII
    punctuation ignored, so tokens such as ``"it."`` or ``"is,"`` are
    recognised as stop words. Words that are kept are emitted unchanged,
    punctuation included.

    Args:
        text: The string to filter.
        language: Name of the NLTK stop-word list to use.

    Returns:
        The remaining words joined by single spaces.
    """
    import string

    try:
        stop_words = set(stopwords.words(language))
    except LookupError:
        # Corpus not installed yet: download once, then retry. This avoids
        # contacting the downloader on every single call.
        nltk.download('stopwords')
        stop_words = set(stopwords.words(language))

    kept_words = [
        word for word in text.split()
        if word.strip(string.punctuation).lower() not in stop_words
    ]
    return ' '.join(kept_words)

# Usage example: filter English stop words
text = "This is a sample sentence."
filtered_text = remove_stopwords(text)
print(filtered_text)

In [None]:
# Usage example: filter Russian stop words
text = "Привет, как дела?"
filtered_text = remove_stopwords(text, 'russian')
print(filtered_text)

# POS tagging

In [None]:
def pos_tagging(text, language='en_core_web_sm'):
    """Tag every token in ``text`` with its part of speech via spaCy.

    Args:
        text: The string to analyse.
        language: Either something ``spacy.load`` accepts (for example the
            name of an installed pipeline package) or an already-loaded,
            callable pipeline object. Passing a loaded pipeline avoids
            re-loading the model on every call.

    Returns:
        A list of ``(token.text, token.pos_)`` tuples, in document order.
    """
    # Only hit spacy.load() when we were not handed a ready-to-call pipeline.
    nlp = language if callable(language) else spacy.load(language)
    return [(token.text, token.pos_) for token in nlp(text)]

# Usage example: POS-tag an English sentence with the default pipeline
text = "This is a sample sentence."
pos_tags = pos_tagging(text)
print(pos_tags)

In [None]:
# Usage example: POS-tag a Russian sentence with the Russian pipeline
text = "Привет, как дела?"
pos_tags = pos_tagging(text, 'ru_core_news_sm')
print(pos_tags)

# Sentiment analysis

In [None]:
from textblob import TextBlob

def sentiment_analysis(text):
    """Return the polarity score TextBlob assigns to ``text``.

    Args:
        text: The string to score.

    Returns:
        The ``polarity`` attribute of the ``TextBlob`` sentiment for ``text``.
    """
    return TextBlob(text).sentiment.polarity

# Usage example: score an English sentence
text = "I love this product!"
sentiment = sentiment_analysis(text)
print(sentiment)

In [None]:
# Usage example: the same call with Russian input
# NOTE(review): TextBlob's default sentiment analyzer is presumably
# English-only, so a Russian sentence may score 0.0 regardless of its tone —
# confirm before relying on this output.
text = "Этот фильм просто потрясающий!"
sentiment = sentiment_analysis(text)
print(sentiment)

# NER (Named Entity Recognition)

In [None]:
def extract_named_entities(text, language='en_core_web_sm'):
    """Extract named entities from ``text`` with a spaCy pipeline.

    Args:
        text: The string to analyse.
        language: Either something ``spacy.load`` accepts (for example the
            name of an installed pipeline package) or an already-loaded,
            callable pipeline object. Passing a loaded pipeline avoids
            re-loading the model on every call.

    Returns:
        A list of ``(ent.text, ent.label_)`` tuples, one per entry in the
        processed document's ``ents``.
    """
    # Only hit spacy.load() when we were not handed a ready-to-call pipeline.
    nlp = language if callable(language) else spacy.load(language)
    return [(ent.text, ent.label_) for ent in nlp(text).ents]

# Usage example: extract entities from an English sentence
text = "Apple Inc. was founded by Steve Jobs."
named_entities = extract_named_entities(text)
print(named_entities)

In [None]:
# Usage example: extract entities from a Russian sentence with the Russian pipeline
text = "Apple Inc. была основана Стивом Джобсом."
named_entities = extract_named_entities(text, 'ru_core_news_sm')
print(named_entities)

# Text generation

## GPT-2

In [None]:
# !pip install tensorflow-text
# !pip install transformers

In [None]:
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

def text_generation(seed_text, model, tokenizer, max_length=100):
    """Continue ``seed_text`` with a TensorFlow causal language model.

    Args:
        seed_text: Prompt the generated text should start from.
        model: Object exposing ``generate(input_ids, max_length=...,
            num_return_sequences=...)``.
        tokenizer: Object exposing ``encode(text, return_tensors='tf')`` and
            ``decode(ids, skip_special_tokens=True)``.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 100, the limit that used to be hard-coded.

    Returns:
        The decoded first sequence returned by ``model.generate``.
    """
    input_ids = tokenizer.encode(seed_text, return_tensors='tf')
    sequences = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
    )
    # A single sequence is requested, so only the first one is decoded.
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# Usage example: load the 'gpt2' checkpoint and continue an English prompt
seed_text = "Once upon a time"
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = TFGPT2LMHeadModel.from_pretrained(model_name)

generated_text = text_generation(seed_text, model, tokenizer)
print(generated_text)

In [None]:
# Usage example: feed a Russian prompt to the same 'gpt2' checkpoint.
# `model` and `tokenizer` are the objects loaded in the previous cell; they are
# reused here instead of instantiating the identical checkpoint a second time.
seed_text = "В некотором царстве, в некотором государстве жил-был"

generated_text = text_generation(seed_text, model, tokenizer)
print(generated_text)

## RU GPT-2

- https://huggingface.co/ai-forever/rugpt2large

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def generate_text(prompt, model_name='sberbank-ai/rugpt2large', max_length=100):
    """Continue ``prompt`` with a PyTorch GPT-2 style checkpoint.

    Both the tokenizer and the model are loaded with ``from_pretrained`` on
    each call, so repeated calls repeat that loading work.

    Args:
        prompt: Text the generation should start from.
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls. Defaults to the checkpoint that used to be hard-coded.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 100, the limit that used to be hard-coded.

    Returns:
        The decoded first sequence returned by ``model.generate``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    sequences = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
    )
    # A single sequence is requested, so only the first one is decoded.
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# Usage example: continue a Russian prompt
prompt = "В некотором царстве, в некотором государстве жил-был"
generated_text = generate_text(prompt)
print(generated_text)

# Keyword extraction

## YAKE

In [None]:
# !pip install yake

- https://github.com/LIAAD/yake
- http://yake.inesctec.pt/

In [None]:
from yake import KeywordExtractor

def extract_keywords(text):
    """Extract keywords from ``text`` with a YAKE extractor set to ``lan="en"``.

    Args:
        text: The string to analyse.

    Returns:
        A list holding the first element of every item the extractor
        returns, in the extractor's order.
    """
    yake_extractor = KeywordExtractor(lan="en")  # an n=2 option was left disabled here
    scored_items = yake_extractor.extract_keywords(text)
    return list(map(lambda item: item[0], scored_items))

# Usage example: extract keywords from a short English text
text = "This is a sample text. We want to extract keywords from it."
keywords = extract_keywords(text)
print(keywords)

## RAKE

In [None]:
# !pip install rake_nltk

- https://csurfer.github.io/rake-nltk/_build/html/index.html

In [None]:
from rake_nltk import Rake

def extract_keywords_rake(text):
    """Extract ranked key phrases from ``text`` with a default ``Rake`` instance.

    Args:
        text: The string to analyse.

    Returns:
        The result of ``get_ranked_phrases()`` after feeding ``text`` to the
        extractor.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()

# Usage example: extract ranked phrases from a short English text
text = "This is a sample text. We want to extract keywords from it."
keywords = extract_keywords_rake(text)
print(keywords)

# Machine translation

## translate

In [None]:
# !pip install translate

- https://translate-python.readthedocs.io/en/latest/providers.html

In [None]:
from translate import Translator

def translate_text(text, source_lang, target_lang):
    """Translate ``text`` with the ``translate`` package's ``Translator``.

    Args:
        text: The string to translate.
        source_lang: Language code passed as ``from_lang``.
        target_lang: Language code passed as ``to_lang``.

    Returns:
        Whatever ``Translator.translate`` returns for ``text``.
    """
    return Translator(from_lang=source_lang, to_lang=target_lang).translate(text)

# Usage example
text = "Hello, how are you?"
translated_text = translate_text(text, 'en', 'fr')  # Translate from English to French
print(translated_text)

In [None]:
text = "Bonjour, comment allez-vous ?"
translated_text = translate_text(text, 'fr', 'en')  # Translate from French to English
print(translated_text)

## googletrans

In [None]:
# !pip install googletrans==4.0.0-rc1

- https://py-googletrans.readthedocs.io/en/latest/

In [None]:
from googletrans import Translator

def translate_text(text, source_lang, target_lang):
    """Translate ``text`` with the ``googletrans`` ``Translator``.

    Note: this definition rebinds the name ``translate_text`` that the
    ``translate``-based cell earlier in the notebook also defines.

    Args:
        text: The string to translate.
        source_lang: Language code passed as ``src``.
        target_lang: Language code passed as ``dest``.

    Returns:
        The ``text`` attribute of the object returned by
        ``Translator().translate``.
    """
    result = Translator().translate(text, src=source_lang, dest=target_lang)
    return result.text

# Usage example
text = "Hello, how are you?"
translated_text = translate_text(text, 'en', 'fr')  # Translate from English to French
print(translated_text)

In [None]:
text = "Bonjour comment allez-vous?"
translated_text = translate_text(text, 'fr', 'en')  # Translate from French to English
print(translated_text)

# Text data augmentation

<img src='https://lh6.googleusercontent.com/x3ZAhTDLT1QVSD8gCdaBVMquM2dcYA15A-orfzXyTzhTP8m0ZKLXz_2NrJdWlTgWKRS7BimExM8RO9Ce_uVVVdRR29vGeP0VZdncDZY0GTwkctocQyYg7HK9VL5ay3QC4JhbSXBK'>

<img src='https://amitness.com/images/nlp-aug-bert-augmentations.png'>

<img src='https://editor.analyticsvidhya.com/uploads/67352blog2_da_tech.jpg'>

- Toxic Comment Classification Challenge https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge
- Дархан Нурахметов - Мое серебро на toxic-comment-classification-challenge https://www.youtube.com/watch?v=ojbfxPsjDA8

Популярные методы аугментации:
- Замена синонимов
- Удаление слов
- Добавление шума
- Случайная перестановка слов
- Генерация случайных предложений
- Замена символов (O->0, 5->%, e->3)
- и т.д.

Библиотеки для аугментации текстовых данных:
- nlpaug https://github.com/makcedward/nlpaug
- TextAttack
    - https://github.com/QData/TextAttack
    - https://textattack.readthedocs.io/en/latest/2notebook/3_Augmentations.html

## nlpaug

In [None]:
# !pip install nlpaug

- https://nlpaug.readthedocs.io/en/latest/augmenter/augmenter.html

### SynonymAug

In [None]:
import nlpaug.augmenter.word as naw

text = "This is a sample text."
# Synonym augmenter created with its default arguments
aug = naw.SynonymAug()

# Usage example
augmented_text = aug.augment(text)
print(augmented_text)

### RandomWordAug

In [None]:
# Usage example: random word augmenter configured with the "swap" action
text = "This is a sample text."
aug = naw.RandomWordAug(action="swap")
augmented_text = aug.augment(text)
print(augmented_text)

### ContextualWordEmbsAug

In [None]:
# Usage example: contextual augmenter using 'bert-base-uncased' with the
# "insert" action
text = 'the brown fox jumps over the lazy dog'
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)
print(augmented_text)

### BackTranslationAug

In [None]:
# !pip install sacremoses

In [None]:
# Usage example: back-translation augmenter created with its default arguments
aug = naw.BackTranslationAug()
text = 'the brown fox jumps over the lazy dog'
augmented_text = aug.augment(text)
print(augmented_text)

### KeyboardAug

In [None]:
import nlpaug.augmenter.char as nac

text = "This is a sample text."
# Character-level keyboard augmenter created with its default arguments
aug = nac.KeyboardAug()
augmented_text = aug.augment(text)

# Usage example
print(augmented_text)

Статьи про аугментацию для текстовых данных:
- https://neptune.ai/blog/data-augmentation-nlp
- https://www.freecodecamp.org/news/how-to-perform-data-augmentation-in-nlp-projects/
- https://towardsdatascience.com/data-augmentation-library-for-text-9661736b13ff
- https://www.analyticsvidhya.com/blog/2022/02/text-data-augmentation-in-natural-language-processing-with-texattack/
- https://amitness.com/2020/05/data-augmentation-for-nlp/