# Data preprocessing

This module preprocesses textual reviews to prepare them for further analysis, such as thematic classification and sentiment analysis. The main processing stages are date parsing and filtering, text cleaning, stop-word removal, and lemmatization; together they standardize the text and improve the quality of downstream natural language processing (NLP).

## Setting up the environment

In [1]:
!pip install emoji
!pip install language-tool-python
!pip install nltk pymorphy3 language-tool-python beautifulsoup4

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1
Collecting language-tool-python
  Downloading language_tool_python-2.9.0-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading language_tool_python-2.9.0-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: language-tool-python
Successfully installed language-tool-python-2.9.0
Collecting pymorphy3
  Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3)
  Downloading dawg2_

In [2]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer
from nltk.corpus import stopwords
from datetime import datetime, timedelta
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` is available.
tqdm.pandas()

# Fetch the NLTK resources used by the tokenizers and the stop-word list.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Shared NLP tools: the Russian stop-word set and the morphological analyzer.
russian_stopwords = set(stopwords.words('russian'))
morph = MorphAnalyzer()

# Russian month names, in the form used inside dates such as "5 марта",
# mapped to their month numbers (1-12).
MONTHS = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            'января', 'февраля', 'марта', 'апреля',
            'мая', 'июня', 'июля', 'августа',
            'сентября', 'октября', 'ноября', 'декабря',
        ),
        start=1,
    )
}


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Preprocessing text

In [6]:
def preprocess_text(text):
    """Normalize punctuation before sentence and word tokenization.

    Two substitutions are applied in order:

    1. Every run of three or more dots is replaced with ``". "``.
    2. A space is inserted before ``.``, ``,``, ``!`` or ``?`` whenever the
       mark directly follows a Cyrillic or Latin letter or a digit.

    Args:
        text: Raw review text (``str``).

    Returns:
        The text with both substitutions applied.
    """
    text = re.sub(r'\.{3,}', '. ', text)
    # "ё"/"Ё" are not covered by the а-я / А-Я ranges, so they are listed
    # explicitly; otherwise punctuation after words such as "всё" or "её"
    # would stay glued to the word.
    text = re.sub(r'(?<=[а-яА-ЯёЁa-zA-Z0-9])([.,!?])', r' \1', text)
    return text


def parse_russian_date(date_str):
    """Parse a Russian textual date such as "5 марта 2024" or "5 марта".

    The string is split on whitespace, commas, dots and hyphens. Three
    parts are read as day, month name and year; two parts are read as day
    and month name, and the year is inferred: the current year is used
    unless that would place the date in the future, in which case the
    previous year is used.

    Args:
        date_str: The date text. Any non-string value (None, NaN, numbers,
            lists, ...) is treated as missing.

    Returns:
        A ``datetime`` on success, otherwise ``pd.NaT`` (missing input,
        wrong number of parts, unknown month name, non-numeric day/year,
        or an impossible calendar date).
    """
    # Every missing-value marker is a non-string, so a single type check
    # covers them and also keeps list-like input from raising.
    if not isinstance(date_str, str):
        return pd.NaT

    parts = [p for p in re.split(r'[\s,.-]+', date_str.strip()) if p]
    if len(parts) not in (2, 3):
        return pd.NaT

    try:
        day = int(parts[0])
        year = int(parts[2]) if len(parts) == 3 else None
        month = MONTHS[parts[1].lower()]

        if year is not None:
            return datetime(year, month, day)

        # No year given: assume the most recent occurrence of this day.
        now = datetime.now()
        parsed = datetime(now.year, month, day)
        if parsed > now:
            parsed = datetime(now.year - 1, month, day)
        return parsed
    except (KeyError, ValueError, OverflowError):
        # KeyError: unknown month name; ValueError: non-numeric day/year or
        # an invalid calendar date; OverflowError: absurdly large numbers.
        return pd.NaT


def clean_text_by_sentences(text):
    """Split a review into sentences and clean each one.

    The text is first passed through ``preprocess_text`` and split into
    sentences with the Russian sentence tokenizer. Each sentence then has
    its HTML markup stripped, every character other than Cyrillic/Latin
    letters, spaces and ``.,!?`` replaced by a space, and is lower-cased
    and tokenized. Only tokens that are purely alphabetic, longer than two
    characters and not in ``russian_stopwords`` are kept.

    Args:
        text: Raw review text. Non-string input yields an empty list.

    Returns:
        A list with one space-joined string per sentence that still
        contains at least one word. Sentences left with no words are
        omitted, so a review without usable words yields ``[]`` rather
        than ``['']``. On any processing error the error is printed and
        ``[]`` is returned.
    """
    if not isinstance(text, str):
        return []
    try:
        # Apply the punctuation normalization before sentence splitting.
        text = preprocess_text(text)

        cleaned_sentences = []
        for sentence in sent_tokenize(text, language="russian"):
            sentence = BeautifulSoup(sentence, "html.parser").get_text()
            sentence = re.sub(r'[^а-яА-ЯёЁa-zA-Z .,!?]', ' ', sentence)
            words = word_tokenize(sentence.lower(), language="russian")
            filtered_words = [word for word in words
                              if word not in russian_stopwords
                              and len(word) > 2
                              and word.isalpha()]
            # Skip sentences that lost all their words; keeping '' entries
            # would make an effectively empty review look non-empty.
            if filtered_words:
                cleaned_sentences.append(' '.join(filtered_words))
        return cleaned_sentences
    except Exception as e:
        # Best effort per review: report the problem and keep going.
        print(f"Error clearing text: {e}")
        return []

def lemmatize_cleaned_sentences(sentences):
    """Replace every word of each cleaned sentence with its normal form.

    Each sentence is tokenized with the Russian word tokenizer, and each
    token is replaced by the ``normal_form`` of the first parse returned
    by ``morph``.

    Args:
        sentences: Cleaned sentences (strings). A falsy value yields ``[]``.

    Returns:
        A list of space-joined lemmatized sentences, one per input
        sentence. On any error the error is printed and ``[]`` is returned.
    """
    if not sentences:
        return []
    try:
        return [
            ' '.join(
                morph.parse(token)[0].normal_form
                for token in word_tokenize(sentence, language="russian")
            )
            for sentence in sentences
        ]
    except Exception as e:
        print(f"Lemmatization error: {e}")
        return []


def process_reviews(input_file, output_file, max_age_days=180):
    """Run the full preprocessing pipeline on a CSV file of reviews.

    Steps: read the CSV, parse the ``date`` column, keep only reviews
    dated within the last ``max_age_days`` days, clean the ``review``
    text sentence by sentence, drop reviews with no remaining words,
    lemmatize the rest, and write the selected columns to ``output_file``.
    A short preview of the result is printed.

    Args:
        input_file: Path of the input CSV. The code reads its ``date``,
            ``rating`` and ``review`` columns.
        output_file: Path of the CSV to write (UTF-8, without the index).
        max_age_days: Size of the date window in days, counted back from
            now. Defaults to 180 (roughly six months).

    Returns:
        The processed DataFrame, or an empty DataFrame if any step fails
        (the error is printed).
    """
    try:
        # Reading data
        df = pd.read_csv(input_file)

        # Date processing; to_datetime guarantees a datetime64 column so
        # the comparison and the .dt accessor below always work.
        df['date_parsed'] = pd.to_datetime(
            df['date'].progress_apply(parse_russian_date), errors='coerce'
        )

        # Filtering by date. Rows whose date could not be parsed (NaT)
        # fail the comparison and are dropped as well. The copy avoids
        # assigning new columns onto a slice of the original frame.
        cutoff_date = datetime.now() - timedelta(days=max_age_days)
        df = df[df['date_parsed'] >= cutoff_date].copy()

        # Text processing
        df['text_cleaned'] = df['review'].progress_apply(clean_text_by_sentences)

        # Deleting empty reviews: a review is kept only if at least one of
        # its cleaned sentences is non-empty (so both [] and [''] are
        # dropped). Done before lemmatization to avoid wasted work.
        has_text = df['text_cleaned'].map(any).astype(bool)
        df = df[has_text].copy()

        # Lemmatizing cleaned sentences
        df['text_lemmatized'] = df['text_cleaned'].progress_apply(lemmatize_cleaned_sentences)

        # Formatting the result
        df['date_formatted'] = df['date_parsed'].dt.strftime('%d.%m.%Y')
        result_cols = ['date_formatted', 'rating', 'review', 'text_cleaned', 'text_lemmatized']

        # Saving output
        df[result_cols].to_csv(output_file, index=False, encoding='utf-8')

        # Example data
        if not df.empty:
            print("\nПример данных:")
            preview = df[result_cols].head(6)
            try:
                print(preview.to_markdown(index=False))
            except ImportError:
                # to_markdown needs the optional `tabulate` package; a
                # missing preview dependency must not discard the result.
                print(preview.to_string(index=False))

        return df

    except Exception as e:
        # Top-level boundary of the pipeline: report and return an empty
        # frame so notebook callers always get a DataFrame back.
        print(f"Ошибка обработки: {e}")
        return pd.DataFrame()

In [7]:
# Example run: process the hotel reviews CSV and write the cleaned,
# lemmatized result to a new CSV. The DataFrame returned by
# process_reviews is kept in `processed_data`.
if __name__ == "__main__":
    input_csv = "hotel_reviews_data.csv"
    output_csv = "processed_reviews.csv"
    processed_data = process_reviews(input_csv, output_csv)

100%|██████████| 18/18 [00:00<00:00, 17602.58it/s]
100%|██████████| 12/12 [00:00<00:00, 592.44it/s]
100%|██████████| 12/12 [00:00<00:00, 255.46it/s]


Пример данных:
| date_formatted   |   rating | review                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     | text_cleaned                                               


