# Data preprocessing

This module preprocesses text reviews to prepare them for further analysis, including topic classification and sentiment analysis. The main stages are text cleaning, stop-word removal, and lemmatization; together they standardize the text and improve the quality of downstream natural language processing (NLP).

## Setting up the environment

In [5]:
!pip install emoji
!pip install language-tool-python
!pip install nltk pymorphy3 language-tool-python beautifulsoup4



In [66]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer
from nltk.corpus import stopwords
from datetime import datetime, timedelta
import nltk
from tqdm import tqdm

# Module-level setup: runs once at import time and has side effects
# (nltk.download may fetch data over the network on first use).
# 'punkt' is requested for word_tokenize, 'stopwords' for the stop-word list.
nltk.download('punkt')
nltk.download('stopwords')
# Shared morphological analyzer used by lemmatize_text.
morph = MorphAnalyzer()
# Stored as a set so that membership tests in clean_text are O(1).
russian_stopwords = set(stopwords.words('russian'))

# Russian month names (in the form used after a day number, e.g. "5 марта")
# mapped to month numbers 1-12; looked up by parse_russian_date.
MONTHS = {
    'января': 1, 'февраля': 2, 'марта': 3, 'апреля': 4,
    'мая': 5, 'июня': 6, 'июля': 7, 'августа': 8,
    'сентября': 9, 'октября': 10, 'ноября': 11, 'декабря': 12
}


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing text

In [67]:
def parse_russian_date(date_str):
    """Parse a Russian-language date such as "5 марта 2024" or "5 марта".

    The string is split on whitespace, commas, dots and hyphens. Three
    parts are read as day, month name and year. Two parts are read as day
    and month name; the year is then the current one, unless that would put
    the date in the future, in which case the previous year is used.

    Args:
        date_str: Raw date value. Anything that is not a string (including
            None and NaN) is treated as missing.

    Returns:
        A ``datetime`` on success, otherwise ``pd.NaT`` (wrong number of
        parts, unknown month name, non-numeric day or year, or an
        impossible calendar date).
    """
    # Checking the type first also covers None/NaN and avoids calling
    # pd.isna on list-like values, whose result has no single truth value.
    if not isinstance(date_str, str):
        return pd.NaT

    parts = [p for p in re.split(r'[\s,.-]+', date_str.strip()) if p]
    if len(parts) not in (2, 3):
        return pd.NaT

    # Validate the month name before it is used anywhere.
    month_num = MONTHS.get(parts[1].lower())
    if month_num is None:
        return pd.NaT

    try:
        day_num = int(parts[0])
        if len(parts) == 3:
            return datetime(int(parts[2]), month_num, day_num)

        # No year given: assume the most recent occurrence of that day,
        # looking at the current year first and then the previous one.
        now = datetime.now()
        candidate = datetime(now.year, month_num, day_num)
        if candidate > now:
            candidate = datetime(now.year - 1, month_num, day_num)
        return candidate
    except (ValueError, OverflowError):
        # Non-numeric day/year or a date that does not exist (e.g. 31 June).
        return pd.NaT

def clean_text(text):
    """Strip markup and noise from a review and drop stop words.

    HTML is reduced to its text content, every character other than
    Cyrillic/Latin letters, spaces and ``.,!?`` is replaced by a space, and
    the result is lower-cased and tokenized. Tokens are kept only if they
    are not Russian stop words, are longer than two characters and are
    purely alphabetic.

    Args:
        text: Raw review text. Non-string values yield an empty string.

    Returns:
        The surviving tokens joined by single spaces, or "" if the input is
        not a string or any processing step raises.
    """
    if not isinstance(text, str):
        return ""
    try:
        plain = BeautifulSoup(text, "html.parser").get_text()
        plain = re.sub(r'[^а-яА-ЯёЁa-zA-Z .,!?]', ' ', plain).lower()

        kept = []
        for token in word_tokenize(plain, language="russian"):
            if token in russian_stopwords:
                continue
            # Short tokens and leftover punctuation carry no signal here.
            if len(token) <= 2 or not token.isalpha():
                continue
            kept.append(token)
        return ' '.join(kept)
    except Exception as e:
        print(f"Error clearing text: {e}")
        return ""

def lemmatize_text(text):
    """Replace every token of ``text`` with its normal (dictionary) form.

    The text is tokenized as Russian and each token is passed through the
    shared morphological analyzer; the normal form of the first parse
    returned for the token is used.

    Args:
        text: Text to lemmatize. Falsy values yield an empty string.

    Returns:
        The normal forms joined by single spaces, or "" for falsy input or
        when tokenization/analysis raises.
    """
    if not text:
        return ""
    try:
        normal_forms = []
        for token in word_tokenize(text, language="russian"):
            first_parse = morph.parse(token)[0]
            normal_forms.append(first_parse.normal_form)
        return ' '.join(normal_forms)
    except Exception as e:
        print(f"Lemmatization error: {e}")
        return ""

def process_reviews(input_file, output_file):
    """Run the full preprocessing pipeline on a CSV file of reviews.

    Steps: read ``input_file``; parse the ``date`` column; keep reviews from
    the last 180 days; clean and lemmatize the ``review`` column; drop
    reviews whose cleaned text is empty; write the selected columns to
    ``output_file``; print a three-row preview.

    Args:
        input_file: Path to a CSV that provides the ``date``, ``rating``
            and ``review`` columns.
        output_file: Path of the CSV to write (UTF-8, without the index).

    Returns:
        The filtered DataFrame including the helper columns
        (``date_parsed``, ``text_cleaned``, ``text_lemmatized``,
        ``date_formatted``), or an empty DataFrame if any step fails.
    """
    try:
        # Series.progress_apply only exists after tqdm registers itself
        # with pandas; repeating the registration is harmless.
        tqdm.pandas(desc="Лемматизация")

        # Reading data
        df = pd.read_csv(input_file)

        # Date processing; to_datetime guarantees a datetime64 column so
        # the .dt accessor below works even when no date could be parsed.
        df['date_parsed'] = pd.to_datetime(
            df['date'].progress_apply(parse_russian_date)
        )

        # Filtering by date (last six months). Unparsed dates (NaT) compare
        # as False and are dropped. copy() detaches the result from the
        # original frame so the column assignments below are well-defined.
        cutoff_date = datetime.now() - timedelta(days=180)
        df = df[df['date_parsed'] >= cutoff_date].copy()

        # Text processing
        df['text_cleaned'] = df['review'].progress_apply(clean_text)
        df['text_lemmatized'] = df['text_cleaned'].progress_apply(lemmatize_text)

        # Deleting empty reviews
        df = df[df['text_cleaned'].str.len() > 0].copy()

        # Formatting the result
        df['date_formatted'] = df['date_parsed'].dt.strftime('%d.%m.%Y')
        result_cols = ['date_formatted', 'rating', 'review', 'text_cleaned', 'text_lemmatized']

        # Saving output
        df[result_cols].to_csv(output_file, index=False, encoding='utf-8')

        # Example data
        if not df.empty:
            print("\nПример данных:")
            preview = df[result_cols].head(3)
            try:
                print(preview.to_markdown(index=False))
            except ImportError:
                # to_markdown needs the optional 'tabulate' package; a
                # missing preview dependency must not discard the result.
                print(preview.to_string(index=False))

        return df

    except Exception as e:
        # Top-level boundary of the pipeline: report and return an empty
        # frame so notebook callers always receive a DataFrame.
        print(f"Ошибка обработки: {e}")
        return pd.DataFrame()

In [68]:
# Example run: process the reviews in hotel_reviews_data.csv and write the
# result to processed_reviews.csv. process_reviews returns an empty
# DataFrame if processing fails, so processed_data is always a DataFrame.
if __name__ == "__main__":
    input_csv = "hotel_reviews_data.csv"
    output_csv = "processed_reviews.csv"
    processed_data = process_reviews(input_csv, output_csv)

Лемматизация: 100%|██████████| 18/18 [00:00<00:00, 12841.89it/s]
Лемматизация: 100%|██████████| 12/12 [00:00<00:00, 1279.89it/s]
Лемматизация: 100%|██████████| 12/12 [00:00<00:00, 265.13it/s]


Пример данных:
| date_formatted   |   rating | review                                                                                                                                                                                                                                                                                                                                                                           | text_cleaned                                                                                                                                                                                                                                                                                                             | text_lemmatized                                                                                                                                                                                                                                                           


