In [1]:
import re
from pathlib import Path

import nltk
import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# Download necessary resources.
# 'stopwords' backs stopwords.words('english'); 'punkt' is the tokenizer model
# used by word_tokenize on older NLTK releases. Recent NLTK releases load
# 'punkt_tab' instead (NOTE(review): confirm against the installed NLTK
# version), so fetch both to keep a fresh environment working.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Emoji removal
# Compiled once when the cell runs instead of on every call.
# The character class lists emoji / pictograph code points only. A single range
# running from U+24C2 up to U+1F251 would also match ordinary text in between
# (CJK, Hangul, Latin ligatures such as U+FB01, ...), so U+24C2 and the few BMP
# emoji above U+2BEF are listed individually instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U0001f926-\U0001f937"  # gesture / person emoji
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # misc symbols
    "\u24c2"                 # circled M
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph congratulation
    "\u3299"                 # circled ideograph secret
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return `text` with emoji and pictographic symbols removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of `text` in which every run of characters matched by
        `_EMOJI_PATTERN` has been deleted. Letters (including accented Latin
        letters and non-Latin scripts in the Basic Multilingual Plane) are
        left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

# Language detection
def is_language(text, lang_code):
    """Report whether `text` is detected as the language `lang_code`.

    Args:
        text: The text to classify.
        lang_code: The language code to compare the detector's answer with
            (the callers in this notebook pass 'es' and 'fr').

    Returns:
        True when `detect(text)` equals `lang_code`. False when it differs,
        when `text` is blank or not a string, or when the detector raises.
    """
    # Blank or non-string input cannot match any language; skip the detector.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == lang_code
    except Exception:
        # Any detector error counts as "not this language". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt / SystemExit
        # propagate, so a long-running apply can still be interrupted.
        return False

# Generic processing function
# Lazily filled cache so the stopword list is read and the stemmer is built
# once, not once per processed row.
_ENGLISH_RESOURCES = {}


def _english_resources():
    """Return (English stopword set, English Snowball stemmer), built on first use."""
    if not _ENGLISH_RESOURCES:
        _ENGLISH_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _ENGLISH_RESOURCES['stemmer'] = SnowballStemmer("english")
    return _ENGLISH_RESOURCES['stopwords'], _ENGLISH_RESOURCES['stemmer']


def preprocess_description(raw_description, lang_code):
    """Turn a raw description into a string of stemmed English tokens.

    Steps, in order:
      1. Split on newlines and keep only the paragraphs that `is_language`
         accepts for `lang_code`.
      2. Strip emojis, HTML tags, links, e-mail addresses and digits.
      3. Translate the remaining text to English with GoogleTranslator.
      4. Lower-case, tokenize, drop non-alphabetic tokens and English
         stopwords, then stem each token with the English Snowball stemmer.

    Args:
        raw_description: The raw description text. Non-string values (for
            example NaN from an empty spreadsheet cell) yield "".
        lang_code: Language code that paragraphs must be detected as.

    Returns:
        The stemmed tokens joined by single spaces. Returns "" when the input
        is not a string, when nothing is left after filtering and cleaning,
        or when translation fails or yields no text.
    """
    if not isinstance(raw_description, str):
        return ""

    # Keep only paragraphs in the specified language
    paragraphs = raw_description.split('\n')
    lang_paragraphs = [p for p in paragraphs if is_language(p, lang_code)]
    text = ' '.join(lang_paragraphs)

    # Remove emojis, links, emails, HTML, numbers
    text = remove_emojis(text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)

    # Nothing left to translate: skip the network round-trip entirely.
    if not text.strip():
        return ""

    # Translate to English
    try:
        text = GoogleTranslator(source='auto', target='en').translate(text)
    except Exception as e:
        print(f"Translation failed: {e}")
        return ""

    # Guard against an empty or None result so .lower() below cannot fail.
    if not text:
        return ""

    # Tokenize and clean
    english_stopwords, english_stemmer = _english_resources()
    tokens = [
        t for t in word_tokenize(text.lower())
        if t.isalpha() and t not in english_stopwords
    ]

    # Stem in English
    return ' '.join(english_stemmer.stem(t) for t in tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaydonfaal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jaydonfaal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Process the Spanish and French description files
INPUT_DIR = Path('./App Description')
OUTPUT_DIR = Path('./Processed Descriptions')


def process_description_file(language_name, lang_code):
    """Preprocess one language's description spreadsheet and save it as CSV.

    Reads `<language_name>_desc.xlsx` from INPUT_DIR, adds a
    'processed_description' column by running `preprocess_description` on each
    value of the 'Description' column, and writes the result to
    `<language_name>_processed_desc.csv` in OUTPUT_DIR (without the index).

    Args:
        language_name: File-name prefix, e.g. 'spanish' or 'french'.
        lang_code: Language code passed to `preprocess_description`.

    Returns:
        The DataFrame including the new 'processed_description' column.
    """
    # Create the output folder up front so a missing directory fails fast
    # rather than after the slow per-row translation pass.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    df = pd.read_excel(INPUT_DIR / f'{language_name}_desc.xlsx')
    df['processed_description'] = df['Description'].apply(
        lambda x: preprocess_description(x, lang_code)
    )
    df.to_csv(OUTPUT_DIR / f'{language_name}_processed_desc.csv', index=False)
    return df


# Process Spanish file
df_spanish = process_description_file('spanish', 'es')

# Process French file
df_french = process_description_file('french', 'fr')
