# In [1]:
import re
import pandas as pd

# Load the raw comments spreadsheet
data = pd.read_excel('uncleaned.xlsx')

# Treat the 'Comments' column as text. Missing cells are blanked first:
# astype(str) on its own turns NaN into the literal word "nan", which would
# then be cleaned and saved as if it were a real comment.
data['Comments'] = data['Comments'].fillna('').astype(str)

# Patterns used by remove_punc_special_chars_and_emojis. They are compiled once
# at import time instead of on every call.

# ASCII, typographic and Ethiopic punctuation / special characters
_PUNCTUATION_PATTERN = re.compile(r'[!\@#\$%\^«»&\*\(\)…\[\]\{\};“”›’‘"\'\:,.\‹\/<>\?\\\\|`´~\-=\+፡።፤;፦፥፧፨፠፣_]')

# ASCII digits plus the full Ethiopic numeral range U+1369 (፩) - U+137C (፼),
# i.e. the units, the tens 10-90, 100 and 10000.
_NUMBERS_PATTERN = re.compile(r'[0-9\u1369-\u137C]')

_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # chess symbols, symbols & pictographs extended-A
    "\U00002702-\U000027B0"  # dingbats
    # NOTE: this very wide range is kept for backward compatibility. Besides
    # enclosed characters and misc symbols it also spans CJK, Hangul and the
    # Ethiopic Extended blocks, so those letters are deleted as well.
    "\U000024C2-\U0001F251"
    "\u200D"                 # zero-width joiner left behind by emoji sequences
    "]+",
    flags=re.UNICODE,
)


# Function to remove punctuation, special characters, numbers, and emojis
def remove_punc_special_chars_and_emojis(word):
    """Return *word* with punctuation, digits and emoji deleted.

    Matched characters are removed outright (not replaced by a space), so
    text on either side of a removed character is joined together.

    Args:
        word: The text to clean.

    Returns:
        The cleaned text.
    """
    word = _PUNCTUATION_PATTERN.sub('', word)
    word = _NUMBERS_PATTERN.sub('', word)
    return _EMOJI_PATTERN.sub('', word)

# Function to shorten runs of a repeated character (e.g., hapyyyyyyyyyyy to hapy)
def normalize_repeated_chars(word, max_repeat=1):
    """Shorten every run of one repeated character to ``max_repeat`` copies.

    With the default ``max_repeat=1`` each run collapses to a single
    character, so genuine double letters are shortened too
    ("good" -> "god", "will" -> "wil"). Pass ``max_repeat=2`` to trim only
    runs of three or more and leave ordinary doubles alone.

    Newlines are never collapsed because ``.`` does not match them.

    Args:
        word: The text to normalize.
        max_repeat: Maximum number of consecutive copies of a character to
            keep. Must be at least 1.

    Returns:
        The text with long character runs shortened.

    Raises:
        ValueError: If ``max_repeat`` is smaller than 1.
    """
    if max_repeat < 1:
        raise ValueError("max_repeat must be at least 1")
    # A run is "too long" when the character is followed by at least
    # max_repeat further copies of itself.
    run_pattern = r'(.)\1{%d,}' % max_repeat
    return re.sub(run_pattern, lambda run: run.group(1) * max_repeat, word)

# Contraction rules, applied in order. Whole-word special cases come first so
# that "won't" is not caught by the generic "n't" rule. Every rule accepts both
# the ASCII apostrophe and the typographic one (’), ignores case, and the
# suffix rules only fire at the end of a word so that a quoted word such as
# 'really' is left untouched.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"\bwon['’]t\b", "will not"),
        (r"\bcan['’]t\b", "cannot"),
        (r"n['’]t(?!\w)", " not"),
        (r"['’]re(?!\w)", " are"),
        (r"['’]s(?!\w)", " is"),
        (r"['’]d(?!\w)", " would"),
        (r"['’]ll(?!\w)", " will"),
        (r"['’]t(?!\w)", " not"),
        (r"['’]ve(?!\w)", " have"),
        (r"['’]m(?!\w)", " am"),
    )
)


# Function to handle English contractions
def replace_contractions(text):
    """Expand common English contractions in *text*.

    Matching is case-insensitive and the replacement is always lower case
    ("Won't" -> "will not"). "'s" is always expanded to " is", so possessives
    are expanded as well ("John's" -> "John is").

    Args:
        text: The text to expand.

    Returns:
        The text with contractions replaced by their expanded forms.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

# Function to drop stop words from whitespace-separated text
def remove_stop_words(text, stop_words):
    """Return *text* without the tokens whose lower-cased form is a stop word.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces, keeping their original casing.

    Args:
        text: The text to filter.
        stop_words: Container of lower-case stop words to drop.

    Returns:
        The remaining tokens joined by single spaces.
    """
    def is_content_word(token):
        return token.lower() not in stop_words

    return ' '.join(filter(is_content_word, text.split()))

# Stop words dropped by remove_stop_words (extend as needed).
# All entries are lower case because tokens are lower-cased before lookup.
stop_words = {
    # English
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll',
    'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
    "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
    'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
    # Amharic (a starter list)
    'እኔ', 'አንተ', 'እርስዎ', 'እሱ', 'እሷ', 'እኛ', 'እናንተ', 'እነሱ', 'ይህ', 'ያ', 'ይሄ', 'እዚህ', 'እዛ', 'እነዚህ', 'እነዚያ', 'እንደ', 'በኩል',
}

# Apply the preprocessing steps.
# Contractions are expanded first: the punctuation pass deletes apostrophes and
# the repeated-character pass shortens "'ll" to "'l", so running either of them
# before replace_contractions would leave it with nothing to match.
data['Comments'] = data['Comments'].apply(replace_contractions)
data['Comments'] = data['Comments'].apply(remove_punc_special_chars_and_emojis)
data['Comments'] = data['Comments'].apply(normalize_repeated_chars)
data['Comments'] = data['Comments'].apply(lambda x: remove_stop_words(x, stop_words))

# Replacement rules for normalize_char, applied in the order written.
# The first group folds characters that are written differently but read the
# same into one canonical character. The second group contracts a second-order
# (-u) character followed by ዋ or አ into the single labialized (-wa) character,
# e.g. በልቱዋል / በልቱአል -> በልቷል.
_CHAR_REPLACEMENTS = {
    '[ኀሐኸ]': 'ሀ', '[ሑኁዅ]': 'ሁ', '[ኂሒኺ]': 'ሂ', '[ኃሓኻ]': 'ሃ', '[ኌሔኼዄ]': 'ሄ',
    '[ሕኅኽ]': 'ህ', '[ኆሖኾ]': 'ሆ', '[ሠ]': 'ሰ', '[ሡ]': 'ሱ', '[ሢ]': 'ሲ',
    '[ሣ]': 'ሳ', '[ሤ]': 'ሴ', '[ሥ]': 'ስ', '[ሦ]': 'ሶ', '[ዐ]': 'አ',
    '[ዑ]': 'ኡ', '[ዒ]': 'ኢ', '[ዓ]': 'ኣ', '[ዔ]': 'ኤ', '[ዕ]': 'እ',
    '[ዖ]': 'ኦ', '[ጸ]': 'ፀ', '[ጹ]': 'ፁ', '[ጺ]': 'ፂ', '[ጻ]': 'ፃ',
    '[ጼ]': 'ፄ', '[ጽ]': 'ፅ', '[ጾ]': 'ፆ', '[ቊ]': 'ቁ', '[ኵ]': 'ኩ',
    '[ሗ]': 'ኋ', '(ቱ[ዋአ])': 'ቷ', '(ሩ[ዋአ])': 'ሯ', '(ሱ[ዋአ])': 'ሷ',
    '(ሹ[ዋአ])': 'ሿ', '(ቁ[ዋአ])': 'ቋ', '(ቡ[ዋአ])': 'ቧ', '(ቹ[ዋአ])': 'ቿ',
    '(ሁ[ዋአ])': 'ኋ', '(ኑ[ዋአ])': 'ኗ', '(ኙ[ዋአ])': 'ኟ', '(ኩ[ዋአ])': 'ኳ',
    '(ዙ[ዋአ])': 'ዟ', '(ጉ[ዋአ])': 'ጓ',
    # Second-order ዱ, like every sibling rule. Matching first-order ደ here
    # corrupted ordinary words such as ይወደዋል.
    '(ዱ[ዋአ])': 'ዷ',
    '(ጡ[ዋአ])': 'ጧ', '(ጩ[ዋአ])': 'ጯ',
    # ጹ has already been folded into ፁ by the rule above, so the labialized
    # rule has to look for ፁ or it can never match.
    '(ፁ[ዋአ])': 'ጿ',
    '(ፉ[ዋአ])': 'ፏ',
}

# Compiled once at import time; dict order (= application order) is preserved.
_COMPILED_CHAR_REPLACEMENTS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in _CHAR_REPLACEMENTS.items()
)


# Function for character level normalization
def normalize_char(token):
    """Normalize Amharic spelling variants in *token*.

    Homophone characters are folded into one canonical character, then a
    second-order character followed by ዋ or አ is contracted into the
    matching labialized character.

    Args:
        token: The text to normalize.

    Returns:
        The normalized text.
    """
    for pattern, replacement in _COMPILED_CHAR_REPLACEMENTS:
        token = pattern.sub(replacement, token)
    return token

# Run character-level normalization over every comment
normalized_comments = data['Comments'].apply(normalize_char)
data['Comments'] = normalized_comments

# Write the cleaned data to a new workbook, leaving out the DataFrame index
data.to_excel('cleaned.xlsx', index=False)
