In [1]:
import re
import pandas as pd

# Load the dataset
def load_dataset(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    ``file_path`` is handed straight to ``pandas.read_csv``, so anything that
    function accepts (a path string or a file-like object) works here.
    """
    return pd.read_csv(file_path)

# Emoji / pictograph matcher, compiled once at import time so that per-row
# calls (e.g. through ``Series.apply``) reuse the same compiled pattern.
#
# Only explicit symbol blocks are listed.  A single catch-all range such as
# U+24C2..U+1F251 must NOT be used: it spans most of the Basic Multilingual
# Plane above U+24C2 (CJK, kana, Hangul, Ethiopic Extended, ...) and would
# delete ordinary text written in those scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed alphanumerics/ideographs (incl. flags)
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002702-\U000027B0"  # Dingbats
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U00002934\U00002935"  # curved arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U000024C2"  # circled latin capital M
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Args:
        text: The value to clean.  Only ``str`` values are processed.

    Returns:
        The string with every run of characters matched by
        ``_EMOJI_PATTERN`` deleted.  Non-string values (``None``, NaN,
        numbers, ...) are returned unchanged so the function can be applied
        to a column that still contains missing or non-text entries.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub("", text)

# Function to remove blank lines from the dataset
def remove_blank_lines(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value is not blank.

    A value counts as blank when it is missing (NaN/None/NA) or is a string
    made up only of whitespace (including the empty string).

    Args:
        df: The DataFrame to filter.  It is not modified.
        column_name: Name of the column to inspect.

    Returns:
        A new DataFrame containing only the non-blank rows, with their
        original index labels preserved.
    """
    # Do the blank -> NA substitution on a temporary Series rather than
    # assigning it back into ``df``, so the caller's frame is left untouched.
    blanked = df[column_name].replace(r'^\s*$', pd.NA, regex=True)
    return df.loc[blanked.notna()].copy()

# Function to remove unwanted texts from messages
def remove_unwanted_texts(df, column_name, unwanted_texts):
    """Strip boilerplate snippets and double quotes from a text column.

    For every string value in ``df[column_name]`` the snippets in
    ``unwanted_texts`` are removed in the order given, then all double-quote
    characters are removed, and finally leading/trailing whitespace is
    stripped.  Non-string values (e.g. NaN) are left as they are.

    Args:
        df: The DataFrame to clean.  Its ``column_name`` column is replaced
            in place.
        column_name: Name of the column holding the messages.
        unwanted_texts: Iterable of literal substrings to delete.

    Returns:
        The same ``df`` object, with the cleaned column.
    """
    # Materialise once: the snippets are reused for every row, so a one-shot
    # iterator would otherwise be exhausted after the first message.
    unwanted = tuple(unwanted_texts)

    def _clean(message):
        if not isinstance(message, str):
            return message
        for unwanted_text in unwanted:
            message = message.replace(unwanted_text, "")
        # Quotes are removed only after the snippets so that snippets which
        # themselves contain double quotes can still match.
        return message.replace('"', "").strip()

    # Element-wise map instead of ``iterrows`` + ``df.at``: it is positional,
    # so rows that share an index label are cleaned independently instead of
    # overwriting one another.
    df[column_name] = df[column_name].map(_clean)
    return df

# Save the cleaned dataset to a new file
def save_cleaned_dataset(df, output_file):
    """Write ``df`` to ``output_file`` as CSV (index omitted) and print a confirmation."""
    df.to_csv(path_or_buf=output_file, index=False)
    confirmation = f"Cleaned dataset saved to {output_file}"
    print(confirmation)

# Main function
def main(input_file, output_file, column_name, unwanted_texts):
    """Run the full cleaning pipeline on one CSV file.

    Steps, in order: load the CSV, drop rows whose ``column_name`` value is
    missing, remove emojis from that column, drop rows left blank, strip the
    unwanted snippets and double quotes, and write the result to
    ``output_file``.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path the cleaned CSV is written to.
        column_name: Name of the column containing the messages.
        unwanted_texts: Literal substrings to delete from each message.
    """
    df = load_dataset(input_file)

    # Drop rows with a missing message.  The explicit copy makes the result
    # an independent frame, so the column assignment below does not write
    # into a slice of the loaded data.
    df = df.dropna(subset=[column_name]).copy()

    # Use the caller-supplied column everywhere (not a hard-coded name), so
    # the pipeline works for any message column.
    df[column_name] = df[column_name].apply(remove_emojis)

    # Removing emojis can leave a message empty; discard those rows.
    df = remove_blank_lines(df, column_name)

    # Clean the messages by removing unwanted texts and double quotes.
    cleaned_df = remove_unwanted_texts(df, column_name, unwanted_texts)

    # Save the cleaned dataset to a new CSV file.
    save_cleaned_dataset(cleaned_df, output_file)

if __name__ == "__main__":
    # Script entry point: configure the cleaning run and hand everything to
    # main().
    # Input and output CSV paths, given relative to the current working
    # directory.
    input_file = "../data/modern_Data.csv"
    output_file = "../data/cleaned_dataset.csv"
    
    # Name of the column containing the messages to clean
    column_name = "Message"
    
    # Literal substrings to delete from every message: t.me channel links,
    # digit strings that appear to be phone numbers, and recurring
    # boilerplate phrases.
    # Entries wrapped in double quotes match the quoted form of the phrase;
    # remove_unwanted_texts deletes these snippets first and strips any
    # remaining double quotes afterwards.
    unwanted_texts = [
        "ቴሌግራምt.me/modernshoppingcenter",
        '"በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"',
        "t.me/modernshopping1",
        "t.me/modernshopping2",
        "በስራችን ላይ ቅሬታ ካለዎት ብቻ በዚህ ስልክ ደዉለዉ ያሳዉቁን።",
        "0956415152",
        "0924743736",
        "0974978584",
        '"በሞደርን እቃወዏች ሂወትዎን ሞደርናይዝ ያድርጉ"',
        'የመረጡትን እቃ ለማዘዝ ከታች ባለዉ የቴሌግራም አድራሻ ይላኩልን',
        'ተጀመረ ተጀመረ ተጀመረ',
        'ልዩ እዉነተኛ የበዓል ቅናሽ',
        'ከነሐሴ 29 እስከ መስከረም 7 ድረስ የሚቆይ እዉነተኛ ቅናሽ አድርገናል።',
        'ለክፍለሀገር ደንበኞቻችን ባሉበት ሐገር በመናሐሪያ እንልካለን።',
    ]
    
    # Run the cleaning pipeline with the configuration above
    main(input_file, output_file, column_name, unwanted_texts)




Cleaned dataset saved to ../data/cleaned_dataset.csv
