In [1]:
import pandas as pd
import re
from transformers import AutoTokenizer
import os


  from .autonotebook import tqdm as notebook_tqdm


# Load the data


In [2]:
# Load the data
data_path = '../data/telegram_data.csv'
df = pd.read_csv(data_path)

# Display the first few rows
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4158,#መልካም_በዓል ❤\n\n✝ለመላው የክርስትና እምነት ተከታዮች በሙሉ እንኳ...,2024-09-26 14:37:04+00:00,photos\Yebonda_libsoch_4158.jpg
1,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4157,🌼መልካም አዲስ አመት ይሁንልን🌼,2024-09-11 07:26:15+00:00,photos\Yebonda_libsoch_4157.jpg
2,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4156,,2024-09-04 19:59:49+00:00,
3,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4155,♨️ውድ የሀይሚ ቦንዳ ደንበኞቻችን ከሞያሌ ያመጣናቸውን ጥራት ያላቸው \...,2024-09-03 18:55:54+00:00,
4,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4154,ከነገ ጀምሮ ልብሶችን ከ50 ብር ጀምሮ ሱቃችን መጥተው መገበያየት ይችላሉ...,2024-08-28 18:21:13+00:00,


In [3]:
# Initialize the pre-trained tokenizer (multilingual, supports Amharic)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")



In [4]:
# Tokenization function for Amharic text
def tokenize_amharic(text):
    return tokenizer.tokenize(text)

In [8]:
def clean_amharic_text(text):
    if not isinstance(text, str):
        text = ''  # Convert non-string values (e.g., NaN, float) to empty string
    # Replace unwanted characters, normalize spaces, etc.
    text = re.sub(r'[^\\w\\s]', '', text)  # Removing special characters
    text = re.sub(r'\\s+', ' ', text).strip()  # Normalize whitespace
    return text

# Preprocess Amharic text

In [9]:
def preprocess_data(df):
    df['Message'] = df['Message'].fillna('')  ## Handle NaN values
    df['cleaned_text'] = df['Message'].apply(clean_amharic_text)
    df['tokens'] = df['cleaned_text'].apply(tokenize_amharic)
    return df


# Run the preprocessing


In [10]:
df_preprocessed = preprocess_data(df)
df_preprocessed.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,cleaned_text,tokens
0,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4158,#መልካም_በዓል ❤\n\n✝ለመላው የክርስትና እምነት ተከታዮች በሙሉ እንኳ...,2024-09-26 14:37:04+00:00,photos\Yebonda_libsoch_4158.jpg,ss,"[▁s, s]"
1,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4157,🌼መልካም አዲስ አመት ይሁንልን🌼,2024-09-11 07:26:15+00:00,photos\Yebonda_libsoch_4157.jpg,,[]
2,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4156,,2024-09-04 19:59:49+00:00,,,[]
3,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4155,♨️ውድ የሀይሚ ቦንዳ ደንበኞቻችን ከሞያሌ ያመጣናቸውን ጥራት ያላቸው \...,2024-09-03 18:55:54+00:00,,,[]
4,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4154,ከነገ ጀምሮ ልብሶችን ከ50 ብር ጀምሮ ሱቃችን መጥተው መገበያየት ይችላሉ...,2024-08-28 18:21:13+00:00,,,[]


# Save the preprocessed data 


In [11]:
# Save the preprocessed data for later tasks
output_path = '../data/preprocessed_telegram_data.csv'
df_preprocessed.to_csv(output_path, index=False)

print("Preprocessing complete. Data saved to", output_path)

Preprocessing complete. Data saved to ../data/preprocessed_telegram_data.csv
