> # Real Time data extraction from telegram channel

In [1]:
import os
os.chdir('..')
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

>> ## Data Ingestion and  Data Preprocessing

In [2]:
from scripts.telegram_scrapper import start_scraping


>>> ### Identify and connect to relevant Telegram channels

In [3]:
channel = ['@ZemenExpress',            
           '@nevacomputer', 
           '@helloomarketethiopia',
           '@forfreemarket',
           '@Shewabrand']       
        
# Call the scraping function
start_scraping(channel)


Scrapping data...
Scraping historical data from @ZemenExpress (Zemen Express®)...
Finished scraping @ZemenExpress
Scraping historical data from @nevacomputer (NEVA COMPUTER®)...
Finished scraping @nevacomputer
Scraping historical data from @helloomarketethiopia (HellooMarket)...
Finished scraping @helloomarketethiopia
Scraping historical data from @forfreemarket (4Free Market🇪🇹)...
Finished scraping @forfreemarket
Scraping historical data from @Shewabrand (Shewa Brand)...
Finished scraping @Shewabrand
Listening for real-time messages...


In [6]:
data = pd.read_csv("telegram_data.csv", delimiter=",")


import re

In [8]:

data.head(10)

Unnamed: 0,Message Date,Sender ID,Message ID,Product Description
0,2025-01-16 12:49:25,-1001307493052,6020,የጡት ጫፍ ዋጋ፦ 400 ብር አድራሻ ቁ 1 መገናኛ መሰረት ደፋር ሞል ሁለ...
1,2025-01-16 06:29:40,-1001307493052,6016,2 1 ከመቀነሻ በተጨማሪ እንደ ስፕሬይ የሚያገለግል ዋጋ፦ 1100 ብር ው...
2,2025-01-16 06:28:10,-1001307493052,6015,2 1 ከመቀነሻ በተጨማሪ እንደ ስፕሬይ የሚያገለግል ዋጋ፦ 1100 ብር ው...
3,2025-01-15 05:43:58,-1001307493052,6013,መልካም በአል ለአጠቃቀም በጣም ምቹ እና ዘመናዊ ቆሻሻ የማይዝ 7000 ዋ...
4,2025-01-14 13:29:49,-1001307493052,6011,ከጠንካራ የተሰራ ለረጅም ሰአት ሙቀትን ጠብቆ ማቆየት የሚችል 1 5 24 ...
5,2025-01-14 08:17:04,-1001307493052,6007,300 750 360 ዋጋ፦ 2200 ብር ውስን ፍሬ ነው ያለው አድራሻ ቁ 1...
6,2025-01-14 06:39:27,-1001307493052,6006,የፀጉር ማስተካከያ የራስ ፀጉር እና ፂምን በስርአት እና በጥራት ለማስተካ...
7,2025-01-13 17:13:16,-1001307493052,6005,3 ዋጋ፦ 1700 ብር ውስን ፍሬ ነው ያለው አድራሻ ቁ 1 መገናኛ መሰረት...
8,2025-01-13 17:13:07,-1001307493052,6004,3 ዋጋ፦ 1700 ብር ውስን ፍሬ ነው ያለው አድራሻ ቁ 1 መገናኛ መሰረት...
9,2025-01-13 17:12:58,-1001307493052,6003,3 ዋጋ፦ 1700 ብር ውስን ፍሬ ነው ያለው አድራሻ ቁ 1 መገናኛ መሰረት...


In [13]:

from amseg import AmharicSegmenter
sent_punct = []
word_punct = []
segmenter = AmharicSegmenter(sent_punct, word_punct)

### Preprocess text data by tokenizing, normalizing, and handling Amharic-specific linguistic features.

#### Tokenize Amharic text

In [14]:

def tokenize_amharic_text(text):
    """
    Tokenizes Amharic text using nltk word_tokenize 
    """
    tokens = segmenter.amharic_tokenizer(text)
    return tokens

### Normalizes Amharic text

In [16]:
def normalize_amharic_text(text):
    """
    Normalizes Amharic text by removing non-Amharic characters and handling case sensitivity.
    """
    text = text.lower()
    # Replace special punctuation marks with a space
    text = re.sub(r'[፡።፣፤፥፦]', ' ', text)
    # Remove any extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [29]:
def preprocess_amharic_text(text):
    normalized_text = normalize_amharic_text(text)
    tokens = tokenize_amharic_text(normalized_text)
    print(tokens)
    return tokens

In [32]:
# Fill NaN values with empty strings
data["Product Description"] = data["Product Description"].fillna("")

In [33]:
data["Product Tokens"] = data["Product Description"].apply(preprocess_amharic_text)
data.to_csv("telegram_data_preprocessed.csv", index=False)


['የጡት', 'ጫፍ', 'ዋጋ', '400', 'ብር', 'አድራሻ', 'ቁ', '1', 'መገናኛ', 'መሰረት', 'ደፋር', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ', '05', '06', 'ቁ', '2', 'ፒያሳ', 'ጊዮርጊስ', 'አደባባይ', 'ራመት_ታቦር_ኦዳ_ህንፃ', '1ኛ', 'ፎቅ', 'ሱቅ', 'ቁ', '1', '-107', '0902660722', '0928460606', 'ፒያሳ', 'ቅርንጫፍ', '0941337070', 'በ', 'ለማዘዝ', 'ይጠቀሙ', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን']
['2', '1', 'ከመቀነሻ', 'በተጨማሪ', 'እንደ', 'ስፕሬይ', 'የሚያገለግል', 'ዋጋ', '1100', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'ቁ', '1', 'መገናኛ', 'መሰረት', 'ደፋር', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ', '05', '06', 'ቁ', '2', 'ፒያሳ', 'ጊዮርጊስ', 'አደባባይ', 'ራመት_ታቦር_ኦዳ_ህንፃ', '1ኛ', 'ፎቅ', 'ሱቅ', 'ቁ', '1', '-107', '0902660722', '0928460606', 'ፒያሳ', 'ቅርንጫፍ', '0941337070', 'በ', 'ለማዘዝ', 'ይጠቀሙ', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን']
['2', '1', 'ከመቀነሻ', 'በተጨማሪ', 'እንደ', 'ስፕሬይ', 'የሚያገለግል', 'ዋጋ', '1100', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'ቁ', '1', 'መገናኛ', 'መሰረት', 'ደፋር', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ', '05', '06', 'ቁ', '2', 'ፒያሳ', 'ጊዮርጊስ', 'አደባባይ', 'ራመት_ታቦር_ኦዳ_ህንፃ', '1ኛ', 'ፎቅ', 'ሱቅ', 'ቁ', '1', '-107', '0902660722', '0

>> ### Label a Subset of Dataset in CoNLL Format

####  Identify and label entities such as products, price, and Location in Amharic text.


In [35]:
product_keywords =['የጡት', 'ጫፍ','የማይዙ', 'ማራኪ', 'ግራናይት', 'ቅብ', 'ድስቶች',  'መጥበጫ',  'ክዳናቸው',  'ጭልፋዎች', 'ስቲከር', 'የሚያገለግል', 
                   'የጣት', 'ጥፍር', 'መከላከያ', 'ሽንኩርት', 'ሲልጡ', 'ዝንጅብል',  'በርበሬ', 'ዘለላ', 'ሲቀነጥሱ', 'ጥፍርን', 'ከጉዳት', 'የሚከላከል',
                   'ለጨጨብሳ', 'ለፈጢራ', 'ለዳቦ', 'ለጥብስና', 'ለተለያዩ', 'አገልግሎቶች', 'የሚሆን',  'በጥራት', 'የተሻለ', 'አነስተኛ', 'የኤሌክትሪክ', 
                   
                   'ፍጆታ', 'የሚጠቀም','የፀጉር', 'ማስተካከያ', 'የራስ', 'ፀጉር', 'ፂምን', 'በስርአት', 'በጥራት', 'ለማስተካከል', 'አስተማማኝ', 'ማሽን', 
                   'በሚፈልጉት', 'መጠን', 'ለመቁረጥ', 'መቀያየሪ', 'ቁጥሮች', 'ያሉት', 'በቻርጅ', 'የሚሰራ', 'አስተማምኝ', 'ባትሪ','የውሀ', 'ማቅረቢያ', 'ውብ',  'ማራኪ']

location_keywords= ['ሁለተኛ','ፎቅ','ቢሮ', 'ቁ', '05', '06', 'ፒያሳ', 'ጊዮርጊስ', 'አደባባይ', 'ራመት_ታቦር_ኦዳ_ህንፃ', 
                    'ቅርንጫፍ',  'መገናኛ', 'መሰረት', 'ደፋር', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ድሬዳዋ', 'አሸዋ', 'ሚና', 'ህንፃ','ሱቅ']
price_keywords = ['ብር', 'ዋጋ']

#### CoNLL Format

In [37]:
def label_tokens(tokens):
    labels = []
    inside_price = False
    inside_loc=False
    inside_prod = False
    for token in tokens:
        if token in product_keywords and inside_prod:
            labels.append('I-Product')
        elif token in product_keywords:
            labels.append('B-Product')
            inside_prod =True    
        elif token in location_keywords and inside_loc:
            labels.append('I-LOC')
        elif token in location_keywords:
            labels.append('B-LOC')
            inside_loc = True
        elif token in price_keywords: 
            labels.append("B-PRICE")  
            inside_price = True
        elif token.isdigit() and inside_price:  
            labels.append("I-PRICE")  
        elif token.isdigit(): 
            labels.append("O")  
            inside_price = False
        else:
            labels.append("O")
            inside_price = False
        
    return labels


In [38]:
tokens1 = data['Product Tokens']


In [39]:
labeled_tokens = data['Product Tokens'].apply(label_tokens)


In [40]:
coNLL_data=[]
for token, label in zip(tokens1, labeled_tokens):
    for t,l in zip(token,label):   
        coNLL_data.append(f"{t} {l}")
    coNLL_data.append("")
coNLL_data="\n".join(coNLL_data)

##### Save to a text file in CoNLL format

In [41]:

with open('labeled_data.conll', 'w', encoding='utf-8') as f:
    f.write(coNLL_data)

print("Labeled data has been saved in CoNLL format.")

Labeled data has been saved in CoNLL format.
