## Environment Setup (.env loading)

In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables from the `.env` file located two directory levels
# above the current working directory (`parents[1]` of the resolved cwd).
# NOTE(review): the data paths used later in this notebook are relative to `../`,
# i.e. only one level up — confirm that `.env` really lives two levels up.
env_path = Path().resolve().parents[1] / '.env'
load_dotenv(dotenv_path=env_path)

def get_env(name, required=True):
    """Look up the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.
        required: When true, an unset or empty variable is treated as an error.

    Returns:
        The variable's value. When ``required`` is false this may be ``None``
        (unset) or an empty string.

    Raises:
        ValueError: If ``required`` is true and the variable is unset or empty.
    """
    found = os.environ.get(name)
    # Happy path first: a non-empty value, or the caller tolerates absence.
    if found or not required:
        return found
    raise ValueError(f"Missing required env var: {name}")

# Credentials later passed to the Telegram client.
# API_ID and API_HASH are mandatory: get_env raises ValueError when either is
# unset or empty. API_ID is converted to int.
API_ID = int(get_env("API_ID"))
API_HASH = get_env("API_HASH")
# PHONE is optional and is None when the variable is not set.
PHONE = get_env("PHONE", required=False)

## Import Dependencies

In [2]:
import re
import os
import pandas as pd
from typing import List, Optional
from telethon import TelegramClient
from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument

## Define Helper Classes

In [3]:
class AmharicTextProcessor:
    """Text clean-up helpers for Amharic message text."""

    @staticmethod
    def preprocess(text: str) -> str:
        """Reduce ``text`` to characters in U+1200–U+137F separated by single spaces.

        Every character that is neither in the range U+1200–U+137F nor
        whitespace is removed (this includes digits, Latin letters and
        punctuation). Runs of whitespace are then collapsed to one space and
        the result is trimmed.

        Args:
            text: Raw message text; any falsy value (``None``, ``""``) is accepted.

        Returns:
            The cleaned string, or ``""`` for falsy input.
        """
        if text:
            amharic_only = re.sub(r'[^\u1200-\u137F\s]', '', text)
            return re.sub(r'\s+', ' ', amharic_only).strip()
        return ""


## Telegram Scraper Class

In [4]:
import os
import pandas as pd
from typing import List, Optional
from telethon import TelegramClient
from telethon.tl.types import MessageMediaPhoto


class TelegramNotebookScraper:
    """Scrape Telegram channel messages (and their photos) into pandas DataFrames.

    Wraps a Telethon ``TelegramClient`` that uses the session name
    ``'notebook_session'``. Typical use from a notebook::

        await scraper.start()
        df = await scraper.fetch_multiple_channels(['@some_channel'])
        await scraper.stop()
    """

    # Column order of every DataFrame produced by this class. Also used to give
    # empty results a schema, so downstream code sees the same columns whether
    # or not any message was fetched.
    _COLUMNS = [
        "channel_title",
        "channel_username",
        "message_id",
        "text",
        "timestamp",
        "views",
        "sender",
        "media_path",
    ]

    def __init__(self, api_id: int, api_hash: str, phone: Optional[str] = None, media_dir: str = 'media'):
        """Store credentials, create ``media_dir`` if needed and build the client.

        Args:
            api_id: Telegram API id.
            api_hash: Telegram API hash.
            phone: Phone number passed to the client on ``start``; may be ``None``.
            media_dir: Directory where downloaded photos are written.
        """
        self.api_id = api_id
        self.api_hash = api_hash
        self.phone = phone
        self.media_dir = media_dir
        os.makedirs(media_dir, exist_ok=True)
        self.client = TelegramClient('notebook_session', self.api_id, self.api_hash)

    async def start(self):
        """Start (connect and sign in) the underlying client."""
        await self.client.start(phone=self.phone)

    async def download_media(self, message, channel_username: str) -> Optional[str]:
        """Download the photo attached to ``message``, if there is one.

        Only ``MessageMediaPhoto`` media is handled; any other media type (or
        no media) yields ``None``. The target file is
        ``<media_dir>/<channel_username>_<message.id>.jpg``.

        Returns:
            The path reported by the client for the saved file, or ``None``
            when the message has no photo or nothing was downloaded.
        """
        if not isinstance(message.media, MessageMediaPhoto):
            return None
        filename = f"{channel_username}_{message.id}.jpg"
        path = os.path.join(self.media_dir, filename)
        # Return what the client reports rather than the requested path, so a
        # download that produced no file is recorded as None instead of as a
        # path that does not exist.
        return await self.client.download_media(message.media, path)

    async def fetch_channel_data(self, channel_username: str, limit: int = 500) -> pd.DataFrame:
        """Fetch up to ``limit`` messages from one channel.

        Best effort: any exception is printed and the rows collected so far
        are returned, so a failing channel does not abort a multi-channel run.

        Returns:
            A DataFrame with the columns listed in ``_COLUMNS`` (possibly
            with zero rows).
        """
        data = []
        try:
            entity = await self.client.get_entity(channel_username)
            channel_title = getattr(entity, 'title', channel_username)
            async for message in self.client.iter_messages(entity, limit=limit):
                text = AmharicTextProcessor.preprocess(message.message or '')
                media_path = await self.download_media(message, channel_username)

                # Prefer the sender's username and fall back to its numeric id.
                sender = None
                if message.sender:
                    sender = getattr(message.sender, 'username', None) or getattr(message.sender, 'id', None)

                data.append({
                    "channel_title": channel_title,
                    "channel_username": channel_username,
                    "message_id": message.id,
                    "text": text,
                    "timestamp": message.date,
                    "views": message.views,
                    "sender": sender,
                    "media_path": media_path
                })
        except Exception as e:
            print(f"Error fetching from {channel_username}: {e}")
        # Passing the columns keeps the schema even when no row was collected.
        return pd.DataFrame(data, columns=self._COLUMNS)

    async def fetch_multiple_channels(self, channels: List[str], limit: int = 500) -> pd.DataFrame:
        """Fetch several channels one after another and stack the results.

        Args:
            channels: Channel usernames to scrape; may be empty.
            limit: Maximum number of messages fetched per channel.

        Returns:
            One DataFrame with a fresh 0..n-1 index. When ``channels`` is
            empty or every channel came back empty, an empty DataFrame with
            the columns in ``_COLUMNS`` is returned.
        """
        frames = []
        for channel in channels:
            print(f"Fetching from: {channel}")
            df = await self.fetch_channel_data(channel, limit)
            frames.append(df)
            print(f"Finished: {channel} — {len(df)} messages")

        # pd.concat raises ValueError on an empty list, so handle the
        # "nothing fetched" case explicitly and only concatenate real data.
        non_empty = [frame for frame in frames if not frame.empty]
        if not non_empty:
            return pd.DataFrame(columns=self._COLUMNS)
        return pd.concat(non_empty, ignore_index=True)

    async def stop(self):
        """Disconnect the underlying client."""
        await self.client.disconnect()


## Run

In [5]:
# Patch asyncio before any coroutine is awaited in the cells below.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the Telegram client must run inside it — confirm whether this
# is still required given the cells use top-level `await`.
import nest_asyncio
nest_asyncio.apply()

## Ingest Messages

In [6]:
scraper = TelegramNotebookScraper(
    API_ID,
    API_HASH,
    PHONE,
    media_dir='../Data/processed/photo'
)

await scraper.start()

channels = [
    '@Fashiontera','@AwasMart','@ethio_brand_collection','@Leyueqa','@helloomarketethiopia','@qnashcom'
]

df = await scraper.fetch_multiple_channels(channels)

await scraper.stop()


Signed in successfully as Kumsa Mergia; remember to not break the ToS or you will risk an account ban!
Fetching from: @Fashiontera
Finished: @Fashiontera — 500 messages
Fetching from: @AwasMart
Finished: @AwasMart — 500 messages
Fetching from: @ethio_brand_collection
Finished: @ethio_brand_collection — 500 messages
Fetching from: @Leyueqa
Finished: @Leyueqa — 500 messages
Fetching from: @helloomarketethiopia
Finished: @helloomarketethiopia — 500 messages
Fetching from: @qnashcom
Finished: @qnashcom — 500 messages


## Save the Data

In [None]:
# Write the combined scrape results to CSV; index=False omits the DataFrame
# index from the file.
df.to_csv('../Data/processed/telegram_messages_data.csv', index=False)

## View the Data

In [8]:
# Preview the first rows of the scraped messages.
df.head()

Unnamed: 0,channel_title,channel_username,message_id,text,timestamp,views,sender,media_path
0,Fashion tera,@Fashiontera,4152,,2025-06-18 18:37:32+00:00,632.0,Fashiontera,../Data/processed/photo\@Fashiontera_4152.jpg
1,Fashion tera,@Fashiontera,4151,ስልክ ፋሽን ተራ አድራሻ አዲስ አበባ ጦር ሀይሎች ድሪም ታወር ተኛ ፎቅ,2025-06-14 13:32:36+00:00,1181.0,Fashiontera,../Data/processed/photo\@Fashiontera_4151.jpg
2,Fashion tera,@Fashiontera,4150,,2025-06-14 13:32:36+00:00,976.0,Fashiontera,../Data/processed/photo\@Fashiontera_4150.jpg
3,Fashion tera,@Fashiontera,4149,ስልክ ፋሽን ተራ አድራሻ አዲስ አበባ ጦር ሀይሎች ድሪም ታወር ተኛ ፎቅ,2025-06-13 19:55:32+00:00,1092.0,Fashiontera,../Data/processed/photo\@Fashiontera_4149.jpg
4,Fashion tera,@Fashiontera,4148,ስልክ ፋሽን ተራ አድራሻ አዲስ አበባ ጦር ሀይሎች ድሪም ታወር ተኛ ፎቅ,2025-06-11 20:11:16+00:00,1113.0,Fashiontera,../Data/processed/photo\@Fashiontera_4148.jpg
