## Set up Telegram (TG) clients

In [1]:
import os

from dotenv import load_dotenv
from telethon import TelegramClient, functions
from telethon.sessions import StringSession


In [2]:
scrapers_creds = []

for i in range(1, 8):
    load_dotenv(f"../creds/scraper{i}.env", override=True)
    scrapers_creds.append(
        {
            "session": StringSession(os.getenv("TG_SESSION")),
            "api_id": int(os.getenv("TG_API_ID")),
            "api_hash": os.getenv("TG_API_HASH"),
            "device_model": os.getenv("TG_DEVICE_MODEL"),
            "system_version": os.getenv("TG_SYSTEM_VERSION"),
            "app_version": os.getenv("TG_APP_VERSION"),
        }
    )

tg_clients = {}

for creds in scrapers_creds:
    tg_clients[creds["api_id"]] = TelegramClient(**creds)
    await tg_clients[creds["api_id"]].connect()

## Get channels

In [3]:
import sys

# Put the scraper source directory at the front of the module search path
# (presumably where the `database` package imported below lives - confirm).
scraper_dir = "../scraper/"
sys.path.insert(0, scraper_dir)

In [4]:
# Load the variables defined in ../creds/db.env into the process environment,
# overriding values that are already set. Presumably these are the database
# connection settings read by get_database_session() - TODO confirm.
load_dotenv("../creds/db.env", override=True)

True

In [5]:
from database.database import (
    Channels,
    Peers,
)
from database.session import get_database_session
from sqlalchemy import select
from telethon.types import InputPeerChannel
from tqdm import tqdm


In [8]:
# Collect the id of every row in the Channels table as a flat list.
with get_database_session() as db_session:
    # scalars() yields the first (and only) selected column of each row.
    ch_ids = db_session.execute(select(Channels.id)).scalars().all()

ch_ids = list(ch_ids)
len(ch_ids)

26782

## Scrape pinned messages

In [11]:
# Maps a channel id to the raw text of that channel's pinned message; filled
# by the scraping loop and pickled to disk afterwards.
ch_id_to_pinned_text = {}

In [None]:
for i in tqdm(ch_ids[:100]):
    channel = None  # Channel peer will be stored here
    tg_client = None  # Telegram client will be stored here, picked by scraper_id

    try:
        # Get channel peer, corresponding to the channel id and pick matching scraper (tg_client)
        with get_database_session() as db_session:
            result = db_session.execute(select(Peers).where(Peers.channel_id == i))
            peer = result.scalars().all()

            if not peer:
                continue

            channel = InputPeerChannel(channel_id=peer.channel_id, access_hash=peer.access_hash)
            tg_client = tg_clients[peer.scraper_id]

        full = await tg_client(functions.channels.GetFullChannelRequest(channel))

        pinned_id = full.full_chat.pinned_msg_id
        if not pinned_id:
            continue

        async for post in tg_client.iter_messages(entity=channel, limit=1, ids=pinned_id):
            if post:
                # Save to dict
                ch_id_to_pinned_text[i] = post.raw_text

    except Exception as e:
        print(f"Error while trying to scrape pinned message from {i}")
        print(e)
        continue

In [20]:
import pickle

# Persist the collected {channel id: pinned message text} mapping as a pickle.
pinned_texts_path = "/home/deniskirbaba/Documents/influai-data/pinned_raw_texts.pkl"
with open(pinned_texts_path, "wb") as pickle_file:
    pickle.dump(ch_id_to_pinned_text, pickle_file)