<a href="https://colab.research.google.com/github/khalidjasir/ui-km-paper/blob/main/KM_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Google Drive

In [1]:
# Mount Google Drive when running in Colab. Outside Colab the google.colab
# package does not exist, so the mount is skipped instead of crashing the
# first cell of the notebook.
try:
  from google.colab import drive
except ImportError:
  print('Not running in Colab; skipping Google Drive mount')
else:
  drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Install necessary libraries

In [None]:
# Install every third-party package the notebook imports.
# numpy, scipy, scikit-learn, gensim and pandas are pinned to exact versions;
# the remaining packages are installed without a version pin.
%pip install numpy==1.24.4
%pip install scipy==1.10.1
%pip install scikit-learn==1.2.2
%pip install gensim==4.3.2
%pip install pandas==1.5.3
%pip install fasttext
%pip install tqdm
%pip install ipywidgets
%pip install swifter
%pip install emoji
%pip install indonlp
%pip install openpyxl
%pip install telethon
%pip install pytz
%pip install python-dotenv
%pip install networkx
%pip install pyldavis
%pip install beautifulsoup4

## Detect environment

In [2]:
import os
import sys
import subprocess

def detect_environment():
  """Classify the current runtime as 'colab', 'conda' or 'local'.

  Colab is recognised by a successful ``import google.colab``. Conda is
  recognised by the ``CONDA_PREFIX`` environment variable or by a
  ``conda-meta`` directory under ``sys.prefix``. Everything else is 'local'.
  """
  try:
    import google.colab
  except ImportError:
    running_in_colab = False
  else:
    running_in_colab = True

  if running_in_colab:
    return 'colab'

  conda_meta_dir = os.path.join(sys.prefix, 'conda-meta')
  has_conda_marker = 'CONDA_PREFIX' in os.environ or os.path.exists(conda_meta_dir)
  return 'conda' if has_conda_marker else 'local'

# Evaluated once; later cells branch on this value.
env = detect_environment()

## Handle Key

In [None]:
# Telegram API credentials. They stay empty strings when the environment is
# neither Colab nor conda.
api_id = ''
api_hash = ''

if env not in ('colab', 'conda'):
  print('Unable to detect suitable environment!')
elif env == 'colab':
  # Colab: both values come from the notebook's user secrets.
  from google.colab import userdata
  api_id = userdata.get('api_id')
  api_hash = userdata.get('api_hash')
else:
  # conda: load_dotenv() is called first, then both values are read from
  # the process environment.
  from dotenv import load_dotenv
  load_dotenv()
  api_id = os.getenv('api_id')
  api_hash = os.getenv('api_hash')

## Retrieve from telegram

In [None]:
import csv
import time
from datetime import datetime
from telethon import TelegramClient
from pytz import timezone

# Session name passed to TelegramClient in main().
session_name = 'my_session'
# Link of the Telegram chat whose messages are exported.
channel_input = 'https://t.me/diskusipajak'
# Date bounds and the exported 'date' column are expressed in Jakarta time.
wib_timezone = timezone('Asia/Jakarta')
# Export window: messages are requested from start_date onwards and the
# export stops at the first message dated after end_date.
start_date = wib_timezone.localize(datetime(2015, 1, 1))
end_date = wib_timezone.localize(datetime(2025, 4, 30))
# The run timestamp is embedded in the file name so each export gets its own CSV.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = f'telegram_messages_{timestamp}.csv'
# Number of rows buffered in memory before they are appended to the CSV.
batch_size = 10000

async def main():
  """Export messages from channel_input into csv_filename.

  Messages are requested oldest-first from start_date onwards
  (offset_date with reverse=True) and the loop stops at the first message
  dated after end_date. Rows are buffered in memory and appended to the CSV
  every batch_size messages, followed by a 10-second pause. Any remaining
  rows are written at the end.
  """
  # Local import: only needed for the non-blocking pause below.
  import asyncio

  def append_rows(rows):
    """Append the given rows to the CSV file, quoting every field."""
    with open(csv_filename, mode='a', newline='', encoding='utf-8') as file:
      csv.writer(file, quoting=csv.QUOTE_ALL).writerows(rows)

  async with TelegramClient(session_name, api_id, api_hash) as client:
    channel = await client.get_entity(channel_input)

    buffer = []
    total_count = 0

    # Create (or truncate) the file and write the header row once.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
      writer = csv.writer(file, quoting=csv.QUOTE_ALL)
      writer.writerow([
        'id',
        'date',
        'text',
        'sender_id',
        'chat_id',
        'reply_to_msg_id',
        'views',
        'forwards',
        'buttons',
        'raw_text',
        'message_link'
      ])

    async for msg in client.iter_messages(channel, offset_date=start_date, reverse=True):
      if msg.date > end_date:
        break

      # Newlines are flattened to spaces so every message stays on one CSV line.
      row = [
        msg.id,
        msg.date.astimezone(wib_timezone).strftime('%a %b %d %H:%M:%S %z %Y') if msg.date else "",
        msg.text.replace('\n', ' ').strip() if msg.text else "",
        msg.sender_id if msg.sender_id else '',
        getattr(msg, 'chat_id', getattr(msg.to_id, 'channel_id', '')),
        msg.reply_to_msg_id if msg.reply_to_msg_id else '',
        msg.views if msg.views is not None else '',
        msg.forwards if msg.forwards is not None else '',
        len(msg.buttons) if msg.buttons else 0,
        msg.raw_text.replace('\n', ' ').strip() if msg.raw_text else '',
        f'https://t.me/{channel.username}/{msg.id}'
      ]

      buffer.append(row)
      total_count += 1

      if total_count % batch_size == 0:
        append_rows(buffer)
        buffer.clear()
        print(f'Written {total_count} messages. Sleeping for 10 seconds...')
        # time.sleep() would block the event loop the Telegram client runs
        # on; asyncio.sleep() pauses only this coroutine.
        await asyncio.sleep(10)

    if buffer:
      append_rows(buffer)
      print(f'Final batch written. Total messages: {total_count}')

    print(f'Done! Messages saved to {csv_filename}.')

await main()

Written 10000 messages. Sleeping for 10 seconds...
Written 20000 messages. Sleeping for 10 seconds...
Written 30000 messages. Sleeping for 10 seconds...
Written 40000 messages. Sleeping for 10 seconds...
Written 50000 messages. Sleeping for 10 seconds...
Written 60000 messages. Sleeping for 10 seconds...
Written 70000 messages. Sleeping for 10 seconds...
Written 80000 messages. Sleeping for 10 seconds...
Written 90000 messages. Sleeping for 10 seconds...
Written 100000 messages. Sleeping for 10 seconds...
Written 110000 messages. Sleeping for 10 seconds...
Written 120000 messages. Sleeping for 10 seconds...
Written 130000 messages. Sleeping for 10 seconds...
Written 140000 messages. Sleeping for 10 seconds...
Written 150000 messages. Sleeping for 10 seconds...
Written 160000 messages. Sleeping for 10 seconds...
Written 170000 messages. Sleeping for 10 seconds...
Written 180000 messages. Sleeping for 10 seconds...
Written 190000 messages. Sleeping for 10 seconds...
Written 200000 messag

## Move file for later processing

In [None]:
# Move the exported CSV into the data directory of the detected environment.
# The move is done in Python so a failure raises instead of being followed
# by a misleading success message.
import shutil

if env == 'colab':
  file_path = '/content/drive/MyDrive/telegram-data/'
elif env == 'conda':
  file_path = './data/'
else:
  file_path = None

if file_path is None:
  print('Unable to detect suitable environment! No file moved')
else:
  os.makedirs(file_path, exist_ok=True)
  # An explicit destination file name lets an existing file of the same
  # name be replaced, as `mv` would do.
  shutil.move(csv_filename, os.path.join(file_path, os.path.basename(csv_filename)))
  print(f'Successfully moved {csv_filename} to {file_path}')

Successfully moved telegram_messages_20250519_042802.csv to /content/drive/MyDrive/telegram-data/


## Retrieve stored telegram messages

In [None]:
import pandas as pd

filename = 'telegram_messages_20250519_042802.csv'

# Identifier-like columns are read as strings so ids compare exactly;
# counters use the nullable Int64 dtype because they may be missing.
dtype = {
  'id': 'string',
  'date': 'string',
  'text': 'string',
  'sender_id': 'string',
  'chat_id': 'string',
  'reply_to_msg_id': 'string',
  'views': 'Int64',
  'forwards': 'Int64',
  'buttons': 'Int64',
  'raw_text': 'string',
  'message_link': 'string'
}

# Data directory of the detected environment (None when unsupported).
if env == 'colab':
  data_dir = '/content/drive/MyDrive/telegram-data'
elif env == 'conda':
  data_dir = './data'
else:
  data_dir = None

# Start from an empty frame so display() below always has a defined,
# non-stale value, including when the environment is unsupported.
messages_df = pd.DataFrame()

try:
  if data_dir is None:
    print('Unable to detect suitable environment! Nothing loaded to dataframe')
  else:
    file_path = f'{data_dir}/{filename}'
    messages_df = pd.read_csv(file_path, dtype=dtype)
    # Same format string that was used when the export was written.
    messages_df['date'] = pd.to_datetime(messages_df['date'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')
    print(f'Successfully read from {file_path} to dataframe')
except FileNotFoundError:
  messages_df = pd.DataFrame()
  print('File not found')
except Exception as e:
  messages_df = pd.DataFrame()
  print(f'Error reading file: {e}')

display(messages_df)

## Identify knowledge seeker and contributor

In [None]:
import pandas as pd
import networkx as nx

# Only messages that reply to another message can produce an edge.
network_df = messages_df.dropna(subset=['reply_to_msg_id'])

# Message id -> sender id, used to find the author of the replied-to message.
id_to_sender = messages_df.set_index('id')['sender_id'].to_dict()

# One (replier, original author) pair per reply. Replies to unknown messages,
# replies without a sender and self-replies are skipped.
edges = []
for reply_target, replier in zip(network_df['reply_to_msg_id'], network_df['sender_id']):
  original_sender = id_to_sender.get(reply_target)
  if pd.isna(original_sender) or pd.isna(replier) or original_sender == replier:
    continue
  edges.append((replier, original_sender))

# A DiGraph keeps a single edge per ordered pair, so degrees count distinct
# partners rather than number of replies.
G = nx.DiGraph()
G.add_edges_from(edges)

in_deg = dict(G.in_degree())
out_deg = dict(G.out_degree())

user_roles = (
  pd.DataFrame({
    'in_degree': pd.Series(in_deg),
    'out_degree': pd.Series(out_deg)
  })
  .fillna(0)
  .rename_axis('sender_id')
  .reset_index()
)

# NOTE(review): edges point from the replier to the author being replied to,
# so in_degree counts distinct users who replied to a user. Confirm that
# 'contributor' is meant to be the replied-to side.
user_roles['role'] = [
  'contributor' if n_in >= n_out else 'seeker'
  for n_in, n_out in zip(user_roles['in_degree'], user_roles['out_degree'])
]

messages_df['year'] = messages_df['date'].dt.year
messages_df['month'] = messages_df['date'].dt.month

# Drop role columns left by a previous run of this cell before merging, so
# re-running does not create suffixed duplicate columns.
messages_df = (
  messages_df
  .drop(columns=['in_degree', 'out_degree', 'role'], errors='ignore')
  .merge(user_roles, on='sender_id', how='left')
)

display(messages_df)


## Merging reply with original message

In [None]:
import pandas as pd

# Keep only messages that have text and a resolved role; rows missing any of
# these fields are removed from messages_df in place.
messages_df.dropna(
  subset=['text', 'raw_text', 'in_degree', 'out_degree', 'role'],
  inplace=True
)

# (chat_id, message id) -> full row as a dict, used to walk reply chains.
row_lookup = {}
for _, row in messages_df.iterrows():
  row_lookup[(row['chat_id'], row['id'])] = row.to_dict()

def collect_reply_chain(chat_id, msg_id, visited=None):
  """Return the reply chain that ends at the given message, root first.

  Starting at (chat_id, msg_id), follows 'reply_to_msg_id' links through
  row_lookup and collects each row found. The walk stops when a message has
  no parent, when the parent is not in row_lookup, or when a message was
  already visited (which guards against reply cycles).

  The walk is iterative so that very long chains cannot exceed the
  interpreter's recursion limit.

  Args:
    chat_id: chat identifier, first half of the row_lookup key.
    msg_id: id of the message the chain should end with.
    visited: optional set of (chat_id, msg_id) keys that are already
      consumed; it is updated in place.

  Returns:
    List of row dicts ordered from the oldest ancestor found to the message
    itself; empty when the message is unknown or already visited.
  """
  if visited is None:
    visited = set()

  chain = []
  current_id = msg_id
  while True:
    key = (chat_id, current_id)
    if key in visited:
      break
    visited.add(key)

    current_row = row_lookup.get(key)
    if current_row is None:
      break
    chain.append(current_row)

    reply_to_id = current_row.get('reply_to_msg_id')
    if pd.isna(reply_to_id):
      break
    current_id = reply_to_id

  # Rows were collected child-first; callers expect ancestors first.
  chain.reverse()
  return chain

def format_chain_sorted(chain):
  """Join the texts of a reply chain, oldest message first.

  Rows are ordered by their 'date' value and their 'text' values are joined
  with ' RT '. A missing or empty text contributes an empty string.
  """
  ordered = list(chain)
  ordered.sort(key=lambda entry: entry['date'])

  pieces = []
  for entry in ordered:
    pieces.append(str(entry.get('text') or ''))
  return ' RT '.join(pieces)

# For every message, collect its reply chain (ancestors plus the message
# itself) and store the joined text of that chain.
messages_df['merged_text'] = messages_df.apply(
  lambda msg: format_chain_sorted(
    collect_reply_chain(msg['chat_id'], msg['id'])
  ),
  axis=1
)

## Pre-processing functions

In [None]:
import unicodedata
import emoji
import re
import string
import indoNLP.preprocessing

def basic_clean(text):
  """Light normalisation of a raw message.

  Steps, in order: Unicode NFKD normalisation with combining marks dropped,
  emoji converted to words (language 'id'), HTML and URLs removed,
  @usernames removed, the standalone word 'rt' removed (any case), '#'
  characters removed, and whitespace collapsed.
  """
  # Decompose styled/accented characters, then drop the combining marks.
  decomposed = unicodedata.normalize('NFKD', text)
  text = ''.join(filter(lambda ch: not unicodedata.combining(ch), decomposed))

  # Replace each emoji by its name, delimited by spaces.
  text = emoji.demojize(text, delimiters=(' ', ' '), language='id')

  # Strip HTML first, then URLs.
  for strip_markup in (indoNLP.preprocessing.remove_html, indoNLP.preprocessing.remove_url):
    text = strip_markup(text)

  # Usernames, the 'rt' marker, and the hashtag symbol (the tag word stays).
  for pattern, flags in ((r'@\w+', 0), (r'\brt\b', re.IGNORECASE), (r'#', 0)):
    text = re.sub(pattern, '', text, flags=flags)

  return re.sub(r'\s+', ' ', text).strip()

def advanced_clean(text):
  """Aggressive normalisation applied to the output of basic_clean.

  Steps, in order: word elongation and slang replaced via indoNLP,
  lowercasing, ASCII punctuation removed, digits removed, stopwords removed
  via indoNLP, and whitespace collapsed.
  """
  preprocessing = indoNLP.preprocessing

  # Elongation is fixed before the slang replacement runs.
  text = preprocessing.replace_slang(preprocessing.replace_word_elongation(text))

  # Lowercase, then delete every character listed in string.punctuation.
  punctuation_table = str.maketrans('', '', string.punctuation)
  text = text.lower().translate(punctuation_table)

  text = re.sub(r'\d+', '', text)
  text = preprocessing.remove_stopwords(text)

  # Removals above can leave runs of spaces behind.
  return re.sub(r'\s+', ' ', text).strip()

## Run pre-processing

In [None]:
import swifter

# Discard rows that lack any field the cleaning steps or later cells use.
messages_df.dropna(
  subset=['text', 'raw_text', 'in_degree', 'out_degree', 'role', 'merged_text'],
  inplace=True
)

# basic_clean runs on the merged reply chain; advanced_clean runs on the
# output of basic_clean.
for _source, _target, _cleaner in (
  ('merged_text', 'basic_clean', basic_clean),
  ('basic_clean', 'advanced_clean', advanced_clean),
):
  messages_df[_target] = messages_df[_source].swifter.apply(_cleaner)

messages_df.dropna(subset=['basic_clean', 'advanced_clean'], inplace=True)

display(messages_df)

## Filter out text with 5 words or fewer

In [None]:
# Number of whitespace-separated tokens left after advanced cleaning.
word_counts = messages_df['advanced_clean'].str.split().str.len()

# Keep messages with strictly more than 5 tokens; .copy() gives later cells
# an independent frame to add columns to.
telegram_df = messages_df.loc[word_counts > 5].copy()

display(telegram_df)

## Retrieving language model

In [None]:
import os
import fasttext
import urllib.request

filename = 'lid.176.bin'
model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
model_path = ''

# Load the fastText language-identification model, downloading it into the
# environment's model directory the first time it is needed.
try:
  if env == 'colab':
    model_dir = '/content/drive/MyDrive/models'
  elif env == 'conda':
    model_dir = './models'
  else:
    raise EnvironmentError('Unable to detect suitable environment!')

  os.makedirs(model_dir, exist_ok=True)
  model_path = os.path.join(model_dir, filename)

  if not os.path.exists(model_path):
    print('Model not found locally. Downloading...')
    # Download under a temporary name and rename only after success, so an
    # interrupted download is never mistaken for a cached model next time.
    partial_path = model_path + '.part'
    try:
      urllib.request.urlretrieve(model_url, partial_path)
      os.replace(partial_path, model_path)
    finally:
      if os.path.exists(partial_path):
        os.remove(partial_path)
    print('Download completed!')

  print(f'Loading model from {model_path}...')
  model = fasttext.load_model(model_path)
  print('Model loaded successfully!')

except Exception as e:
  # Best effort: report the problem and let the reader decide how to proceed.
  print(f'Error loading FastText model: {e}')

## Identify language

In [None]:
texts = telegram_df['advanced_clean'].tolist()

# predictions[0] holds the labels per text, predictions[1] the matching
# scores; only the first entry of each is used.
predictions = model.predict(texts)
predicted_labels, predicted_scores = predictions[0], predictions[1]

telegram_df['lang_detected'] = [
  labels[0].replace('__label__', '') for labels in predicted_labels
]
telegram_df['lang_confidence'] = [
  float(scores[0]) for scores in predicted_scores
]

# Per-language summary of the confidence scores, most frequent language first.
confidence_by_lang = telegram_df.groupby('lang_detected')['lang_confidence']
lang_stats = confidence_by_lang.agg(
  count='count',
  min='min',
  q25=lambda x: x.quantile(0.25),
  mean='mean',
  median='median',
  q90=lambda x: x.quantile(0.9),
  max='max'
)
lang_stats = lang_stats.reset_index().sort_values(by='count', ascending=False)

display(lang_stats)

## Filter text with Bahasa Indonesia

In [None]:
# Keep only messages whose detected language code is 'id'; the confidence
# score is not used as a filter here.
is_indonesian = telegram_df['lang_detected'] == 'id'
filtered_df = telegram_df[is_indonesian].copy()

display(filtered_df)

## Find similarity with KNN

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Represent every cleaned message as a TF-IDF vector. Terms found in fewer
# than 5 documents or in more than 80% of documents are ignored.
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.8)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_df['advanced_clean'])

# Never request more neighbours than there are documents.
nn = NearestNeighbors(metric='cosine', n_neighbors=min(10, tfidf_matrix.shape[0]))
nn.fit(tfidf_matrix)

distances, indices = nn.kneighbors(tfidf_matrix)

# Two messages are treated as duplicates when cosine similarity >= threshold.
threshold = 0.95
to_remove = set()

for i, (dists, neighbors) in enumerate(zip(distances, indices)):
  for dist, idx in zip(dists, neighbors):
    # Skip the document itself by index. With exact duplicates the distances
    # tie, so the document is not guaranteed to be its own first neighbour.
    if idx == i:
      continue
    sim = 1 - dist
    if sim >= threshold:
      # Of each duplicate pair, drop the one that appears later in the frame.
      to_remove.add(max(i, int(idx)))

# to_remove holds row positions; translate them to index labels for drop().
unique_df = filtered_df.drop(filtered_df.index[sorted(to_remove)]).reset_index(drop=True)

display(unique_df)

## Store to local file

In [None]:
from datetime import datetime

import shutil

# Drop the timezone information (the local wall-clock time is kept) before
# the frame is written to Excel.
unique_df['date'] = unique_df['date'].dt.tz_localize(None)

# The run timestamp is embedded in the file name so each export is kept.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
excel_filename = f'filtered_telegram_messages_{timestamp}.xlsx'
unique_df.to_excel(excel_filename, index=False)

if env == 'colab':
  file_path = '/content/drive/MyDrive/telegram-data/'
elif env == 'conda':
  file_path = './data/'
else:
  file_path = None

if file_path is None:
  print('Unable to detect suitable environment!')
else:
  os.makedirs(file_path, exist_ok=True)
  # Moving in Python raises on failure, so the success message below is
  # only printed when the file actually arrived at its destination.
  shutil.move(excel_filename, os.path.join(file_path, os.path.basename(excel_filename)))
  print(f'Successfully moved {excel_filename} to {file_path}')

## Retrieve filtered messages

In [None]:
import pandas as pd

filename = 'filtered_telegram_messages_20250521_040024.xlsx'

# Data directory of the detected environment (None when unsupported).
if env == 'colab':
  data_dir = '/content/drive/MyDrive/telegram-data'
elif env == 'conda':
  data_dir = './data'
else:
  data_dir = None

try:
  if data_dir is None:
    print('Unable to detect suitable environment!')
  else:
    file_path = f'{data_dir}/{filename}'
    unique_df = pd.read_excel(file_path)
    # The workbook was written from timezone-naive datetimes, so the CSV
    # format string (which includes %z) does not describe this column.
    # Let pandas infer the format so text values are not all coerced to NaT.
    unique_df['date'] = pd.to_datetime(unique_df['date'], errors='coerce')
    print(f'Successfully read from {file_path} to dataframe')
except FileNotFoundError:
  unique_df = pd.DataFrame()
  print('File not found')
except Exception as e:
  unique_df = pd.DataFrame()
  print(f'Error reading file: {e}')

display(unique_df)

## Run LDA topic modeling

In [8]:
from collections import defaultdict
from gensim import corpora, models
from gensim.models import CoherenceModel
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import os
import json

# Train one LDA model per (month, year) group of unique_df, label every
# document with its dominant topic, and write one pyLDAvis JSON file per
# group plus a metadata.json with the most active senders per role.
os.makedirs('lda_jsons', exist_ok=True)

# monthly_yearly_topics[month][year] -> topics, coherence and top senders.
# lda_objects[month][year] -> trained model with its corpus and dictionary.
# metadata_dict['<month>_<year>'] -> top senders, written to metadata.json.
monthly_yearly_topics = defaultdict(dict)
lda_objects = defaultdict(dict)
metadata_dict = {}

# Reset the output columns; rows skipped below keep None.
unique_df['assigned_topic'] = None
unique_df['topic_keywords'] = None

for (month, year), group in unique_df.groupby(['month', 'year']):
  tokenized_docs_series = group['advanced_clean'].apply(str.split)

  # Keep the original index next to each document so results can be written
  # back to the matching rows of unique_df; drop documents without tokens.
  indexed_docs = list(zip(group.index, tokenized_docs_series))
  indexed_docs = [(idx, doc) for idx, doc in indexed_docs if len(doc) > 0]

  # Groups with fewer than 5 documents are skipped.
  if len(indexed_docs) < 5:
    continue

  indices, tokenized_docs = zip(*indexed_docs)

  # Vocabulary for this group, pruned with no_below=5 and no_above=0.85.
  dictionary = corpora.Dictionary(tokenized_docs)
  dictionary.filter_extremes(no_below=5, no_above=0.85)

  corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

  # Drop documents whose bag-of-words is empty after vocabulary pruning.
  filtered = [(idx, doc_bow, doc) for idx, doc_bow, doc in zip(indices, corpus, tokenized_docs) if len(doc_bow) > 0]
  if not filtered:
    continue

  indices, corpus, tokenized_docs = zip(*filtered)

  # The same fixed number of topics is used for every group.
  num_topics = 5

  lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
  )

  # Topic id -> comma-separated string of its top 5 words.
  topic_id_to_keywords = {
    topic_id: ', '.join([word for word, _ in lda_model.show_topic(topic_id, topn=5)])
    for topic_id in range(num_topics)
  }

  dominant_topics = []
  topic_keywords = []

  # The dominant topic of a document is the one with the highest probability.
  for doc_bow in corpus:
    topic_probs = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
    dominant_topics.append(dominant_topic)
    topic_keywords.append(topic_id_to_keywords[dominant_topic])

  # Write the labels back to exactly the rows that took part in this model.
  unique_df.loc[list(indices), 'assigned_topic'] = dominant_topics
  unique_df.loc[list(indices), 'topic_keywords'] = topic_keywords

  topics = [
    {
      'topic_id': topic_id,
      'top_words': topic_id_to_keywords[topic_id].split(', ')
    }
    for topic_id in range(num_topics)
  ]

  # c_v coherence computed on the tokenized documents of this group.
  coherence_model = CoherenceModel(
    model=lda_model,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'
  )

  monthly_yearly_topics[month][year] = {
    'topics': topics,
    'coherence': coherence_model.get_coherence()
  }

  lda_objects[month][year] = {
    'model': lda_model,
    'corpus': corpus,
    'dictionary': dictionary
  }

  group_with_topics = unique_df.loc[list(indices)]

  # Number of modelled messages per sender and role within this group.
  role_counts = group_with_topics.groupby(['sender_id', 'role']).size().reset_index(name='count')

  # The 3 most active senders for each role.
  top_count = 3
  top_contributor = role_counts[role_counts['role'] == 'contributor'].sort_values('count', ascending=False).head(top_count)
  top_seeker = role_counts[role_counts['role'] == 'seeker'].sort_values('count', ascending=False).head(top_count)

  top_contributor_info = top_contributor[['sender_id', 'count']].to_dict('records')
  top_seeker_info = top_seeker[['sender_id', 'count']].to_dict('records')

  monthly_yearly_topics[month][year]['top_contributor'] = top_contributor_info
  monthly_yearly_topics[month][year]['top_seeker'] = top_seeker_info

  metadata_dict[f'{month}_{year}'] = {
    'top_contributor': top_contributor_info,
    'top_seeker': top_seeker_info
  }

  # Serialise the pyLDAvis data for this group to its own JSON file.
  vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
  vis_json = vis_data.to_json()

  json_filename = f'lda_jsons/lda_{month}_{year}.json'
  with open(json_filename, 'w') as f:
    f.write(vis_json)

# Written once, after all groups have been processed.
with open('lda_jsons/metadata.json', 'w') as f:
  json.dump(metadata_dict, f, indent=2)

## Move LDA JSON

In [None]:
import shutil

# Directory produced by the LDA cell.
current_dir = './lda_jsons'

if env == 'colab':
  target_dir = '/content/drive/MyDrive/interactive-lda/'
elif env == 'conda':
  target_dir = './interactive-lda/'
else:
  target_dir = None

if target_dir is None:
  print('Unable to detect suitable environment!')
else:
  os.makedirs(target_dir, exist_ok=True)
  destination = os.path.join(target_dir, os.path.basename(os.path.normpath(current_dir)))
  # Copy into the destination (merging with files left by an earlier run)
  # and remove the source afterwards. Any failure raises, so the success
  # message is only printed when the files really were moved.
  shutil.copytree(current_dir, destination, dirs_exist_ok=True)
  shutil.rmtree(current_dir)
  print(f'Successfully moved {current_dir} to {target_dir}')