<a href="https://colab.research.google.com/github/khalidjasir/ui-km-paper/blob/main/KM_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install necessary libraries

In [2]:
# Core: pandas provides the DataFrame used by every later stage
%pip install pandas

# Utility: .env loading (python-dotenv), time zones (pytz), `.swifter.apply` (swifter),
# Excel export engine (openpyxl); tqdm/ipywidgets are presumably for progress bars — TODO confirm
%pip install python-dotenv
%pip install pytz
%pip install swifter
%pip install tqdm
%pip install ipywidgets
%pip install openpyxl

# Telegram Scraper: client used to download the channel history
%pip install telethon

# NLP: emoji removal, Indonesian text normalisation (indoNLP), language identification (fastText)
%pip install emoji
%pip install indonlp
%pip install fasttext

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0
Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16505 sha256=89e8838abfac3933d2e80cbfa9587d35fcfb5d3e0d12532e4a5d27dd66ce3574
  Stored in directory: /root/.cache/pip/wheels/ef/7f/bd/9bed48f078f3ee1fa75e0b29b6e0335ce1cb03a38d3443b3a3
Successfully built swifter
Installing collected packages: swifter
Successfully installed swifter-1.4.0
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywid

## Mount Google Drive

In [3]:
# Mount Google Drive only when the Colab runtime is available, so the notebook
# can still be run top to bottom in the conda environment it also supports.
try:
  from google.colab import drive
except ImportError:
  drive = None
  print('Not running on Google Colab; skipping Google Drive mount.')
else:
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Function to detect environment

In [4]:
import os
import sys

def detect_environment():
  """Return 'colab', 'conda' or 'local' depending on where the kernel runs.

  Colab is recognised by a successful `google.colab` import; conda by the
  CONDA_PREFIX variable or a `conda-meta` folder under `sys.prefix`.
  """
  try:
    import google.colab
  except ImportError:
    in_conda = (
      'CONDA_PREFIX' in os.environ
      or os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
    )
    return 'conda' if in_conda else 'local'
  else:
    return 'colab'

# Environment label consulted by the later cells.
env = detect_environment()

## Handle Key

In [None]:
# Telegram API credentials; they stay empty when no supported environment is found.
api_id = ''
api_hash = ''

# Pick the lookup function that matches the environment.
secret_reader = None
if env == 'colab':
  # Colab: values come from the notebook's user secrets.
  from google.colab import userdata
  secret_reader = userdata.get
elif env == 'conda':
  # conda: values come from the process environment after loading a .env file.
  from dotenv import load_dotenv
  load_dotenv()
  secret_reader = os.getenv

if secret_reader is None:
  print('Unable to detect suitable environment!')
else:
  api_id = secret_reader('api_id')
  api_hash = secret_reader('api_hash')

## Retrieve from telegram

In [None]:
import csv
import time
from datetime import datetime
from telethon import TelegramClient
from pytz import timezone

session_name = 'my_session'
channel_input = 'https://t.me/diskusipajak'
wib_timezone = timezone('Asia/Jakarta')
start_date = wib_timezone.localize(datetime(2015, 1, 1))
end_date = wib_timezone.localize(datetime(2025, 4, 30))
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = f'telegram_messages_{timestamp}.csv'
batch_size = 10000

async def main():
  async with TelegramClient(session_name, api_id, api_hash) as client:
    channel = await client.get_entity(channel_input)

    buffer = []
    total_count = 0

    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
      writer = csv.writer(file, quoting=csv.QUOTE_ALL)
      writer.writerow([
        'id',
        'date',
        'text',
        'sender_id',
        'chat_id',
        'reply_to_msg_id',
        'views',
        'forwards',
        'buttons',
        'raw_text',
        'message_link'
      ])

    async for msg in client.iter_messages(channel, offset_date=start_date, reverse=True):
      if msg.date > end_date:
        break

      row = [
        msg.id,
        msg.date.astimezone(wib_timezone).strftime('%a %b %d %H:%M:%S %z %Y') if msg.date else "",
        msg.text.replace('\n', ' ').strip() if msg.text else "",
        msg.sender_id if msg.sender_id else '',
        getattr(msg, 'chat_id', getattr(msg.to_id, 'channel_id', '')),
        msg.reply_to_msg_id if msg.reply_to_msg_id else '',
        msg.views if msg.views is not None else '',
        msg.forwards if msg.forwards is not None else '',
        len(msg.buttons) if msg.buttons else 0,
        msg.raw_text.replace('\n', ' ').strip() if msg.raw_text else '',
        f'https://t.me/{channel.username}/{msg.id}'
      ]

      buffer.append(row)
      total_count += 1

      if total_count % batch_size == 0:
        with open(csv_filename, mode='a', newline='', encoding='utf-8') as file:
          writer = csv.writer(file, quoting=csv.QUOTE_ALL)
          writer.writerows(buffer)

        buffer.clear()
        print(f'Written {total_count} messages. Sleeping for 10 seconds...')
        time.sleep(10)

    if buffer:
      with open(csv_filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL)
        writer.writerows(buffer)
      print(f'Final batch written. Total messages: {total_count}')

    print(f'Done! Messages saved to {csv_filename}.')

await main()

## Move file for later processing

In [None]:
import os
import shutil

# Destination folder per environment. Pure-Python file operations raise on
# failure, so the success message is only printed when the move really happened.
destination_dirs = {
  'colab': '/content/drive/MyDrive/telegram-data/',
  'conda': './data/'
}

if env in destination_dirs:
  file_path = destination_dirs[env]
  os.makedirs(file_path, exist_ok=True)
  shutil.move(csv_filename, file_path)
  print(f'Successfully moved {csv_filename} to {file_path}')
else:
  print('Unable to detect suitable environment! No file moved')

## Function to detect environment

In [None]:
import os
import sys

def detect_environment():
  """Classify the runtime as 'colab', 'conda' or 'local'."""
  try:
    import google.colab
    return 'colab'
  except ImportError:
    # Not on Colab: look for the usual conda markers instead.
    conda_meta_dir = os.path.join(sys.prefix, 'conda-meta')
    if os.environ.get('CONDA_PREFIX') is not None or os.path.exists(conda_meta_dir):
      return 'conda'
  return 'local'

# Recomputed here so this part of the notebook can start from a fresh kernel.
env = detect_environment()

## Retrieve stored telegram messages

In [None]:
import pandas as pd

filename = 'telegram_messages_20250519_042802.csv'

# Counters are read as nullable integers; every other column is read as text.
nullable_int_columns = {'views', 'forwards', 'buttons'}
dtype = {
  column: 'Int64' if column in nullable_int_columns else 'string'
  for column in (
    'id', 'date', 'text', 'sender_id', 'chat_id', 'reply_to_msg_id',
    'views', 'forwards', 'buttons', 'raw_text', 'message_link'
  )
}

try:
  if env not in ('colab', 'conda'):
    raise EnvironmentError('Unable to detect suitable environment!')

  data_dir = '/content/drive/MyDrive/telegram-data' if env == 'colab' else './data'
  file_path = f'{data_dir}/{filename}'

  messages_df = pd.read_csv(file_path, dtype=dtype)

  # Parse the export's date format; unparseable values become NaT.
  parsed_dates = pd.to_datetime(
    messages_df['date'],
    format='%a %b %d %H:%M:%S %z %Y',
    errors='coerce'
  )
  # Drop the UTC offset while keeping the wall-clock time.
  messages_df['date'] = parsed_dates.apply(
    lambda stamp: stamp.replace(tzinfo=None) if pd.notnull(stamp) else stamp
  )
  print(f'Successfully read from {file_path} to dataframe')
except FileNotFoundError:
  messages_df = pd.DataFrame()
  print('File not found')
except Exception as e:
  messages_df = pd.DataFrame()
  print(f'Error reading file: {e}')

display(messages_df)

## Analyze data distribution

In [None]:
import matplotlib.pyplot as plt

def get_semester(date):
  """Label a timestamp '<year>-S1' (Jan-Jun) or '<year>-S2' (Jul-Dec); null dates give None."""
  if pd.isnull(date):
    return None
  half = 'S1' if date.month <= 6 else 'S2'
  return f'{date.year}-{half}'

message_dates = messages_df['date']

# Message counts per month and per quarter, indexed by the start of each period.
monthly_counts = messages_df.groupby(message_dates.dt.to_period('M')).size()
monthly_counts.index = monthly_counts.index.to_timestamp()

quarterly_counts = messages_df.groupby(message_dates.dt.to_period('Q')).size()
quarterly_counts.index = quarterly_counts.index.to_timestamp()

# Message counts per semester label.
messages_df['semester'] = message_dates.apply(get_semester)
semesterly_counts = messages_df.groupby('semester').size()

plt.figure(figsize=(12, 6))

# Only the quarterly series is drawn; the other two are kept disabled.
# monthly_counts.sort_index().plot(label='Monthly')
quarterly_counts.sort_index().plot(label='Quarterly')
# semesterly_counts.sort_index().plot(label='Semesterly')

plt.title('Message Volume Over Time')
plt.xlabel('Time')
plt.ylabel('Number of Messages')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## Preprocessing functions

In [None]:
import unicodedata
import emoji
import re
import string
from indoNLP.preprocessing import pipeline, remove_html, remove_url, replace_slang, replace_word_elongation

# Pre-compiled regex patterns
USERNAME_RE = re.compile(r'@\w+')
RT_RE = re.compile(r'\brt\b', flags=re.IGNORECASE)
HASHTAG_RE = re.compile(r'#')
DIGIT_RE = re.compile(r'\d+')
WHITESPACE_RE = re.compile(r'\s+')

# Pre-compiled translation
PUNCT_TRANSLATOR = str.maketrans('', '', string.punctuation)

def fast_clean(text):
  """Normalise one message into lowercase text without noise tokens.

  Steps, in order: Unicode compatibility normalisation, removal of combining
  marks, lowercasing, removal of @usernames, the token 'rt', '#' signs,
  digits, ASCII punctuation and emojis, then whitespace collapsing.

  Args:
    text: the message text as a str.

  Returns:
    The cleaned text; it may be an empty string.
  """
  # Normalize Unicode (remove fancy fonts, underlines)
  text = unicodedata.normalize('NFKD', text)
  text = ''.join(c for c in text if not unicodedata.combining(c))

  # Case folding to lowercase. This must follow the normalisation: styled
  # capitals such as mathematical bold letters have no lowercase mapping of
  # their own and only become ordinary 'A'-'Z' after NFKD.
  text = text.lower()

  # Remove usernames
  text = USERNAME_RE.sub('', text)

  # Remove RT
  text = RT_RE.sub('', text)

  # Remove hashtag symbol but keep the word
  text = HASHTAG_RE.sub('', text)

  # Remove digits
  text = DIGIT_RE.sub('', text)

  # Remove punctuation
  text = text.translate(PUNCT_TRANSLATOR)

  # Remove emojis
  text = emoji.replace_emoji(text, replace='')

  # Remove extra whitespace
  text = WHITESPACE_RE.sub(' ', text).strip()

  return text

## Run preprocessing

In [None]:
import swifter

# Rows without message text cannot be cleaned.
messages_df.dropna(subset=[
  'text',
  'raw_text'
], inplace=True)

# HTML tags and URLs have to be removed from the original text: fast_clean
# strips the punctuation ('<', '>', ':', '/', '.') that these removers need
# in order to recognise them.
markup_pipeline = pipeline([
  remove_html,
  remove_url
])

messages_df['fast_clean'] = messages_df['text'].swifter.apply(
  lambda text: fast_clean(markup_pipeline(text))
)

# Word-level normalisation runs on the already cleaned text.
indonlp_pipeline = pipeline([
  replace_slang,
  replace_word_elongation
])

messages_df['basic_clean'] = messages_df['fast_clean'].swifter.apply(indonlp_pipeline)

# Drop rows for which any cleaning stage produced a missing value.
messages_df.dropna(subset=[
  'text',
  'raw_text',
  'fast_clean',
  'basic_clean'
], inplace=True)

display(messages_df)

## Filter out text with 5 words or fewer

In [None]:
# Keep only messages whose cleaned text has more than five whitespace-separated words.
word_counts = messages_df['basic_clean'].str.split().str.len()
messages_df = messages_df.loc[word_counts > 5].copy()

display(messages_df)

## Analyze data distribution

In [None]:
import matplotlib.pyplot as plt

def count_by_period(freq):
  """Count messages per calendar period ('M', 'Q', ...), indexed by period start."""
  counts = messages_df.groupby(messages_df['date'].dt.to_period(freq)).size()
  counts.index = counts.index.to_timestamp()
  return counts

def get_semester(date):
  """Return '<year>-S1' for January-June, '<year>-S2' otherwise, None for null dates."""
  if pd.isnull(date):
    return None
  if date.month <= 6:
    return f'{date.year}-S1'
  return f'{date.year}-S2'

monthly_counts = count_by_period('M')
quarterly_counts = count_by_period('Q')

messages_df['semester'] = messages_df['date'].apply(get_semester)
semesterly_counts = messages_df.groupby('semester').size()

plt.figure(figsize=(12, 6))

# Quarterly volume only; the monthly and semesterly series stay disabled.
# monthly_counts.sort_index().plot(label='Monthly')
quarterly_counts.sort_index().plot(label='Quarterly')
# semesterly_counts.sort_index().plot(label='Semesterly')

plt.title('Message Volume Over Time')
plt.xlabel('Time')
plt.ylabel('Number of Messages')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## Retrieving language model

In [None]:
import os
import fasttext
import urllib.request

filename = 'lid.176.bin'
model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
model_path = ''
# Reset first so a failed load cannot leave a model from an earlier run in use.
model = None

try:
  if env == 'colab':
    model_dir = '/content/drive/MyDrive/models/fasttext'
  elif env == 'conda':
    model_dir = './models/fasttext'
  else:
    raise EnvironmentError('Unable to detect suitable environment!')

  os.makedirs(model_dir, exist_ok=True)
  model_path = os.path.join(model_dir, filename)

  if not os.path.exists(model_path):
    print('Model not found locally. Downloading...')
    # Download to a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated file that later runs
    # would mistake for the complete model.
    partial_path = f'{model_path}.part'
    try:
      urllib.request.urlretrieve(model_url, partial_path)
      os.replace(partial_path, model_path)
    finally:
      if os.path.exists(partial_path):
        os.remove(partial_path)
    print('Download completed!')

  print(f'Loading model from {model_path}...')
  model = fasttext.load_model(model_path)
  print('Model loaded successfully!')

except Exception as e:
  print(f'Error loading FastText model: {e}')

## Identify language

In [None]:
# Predict the language of every cleaned message in one call.
texts = messages_df['basic_clean'].tolist()
predictions = model.predict(texts)
predicted_labels, predicted_scores = predictions[0], predictions[1]

# Keep the first label and score of each prediction, without the '__label__' prefix.
messages_df['lang_detected'] = [
  top_labels[0].replace('__label__', '') for top_labels in predicted_labels
]
messages_df['lang_confidence'] = [
  float(top_scores[0]) for top_scores in predicted_scores
]

# Confidence summary per detected language, most frequent language first.
confidence_by_lang = messages_df.groupby('lang_detected')['lang_confidence']
lang_stats = confidence_by_lang.agg(
  count='count',
  min='min',
  q25=lambda values: values.quantile(0.25),
  mean='mean',
  median='median',
  q90=lambda values: values.quantile(0.9),
  max='max'
)
lang_stats = lang_stats.reset_index().sort_values(by='count', ascending=False)

display(lang_stats)

## Filter text with Bahasa Indonesia

In [None]:
# Keep only the messages whose detected language code is 'id'.
is_indonesian = messages_df['lang_detected'] == 'id'
messages_df = messages_df.loc[is_indonesian].copy()

display(messages_df)

## Analyze data distribution

In [None]:
import matplotlib.pyplot as plt

def get_semester(date):
  """Semester label of a timestamp: '<year>-S1' up to June, '<year>-S2' after; None if null."""
  if pd.isnull(date):
    return None
  return f'{date.year}-S1' if date.month <= 6 else f'{date.year}-S2'

# Monthly and quarterly message counts, re-indexed from periods to timestamps.
monthly_periods = messages_df['date'].dt.to_period('M')
monthly_counts = messages_df.groupby(monthly_periods).size()
monthly_counts.index = monthly_counts.index.to_timestamp()

quarterly_periods = messages_df['date'].dt.to_period('Q')
quarterly_counts = messages_df.groupby(quarterly_periods).size()
quarterly_counts.index = quarterly_counts.index.to_timestamp()

# Semester labels and their message counts.
messages_df['semester'] = messages_df['date'].apply(get_semester)
semesterly_counts = messages_df.groupby('semester').size()

fig, ax = plt.subplots(figsize=(12, 6))

# monthly_counts.sort_index().plot(label='Monthly')
quarterly_counts.sort_index().plot(ax=ax, label='Quarterly')
# semesterly_counts.sort_index().plot(label='Semesterly')

ax.set_title('Message Volume Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Number of Messages')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

## Store language filtered dataframe to local file

In [None]:
import os
import shutil
from datetime import datetime

# Strip any timezone from the dates before exporting to Excel.
messages_df['date'] = messages_df['date'].dt.tz_localize(None)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
excel_filename = f'language_filtered_telegram_messages_{timestamp}.xlsx'
messages_df.to_excel(excel_filename, index=False)

# Destination folder per environment. shutil/os raise on failure, so the
# success message is only printed when the file was really moved.
destination_dirs = {
  'colab': '/content/drive/MyDrive/telegram-data/',
  'conda': './data/'
}

if env in destination_dirs:
  file_path = destination_dirs[env]
  os.makedirs(file_path, exist_ok=True)
  shutil.move(excel_filename, file_path)
  print(f'Successfully moved {excel_filename} to {file_path}')
else:
  print('Unable to detect suitable environment!')

## Install necessary libraries for POS tagging and stemming

In [None]:
# Core
%pip install numpy==1.25.2
%pip install pandas==1.5.3
%pip install scipy==1.10.1
%pip install scikit-learn==1.2.2
%pip install torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
%pip install torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118

# NLP
%pip install flair==0.12.2

## Train POS tagging

In [None]:
from flair.datasets import UD_INDONESIAN
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Corpus used for training: flair's UD_INDONESIAN dataset.
corpus = UD_INDONESIAN()

# Train on the 'upos' tag layer and build the label dictionary from the corpus.
tag_type = 'upos'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Word embeddings selected by the identifier 'id' (presumably Indonesian — TODO confirm).
embedding = WordEmbeddings('id')

# Sequence tagger with a hidden size of 256 and a CRF layer enabled.
tagger = SequenceTagger(
  hidden_size=256,
  embeddings=embedding,
  tag_dictionary=tag_dictionary,
  tag_type=tag_type,
  use_crf=True
)

trainer = ModelTrainer(tagger, corpus)

# Training output goes to the 'pos-id-model' folder, which the
# "Store POS tagger model" step later copies to Drive.
trainer.train(
  base_path='pos-id-model',
  learning_rate=0.1,
  mini_batch_size=32,
  max_epochs=10
)

## Function to detect environment

In [None]:
import os
import sys

def detect_environment():
  """Tell whether the notebook runs on Colab ('colab'), in conda ('conda') or elsewhere ('local')."""
  try:
    import google.colab
  except ImportError:
    pass
  else:
    return 'colab'

  # Either conda marker is enough.
  if 'CONDA_PREFIX' in os.environ:
    return 'conda'
  if os.path.exists(os.path.join(sys.prefix, 'conda-meta')):
    return 'conda'

  return 'local'

# Recomputed here so this part of the notebook can start from a fresh kernel.
env = detect_environment()

## Store POS tagger model

In [None]:
import shutil

if env == 'colab':
  dir_name = 'pos-id-model'
  dir_source = f'/content/{dir_name}'
  dir_target = f'/content/drive/MyDrive/models/{dir_name}'
  # copytree creates missing parent folders. With dirs_exist_ok it overwrites
  # into an existing target instead of nesting a second 'pos-id-model' inside
  # it, and it raises if the source is missing, so success is never reported
  # for a failed copy.
  shutil.copytree(dir_source, dir_target, dirs_exist_ok=True)
  print(f'Successfully copied {dir_name} to {dir_target}')

## Retrieve POS tagger model from drive

In [None]:
import shutil

if env == 'colab':
  dir_name = 'pos-id-model'
  dir_source = f'/content/drive/MyDrive/models/{dir_name}'
  dir_target = f'/content/{dir_name}'
  # copytree creates missing parent folders. With dirs_exist_ok it overwrites
  # into an existing target instead of nesting a second 'pos-id-model' inside
  # it, and it raises if the source is missing, so success is never reported
  # for a failed copy.
  shutil.copytree(dir_source, dir_target, dirs_exist_ok=True)
  print(f'Successfully copied {dir_name} to {dir_target}')

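## Predict POS tagging

> **Note:** the code cell for this step is missing from the notebook. The next section reads `messages_df['pos_tags']`, which no visible cell creates, so the tagger predictions have to be produced here before the notebook can run from top to bottom.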

## Filter relevant POS and then stem

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Only these part-of-speech tags are kept.
allowed_tags = {'NOUN', 'VERB', 'ADJ'}

def filter_and_stem(tagged_tokens):
  """Stem the words whose tag is in `allowed_tags`.

  Args:
    tagged_tokens: a list (or tuple) of (word, tag) pairs; any other value,
      such as a missing value, is treated as "no tokens".

  Returns:
    A list of stems longer than two characters, in input order.
  """
  if not isinstance(tagged_tokens, (list, tuple)):
    return []

  stemmed = []
  for word, tag in tagged_tokens:
    if tag not in allowed_tags:
      continue
    stemmed_word = stemmer.stem(word)
    if stemmed_word and len(stemmed_word) > 2:
      stemmed.append(stemmed_word)
  return stemmed

if 'pos_tags' not in messages_df.columns:
  raise KeyError("messages_df has no 'pos_tags' column; run the POS prediction step first")

messages_df['stemmed_tokens'] = messages_df['pos_tags'].apply(filter_and_stem)
# Copy the filtered frame so that later column assignments do not write to a
# slice of the previous frame (SettingWithCopyWarning).
messages_df = messages_df[
  messages_df['stemmed_tokens'].apply(lambda x: isinstance(x, list) and len(x) > 0)
].copy()

## Analyze data distribution

In [None]:
import matplotlib.pyplot as plt

def get_semester(date):
  """Give the semester label ('<year>-S1' or '<year>-S2') of a timestamp, or None when it is null."""
  if pd.isnull(date):
    return None
  semester_number = 1 if date.month <= 6 else 2
  return f'{date.year}-S{semester_number}'

# Counts per month and per quarter; to_timestamp() moves each period to its start date.
monthly_counts = messages_df.groupby(messages_df['date'].dt.to_period('M')).size().to_timestamp()
quarterly_counts = messages_df.groupby(messages_df['date'].dt.to_period('Q')).size().to_timestamp()

# Counts per semester label.
messages_df['semester'] = messages_df['date'].apply(get_semester)
semesterly_counts = messages_df.groupby('semester').size()

axes = plt.figure(figsize=(12, 6)).gca()

# monthly_counts.sort_index().plot(label='Monthly')
quarterly_counts.sort_index().plot(ax=axes, label='Quarterly')
# semesterly_counts.sort_index().plot(label='Semesterly')

axes.set_title('Message Volume Over Time')
axes.set_xlabel('Time')
axes.set_ylabel('Number of Messages')
axes.legend()
axes.grid(True)
plt.tight_layout()
plt.show()

## Store POS tagged and filtered messages

In [None]:
import os
import shutil
from datetime import datetime

# Strip any timezone from the dates before exporting to Excel.
messages_df['date'] = messages_df['date'].dt.tz_localize(None)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
excel_filename = f'pos_tagged_telegram_messages_{timestamp}.xlsx'
messages_df.to_excel(excel_filename, index=False)

# Destination folder per environment. shutil/os raise on failure, so the
# success message is only printed when the file was really moved.
destination_dirs = {
  'colab': '/content/drive/MyDrive/telegram-data/',
  'conda': './data/'
}

if env in destination_dirs:
  file_path = destination_dirs[env]
  os.makedirs(file_path, exist_ok=True)
  shutil.move(excel_filename, file_path)
  print(f'Successfully moved {excel_filename} to {file_path}')
else:
  print('Unable to detect suitable environment!')

## Retrieve POS tagged and filtered messages

In [None]:
import pandas as pd

# Name of the Excel file written by the "Store POS tagged and filtered messages" step.
filename = ''

# Folder holding the stored Excel files, per environment.
data_dirs = {
  'colab': '/content/drive/MyDrive/telegram-data',
  'conda': './data'
}

try:
  if env not in data_dirs:
    raise EnvironmentError('Unable to detect suitable environment!')
  if not filename:
    raise ValueError('`filename` is empty; set it to the stored Excel file name')

  file_path = f'{data_dirs[env]}/{filename}'
  messages_df = pd.read_excel(file_path)
  # The Excel export holds real datetimes, not the Telegram CSV text format.
  # Parsing without that format leaves datetimes untouched and still handles
  # text dates, which the CSV format string would silently coerce to NaT.
  messages_df['date'] = pd.to_datetime(messages_df['date'], errors='coerce')
  print(f'Successfully read from {file_path} to dataframe')
except FileNotFoundError:
  messages_df = pd.DataFrame()
  print(f'File not found')
except Exception as e:
  messages_df = pd.DataFrame()
  print(f'Error reading file: {e}')

display(messages_df)

## Install necessary libraries for topic modeling

In [None]:
# Network Analysis
%pip install networkx

# Ploting
%pip install matplotlib
%pip install seaborn

## Run LDA topic modeling

In [None]:
from collections import defaultdict
from gensim import corpora, models
from gensim.models import CoherenceModel

def get_semester(date):
  """Return '<year>-S1' for January-June, '<year>-S2' for July-December, None for null dates."""
  if pd.isnull(date):
    return None
  return f'{date.year}-S1' if date.month <= 6 else f'{date.year}-S2'

messages_df['semester'] = messages_df['date'].apply(get_semester)

# Filled in per semester by the loop below; rows that are skipped keep None.
messages_df['assigned_topic'] = None
messages_df['topic_keywords'] = None

semesterly_topics = defaultdict(dict)
lda_objects = defaultdict(dict)

# Number of topics fitted for every semester.
num_topics = 5

# NOTE(review): no visible cell creates the 'advanced_clean' column read below;
# it presumably holds the space-joined stemmed tokens — confirm and add the
# step that builds it.
#
# Group by the column name itself (not a one-element list) so that `semester`
# is a plain string on every pandas version rather than a 1-tuple.
for semester, group in messages_df.groupby('semester'):
  tokenized_docs_series = group['advanced_clean'].apply(str.split)

  # Pair each non-empty document with its row label so results can be written back.
  indexed_docs = [(idx, doc) for idx, doc in zip(group.index, tokenized_docs_series) if len(doc) > 0]
  if len(indexed_docs) < 5:
    continue

  indices, tokenized_docs = zip(*indexed_docs)

  dictionary = corpora.Dictionary(tokenized_docs)
  dictionary.filter_extremes(no_below=5, no_above=0.85)

  corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

  # Drop documents left without any token after the dictionary filtering.
  filtered = [(idx, doc_bow, doc) for idx, doc_bow, doc in zip(indices, corpus, tokenized_docs) if len(doc_bow) > 0]
  if not filtered:
    continue

  indices, corpus, tokenized_docs = zip(*filtered)

  lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
  )

  # Top five words of each topic, joined into one display string.
  topic_id_to_keywords = {
    topic_id: ', '.join([word for word, _ in lda_model.show_topic(topic_id, topn=5)])
    for topic_id in range(num_topics)
  }

  dominant_topics = []
  topic_keywords = []

  # Assign every document the topic with the highest probability.
  for doc_bow in corpus:
    topic_probs = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
    dominant_topics.append(dominant_topic)
    topic_keywords.append(topic_id_to_keywords[dominant_topic])

  messages_df.loc[list(indices), 'assigned_topic'] = dominant_topics
  messages_df.loc[list(indices), 'topic_keywords'] = topic_keywords

  topics = [
    {
      'topic_id': topic_id,
      'top_words': topic_id_to_keywords[topic_id].split(', ')
    }
    for topic_id in range(num_topics)
  ]

  coherence_model = CoherenceModel(
    model=lda_model,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'
  )

  semesterly_topics[semester] = {
    'topics': topics,
    'coherence': coherence_model.get_coherence()
  }

  lda_objects[semester] = {
    'model': lda_model,
    'corpus': corpus,
    'dictionary': dictionary
  }

## Topic heatmap

In [None]:
from collections import defaultdict
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# keyword -> semester -> number of that semester's topics listing the keyword.
keyword_semester_freq = defaultdict(lambda: defaultdict(int))

for semester, topic_info in semesterly_topics.items():
  semester_words = (
    word
    for topic in topic_info['topics']
    for word in topic['top_words']
  )
  for word in semester_words:
    keyword_semester_freq[word][semester] += 1

# Rows are keywords, columns are semesters; missing combinations count as 0.
keyword_by_semester = pd.DataFrame(keyword_semester_freq).T.fillna(0).astype(int)

# Keep the 20 keywords with the highest total and order the semester columns.
keyword_totals = keyword_by_semester.sum(axis=1)
top_keywords = keyword_totals.sort_values(ascending=False).head(20).index
heatmap_df = keyword_by_semester.loc[top_keywords].sort_index(axis=1)

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(heatmap_df, annot=True, fmt='d', cmap='YlGnBu')
heatmap_ax.set_title('Keyword Frequency Across Semesters')
heatmap_ax.set_xlabel('Semester')
heatmap_ax.set_ylabel('Keyword')
plt.tight_layout()
plt.show()

## Identify knowledge seeker, contributor, or neutral

In [None]:
import pandas as pd
import networkx as nx
from datetime import timedelta

def get_semester(date):
  """Map a timestamp to its semester label ('<year>-S1' or '<year>-S2'); null dates map to None."""
  if pd.isnull(date):
    return None
  half = 'S1' if date.month <= 6 else 'S2'
  return f'{date.year}-{half}'

# Topic ids become nullable integers so unassigned rows stay missing.
assigned_topic_ids = messages_df['assigned_topic'].astype('Int64')
messages_df['assigned_topic'] = assigned_topic_ids

messages_df['semester'] = messages_df['date'].apply(get_semester)

def get_semester_range(semester_str, buffer_days=30):
  """Return the (buffered start, end) timestamps of a semester label.

  Args:
    semester_str: label of the form '<year>-S1' or '<year>-S2'.
    buffer_days: number of days subtracted from the semester start.

  Returns:
    A tuple (start - buffer_days, end). `end` is the last representable
    instant of the semester's final day, so a `date <= end` filter keeps
    messages sent at any time on 30 June / 31 December.
  """
  year, half = semester_str.split('-S')
  year = int(year)

  if half == '1':
    start = pd.Timestamp(f'{year}-01-01')
    next_start = pd.Timestamp(f'{year}-07-01')
  else:
    start = pd.Timestamp(f'{year}-07-01')
    next_start = pd.Timestamp(f'{year + 1}-01-01')

  # One nanosecond before the next semester begins; a midnight timestamp on
  # the last day would exclude every message sent later that day.
  end = next_start - pd.Timedelta(1, unit='ns')

  return start - timedelta(days=buffer_days), end

role_dist_results = []
top_users_results = []

# Every (semester, topic) combination that has at least one assigned message.
sem_topic_pairs = (
  messages_df.dropna(subset=['assigned_topic'])[['semester', 'assigned_topic']]
  .drop_duplicates()
  .sort_values(['semester', 'assigned_topic'])
)

for _, row in sem_topic_pairs.iterrows():
  semester = row['semester']
  topic = row['assigned_topic']
  if pd.isnull(semester) or pd.isnull(topic):
    continue

  topic_keywords = (
    messages_df[
      (messages_df['semester'] == semester) &
      (messages_df['assigned_topic'] == topic)
    ]['topic_keywords']
    .dropna()
    .unique()
  )

  topic_keywords = ', '.join(topic_keywords) if len(topic_keywords) > 0 else ''

  buffered_start, semester_end = get_semester_range(semester, buffer_days=30)
  # Unbuffered start of the semester, computed once per pair.
  semester_start = get_semester_range(semester, buffer_days=0)[0]

  # NOTE(review): the 30-day buffer pulls in messages from the previous
  # semester that carry the same topic number, but topic numbers come from a
  # separate LDA model per semester — confirm that matching on the number
  # alone is intended.
  subset = messages_df[
    (messages_df['date'] >= buffered_start) &
    (messages_df['date'] <= semester_end) &
    (messages_df['assigned_topic'] == topic)
  ].copy()

  if subset.empty:
    continue

  # Lookup from message id to its sender, including the buffered messages so
  # replies to slightly older messages can still be resolved.
  id_to_sender = subset.set_index('id')['sender_id'].to_dict()

  in_semester = (subset['date'] >= semester_start) & (subset['date'] <= semester_end)
  reply_msgs = subset[in_semester & subset['reply_to_msg_id'].notna()]

  # One directed edge per reply: replier -> author of the replied-to message.
  edges = []
  for replier, reply_to_id in zip(reply_msgs['sender_id'], reply_msgs['reply_to_msg_id']):
    original_sender = id_to_sender.get(reply_to_id)

    if pd.notna(replier) and pd.notna(original_sender) and replier != original_sender:
      edges.append((replier, original_sender))

  G = nx.DiGraph()
  G.add_edges_from(edges)

  in_deg = dict(G.in_degree())
  out_deg = dict(G.out_degree())
  all_users = list(set(in_deg) | set(out_deg))

  user_roles = pd.DataFrame({
    'sender_id': all_users,
    'in_degree': [in_deg.get(uid, 0) for uid in all_users],
    'out_degree': [out_deg.get(uid, 0) for uid in all_users]
  })

  if user_roles.empty:
    # No reply edges for this pair: keep an empty table with the expected
    # columns so the pair is reported with zero counts instead of failing.
    user_roles = user_roles.assign(
      role=pd.Series(dtype='object'),
      message_count=pd.Series(dtype='int64')
    )
  else:
    # NOTE(review): users who receive more replies than they send are labelled
    # 'contributor' and users who send more are labelled 'seeker' — confirm
    # this matches the intended definition of the roles.
    user_roles['role'] = 'neutral'
    user_roles.loc[user_roles['in_degree'] > user_roles['out_degree'], 'role'] = 'contributor'
    user_roles.loc[user_roles['out_degree'] > user_roles['in_degree'], 'role'] = 'seeker'

    # Messages each user sent inside the semester itself (buffer excluded).
    semester_msgs = subset[in_semester]
    msg_counts = semester_msgs.groupby('sender_id').size().reset_index(name='message_count')
    user_roles = user_roles.merge(msg_counts, on='sender_id', how='left').fillna({'message_count': 0})

  role_counts = user_roles['role'].value_counts().to_dict()

  role_dist_results.append({
    'semester': semester,
    'assigned_topic': topic,
    'topic_keywords': topic_keywords,
    'contributor': role_counts.get('contributor', 0),
    'seeker': role_counts.get('seeker', 0),
    'neutral': role_counts.get('neutral', 0)
  })

  # Three most active users of each role.
  top_users = {}
  for role in ['contributor', 'seeker', 'neutral']:
    top = (
      user_roles[user_roles['role'] == role]
      .sort_values('message_count', ascending=False)
      .head(3)[['sender_id', 'message_count']]
      .to_dict('records')
    )
    top_users[f'top_{role}s'] = top

  top_users_results.append({
    'semester': semester,
    'assigned_topic': topic,
    'topic_keywords': topic_keywords,
    **top_users
  })

role_distribution_topicwise = pd.DataFrame(role_dist_results)
top_users_df = pd.DataFrame(top_users_results)

## Export

In [None]:
from datetime import datetime

import os
import shutil

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
excel_filename = f'topic_role_analysis_{timestamp}.xlsx'

# One workbook with a sheet for each result table.
with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
  role_distribution_topicwise.to_excel(writer, sheet_name='Role Distribution', index=False)
  top_users_df.to_excel(writer, sheet_name='Top Users', index=False)

# Destination folder per environment. shutil/os raise on failure, so the
# success message is only printed when the file was really moved.
destination_dirs = {
  'colab': '/content/drive/MyDrive/telegram-data/',
  'conda': './data/'
}

if env in destination_dirs:
  file_path = destination_dirs[env]
  os.makedirs(file_path, exist_ok=True)
  shutil.move(excel_filename, file_path)
  print(f'Successfully moved {excel_filename} to {file_path}')
else:
  print('Unable to detect suitable environment!')