In [1]:

import pandas as pd
import gzip
import sys
import glob
import os
import re
from tqdm.notebook import tqdm
import sqlite3
from IPython.display import display

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load the "all-MiniLM-L6-v2" sentence-transformers model and pass it to
# BERTopic as its embedding model; `topic_model` is fitted further down on
# the per-group documents.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=embedding_model)


# Input locations: the SQLite chat database and the two gzip-compressed CSV
# exports (first nodes and discovery edges).
chats_db_path = "/home/students/s328743/telegram_2024/usc-tg-24-us-election/chats.db"
csv_path_first_nodes = "/home/students/s328743/Thesis/first_nodes.csv.gz"
csv_path_discovery_edges = "/home/students/s328743/Thesis/discovery_edges.csv.gz"

# Read the full `chats` table, then release the SQLite handle even if the
# query fails -- the connection is only needed for this single read, and
# leaving it open keeps the database file handle alive for the whole session.
conn = sqlite3.connect(chats_db_path)
try:
    df_chats = pd.read_sql("SELECT * FROM chats", conn)
finally:
    conn.close()

df_first_nodes = pd.read_csv(csv_path_first_nodes, compression='gzip')
df_discovery_edges = pd.read_csv(csv_path_discovery_edges, compression='gzip')

# Notebook rendering preferences: show up to 200 rows and 20 columns, use a
# 200-character wide layout, centre column headers, and keep wide frames on
# one block instead of wrapping them.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 20
pd.options.display.width = 200
pd.options.display.colheader_justify = 'center'
pd.options.display.expand_frame_repr = False

# Gather the extracted per-group message dumps. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort before truncating to make
# the 1000-file sample reproducible across runs and machines.
file_paths = sorted(
    glob.glob("/home/students/s328743/telegram_2024/usc-tg-24-us-election/extracted/*/*.tsv.gz")
)[:1000]

# Fail early with a clear message instead of letting pd.concat raise on an
# empty list further down.
if not file_paths:
    raise FileNotFoundError("No extracted *.tsv.gz files found; nothing to load.")

# The directory directly under /extracted/ is used as the group name.
group_name_pattern = re.compile(r"/extracted/([^/]+)/")

dfs = []

for path in tqdm(file_paths):
    # Columns keyed 10 and 15 are forced to str (presumably positional
    # indices of mixed-type columns -- confirm against the TSV schema).
    df2 = pd.read_csv(path, sep="\t", quotechar='"', dtype={10: str, 15: str})

    # Explicit check rather than `assert`, which is skipped under `python -O`;
    # the message also identifies the offending file.
    if df2['timestamp'].isna().any():
        raise ValueError(f"Missing timestamp values in {path}")

    # Keep provenance so every row can be traced back to its source file.
    df2['file_path'] = path

    match = group_name_pattern.search(path)
    df2['group_name'] = match.group(1) if match else None

    dfs.append(df2)

# Single frame with all messages from the sampled files, re-indexed from 0.
final_df = pd.concat(dfs, ignore_index=True)

# Show each loaded dataset under a short text heading. Only the head of the
# concatenated message frame is displayed; the other three are shown in full
# (subject to the pandas display limits).
print("First Nodes\n------------------------------\n\n")
display(df_first_nodes)
print("Discovery edges\n-----------------------------------\n\n")
display(df_discovery_edges)
print("Chats.db\n-----------------------------------\n\n")
display(df_chats)
print("Extracted\n-----------------------------------\n\n")
display(final_df.head())

# Messages without text carry no signal for topic modelling.
df_clean = final_df.dropna(subset=['text'])

# Build one document per group by joining all of its messages with spaces.
df_grouped = (
    df_clean.groupby("group_name")["text"]
    .apply(lambda x: " ".join(x.astype(str)))
    .reset_index()
)

# Fit BERTopic on the per-group documents; `topics` holds one topic id per
# document, in the same order as `df_grouped`.
documents = df_grouped['text'].tolist()
topics, probs = topic_model.fit_transform(documents)

df_grouped['topic_id'] = topics

# Label each group with the first word returned for its topic; topic id -1
# is labelled "unknown" instead of being looked up.
df_grouped['topic_label'] = df_grouped['topic_id'].apply(
    lambda x: topic_model.get_topic(x)[0][0] if x != -1 else "unknown"
)

df_result = df_grouped[['group_name', 'topic_id', 'topic_label']]

# `tools` is never imported or defined in this file, so the previous
# `tools.display_dataframe_to_user(...)` call raised NameError after the
# expensive fit. Render with the IPython `display` helper already imported
# above, keeping the table title as a heading like the other sections.
print("Topic per gruppo Telegram\n-----------------------------------\n\n")
display(df_result)


ModuleNotFoundError: No module named 'pandas'