# Topic Visualization

## Initialization

In [None]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import os

In [None]:
# Working directory of the notebook: the relative paths configured in later
# cells (e.g. PATH_FILES) are resolved against it.
PATH_DIR = './'
os.chdir(PATH_DIR)

## Functions

In [6]:
##################################### GET FILES #####################################

def load_pickle(filename, PATH_FILES, file_ending):
    """Best-effort load of the object pickled at PATH_FILES + filename + file_ending + '.pickle'.

    Args:
        filename: Base name of the pickle (e.g. "corpus").
        PATH_FILES: Path prefix, concatenated as-is (include the trailing separator).
        file_ending: Suffix appended to ``filename`` before the '.pickle' extension.

    Returns:
        The unpickled object, or ``None`` when the file cannot be opened or
        unpickled (the reason is printed so the failure is not silent).

    Note:
        ``pickle.load`` can execute arbitrary code; only load files you trust.
    """
    try:
        with open(PATH_FILES+filename+file_ending+'.pickle', 'rb') as handle:
            loaded = pickle.load(handle)
        print(filename+file_ending+" file loaded")
        return loaded
    except Exception as error:
        # `Exception` instead of a bare `except`: ordinary failures still yield
        # None, but KeyboardInterrupt / SystemExit are no longer swallowed.
        print("{}{} file could not be loaded: {!r}".format(filename, file_ending, error))
        return None

def load_model(PATH_MODEL):
    """Load the BERTopic model saved at ``PATH_MODEL`` and print a confirmation.

    Args:
        PATH_MODEL: Location passed unchanged to ``BERTopic.load``.

    Returns:
        The object returned by ``BERTopic.load``.
    """
    loaded_model = BERTopic.load(PATH_MODEL)
    print("BERT model file loaded")
    return loaded_model

def get_files(PATH_FILES, file_ending, PATH_MODEL):
    """Load the corpus pickle, the embeddings pickle and the fitted topic model.

    Args:
        PATH_FILES: Path prefix of the pickle files.
        file_ending: Suffix shared by the pickle file names.
        PATH_MODEL: Location of the saved BERTopic model.

    Returns:
        Tuple ``(topic_model, corpus, embeddings)``; ``corpus`` / ``embeddings``
        are whatever ``load_pickle`` returns for them.
    """
    # Pickles are read first (corpus, then embeddings); the model is loaded last.
    corpus, embeddings = [
        load_pickle(artefact, PATH_FILES, file_ending)
        for artefact in ("corpus", "embeddings")
    ]
    return load_model(PATH_MODEL), corpus, embeddings

##################################### VISUALIZATION #####################################

def visualize_hierarchy_(topic_model, corpus):
    """Compute the topic hierarchy for ``corpus`` and return its visualization.

    Args:
        topic_model: Model exposing ``hierarchical_topics`` and ``visualize_hierarchy``.
        corpus: Documents handed to ``topic_model.hierarchical_topics``.

    Returns:
        Whatever ``topic_model.visualize_hierarchy`` returns for that hierarchy.
    """
    return topic_model.visualize_hierarchy(
        hierarchical_topics=topic_model.hierarchical_topics(corpus)
    )

def reduce_topics_(topic_model, corpus, n_topics=None):
    """Ask ``topic_model`` to reduce its topics; nothing is returned.

    Args:
        topic_model: Model exposing ``reduce_topics``.
        corpus: Documents passed to ``topic_model.reduce_topics``.
        n_topics: Value forwarded as ``nr_topics``; ``None`` (the default)
            forwards ``"auto"`` instead.
    """
    target = "auto" if n_topics is None else n_topics
    topic_model.reduce_topics(corpus, nr_topics=target)

def merge_topics_(topic_model, corpus, topic_list):
    """Forward ``corpus`` and ``topic_list`` to ``topic_model.merge_topics``.

    Args:
        topic_model: Model exposing ``merge_topics``.
        corpus: Documents passed as the first positional argument.
        topic_list: Topics to merge, passed as the second positional argument.

    Nothing is returned.
    """
    merge = topic_model.merge_topics
    merge(corpus, topic_list)

def update_with_vectorizer(topic_model, corpus):
    """Update the model's topics using a fresh ``CountVectorizer``.

    The vectorizer is built with ``stop_words="english"`` and
    ``ngram_range=(1, 3)``.

    Args:
        topic_model: Model exposing ``update_topics``.
        corpus: Documents passed to ``topic_model.update_topics``.

    Returns:
        The same ``topic_model`` object that was passed in.
    """
    # Extra vectorizer options that are currently switched off:
    # min_df=50, max_features=20_000
    topic_model.update_topics(
        corpus,
        vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(1, 3)),
    )
    return topic_model

## Model loading

In [7]:
# Suffix shared by the artefacts to load (pickles and model folder)
file_ending = '_2022_v4'
# Folder prefix holding the pickles and the saved model
PATH_FILES = 'Data/bert_out/'
PATH_MODEL = f'{PATH_FILES}bert_fitted_model{file_ending}/'

# corpus / embeddings are None when their pickle could not be loaded
topic_model, corpus, embeddings = get_files(PATH_FILES, file_ending, PATH_MODEL)

corpus_2022_v4 file loaded
embeddings_2022_v4 file loaded


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BERT model file loaded


## Visualization

In [None]:
# Per-topic summary table of the model as loaded (before any topic reduction)
topic_stats_df = topic_model.get_topic_info()
topic_stats_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,58626,-1_like_rt_one_people,"[like, rt, one, people, get, follow, love, im,...",
1,0,2442,0_song_album_music_spotify,"[song, album, music, spotify, songs, listen, p...",
2,1,2225,1_trump_biden_president_fbi,"[trump, biden, president, fbi, donald, donald ...",
3,2,2176,2_ukraine_russian_russia_putin,"[ukraine, russian, russia, putin, ukrainian, w...",
4,3,1626,3_movie_film_episode_watch,"[movie, film, episode, watch, scene, trailer, ...",
...,...,...,...,...,...
130,129,108,129_tour_concert_world tour_love tour,"[tour, concert, world tour, love tour, october...",
131,130,106,130_shes_hes_shes pretty_shes shes,"[shes, hes, shes pretty, shes shes, pretty, he...",
132,131,105,131_lisa_celine_lalisa_lalisa human,"[lisa, celine, lalisa, lalisa human, human cel...",
133,132,104,132_breath_smell_breathe_smells,"[breath, smell, breathe, smells, cant breathe,...",


In [None]:
# Topic overview plot for the model as loaded (before any topic reduction)
topic_model.visualize_topics()

In [None]:
# Bar charts for topic ids 0-29
topic_model.visualize_barchart(list(range(30)))

In [None]:
# Hierarchy plot of the topics of the model as loaded (before any topic reduction)
visualize_hierarchy_(topic_model, corpus)

100%|██████████| 133/133 [00:02<00:00, 46.25it/s]


## Topic reduction (auto)

In [8]:
# No n_topics given, so reduce_topics_ forwards nr_topics="auto".
# NOTE: this modifies topic_model in place; call load_model(PATH_MODEL) again
# to get back to the originally saved topics.
reduce_topics_(topic_model, corpus)

2024-06-25 13:06:32,213 - BERTopic - Topic reduction - Reducing number of topics
2024-06-25 13:06:47,375 - BERTopic - Topic reduction - Reduced number of topics from 135 to 103


In [9]:
# Per-topic summary table after the automatic reduction; topic_stats_df is
# rebound here, so this is the frame the following export cell writes out.
topic_stats_df = topic_model.get_topic_info()
topic_stats_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,58626,-1_like_rt_one_people,"[like, rt, one, people, get, follow, love, tim...","[mins rt rt like, k hours rt follow, k hours r..."
1,0,5651,0_tag_follow_giveaway_retweet,"[tag, follow, giveaway, retweet, tag friends, ...",[giveaway x og spots enter follow like rt tag ...
2,1,2987,1_birthday_happy birthday_happy_morning,"[birthday, happy birthday, happy, morning, day...","[happy birthday, happy birthday, happy birthday]"
3,2,2442,2_song_album_music_spotify,"[song, album, music, spotify, songs, listen, p...",[seokjin shazam record holder seokjin represen...
4,3,2225,3_trump_biden_president_fbi,"[trump, biden, president, fbi, donald, donald ...",[say president trump hes president anymore sur...
...,...,...,...,...,...
98,97,110,97_cute cute_cute_cutest_cute cutest,"[cute cute, cute, cutest, cute cutest, cutest ...","[cute, cute, cute]"
99,98,109,98_account_suspended_instagram_accounts,"[account, suspended, instagram, accounts, help...","[last account suspended, suspended, give us re..."
100,99,106,99_shes_hes_shes pretty_shes shes,"[shes, hes, shes pretty, shes shes, pretty, he...","[shes, shes, shes]"
101,100,105,100_lisa_celine_lalisa_lalisa human,"[lisa, celine, lalisa, lalisa human, human cel...","[lisa, energy lalisa human celine, one favouri..."


In [10]:
# Export the current topic_stats_df next to the other artefacts of this run.
# The path is built from PATH_FILES / file_ending instead of a hardcoded
# absolute Colab/Drive location, so the cell also runs outside that environment.
stats_dir = os.path.join(PATH_FILES, 'bert'+file_ending)
os.makedirs(stats_dir, exist_ok=True)  # to_excel does not create missing folders
topic_stats_df.to_excel(os.path.join(stats_dir, 'topic_stats.xlsx'))

In [None]:
# Topic overview plot after the automatic reduction
topic_model.visualize_topics()

In [None]:
# Bar charts for topic ids 0-29 of the automatically reduced model
topic_model.visualize_barchart(list(range(30)))

In [None]:
# Hierarchy plot of the topics left after the automatic reduction
visualize_hierarchy_(topic_model, corpus)

100%|██████████| 101/101 [00:03<00:00, 27.78it/s]


## Topic reduction (30)

In [None]:
# Reload the saved model: the automatic reduction above changed topic_model in
# place, and the fixed-size reduction below must start from the original topics.
topic_model = load_model(PATH_MODEL)

BERT model file loaded


In [None]:
# Forward nr_topics=30 to the freshly reloaded model (modifies topic_model in place)
reduce_topics_(topic_model, corpus, n_topics=30)

2024-06-25 09:37:10,727 - BERTopic - Topic reduction - Reducing number of topics
2024-06-25 09:37:19,601 - BERTopic - Topic reduction - Reduced number of topics from 135 to 30


In [None]:
# Per-topic summary table after the reduction to 30 topics (rebinds topic_stats_df)
topic_stats_df = topic_model.get_topic_info()
topic_stats_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,58626,-1_like_rt_one_get,"[like, rt, one, get, people, follow, love, tim...","[mins rt rt like, k hours rt follow, k hours r..."
1,0,6994,0_love_im_like_want,"[love, im, like, want, heart, friends, crying,...","[love, love g, love]"
2,1,6067,1_song_video_album_music,"[song, video, album, music, new, movie, watch,...",[bts paved way favorite song best selling song...
3,2,4125,2_birthday_morning_happy_happy birthday,"[birthday, morning, happy, happy birthday, goo...","[happy birthday, happy birthday, happy birthday]"
4,3,3807,3_trump_biden_abortion_president,"[trump, biden, abortion, president, people, fb...",[ivanka trump daughter former president donald...
5,4,3734,4_tag_follow_giveaway_tag friends,"[tag, follow, giveaway, tag friends, airdrop, ...",[giveaway sol x nfts x wl spots enter follow r...
6,5,3381,5_art_hair_moon_look,"[art, hair, moon, look, eyes, like, photo, blu...","[eyes, eyes, work art]"
7,6,2760,6_ukraine_russia_russian_putin,"[ukraine, russia, russian, putin, war, ukraini...",[russias putins war putin actual russian soldi...
8,7,2507,7_god_ji_india_nigeria,"[god, ji, india, nigeria, allah, khan, pakista...",[today nobody bring happiness except god kabir...
9,8,2355,8_team_game_season_league,"[team, game, season, league, ronaldo, player, ...",[get lot hate psg fans say lionel messi treble...


In [None]:
# Topic overview plot after the reduction to 30 topics
topic_model.visualize_topics()

In [None]:
# Bar charts for topic ids 0-28 (presumably all non-outlier topics left after
# reducing to 30, with the outlier topic -1 counted among the 30 - verify)
topic_model.visualize_barchart(list(range(29)))

In [None]:
# Hierarchy plot of the topics left after the reduction to 30 topics
visualize_hierarchy_(topic_model, corpus)

100%|██████████| 28/28 [00:00<00:00, 33.00it/s]
