### Simple topic modeling with Google News

In [1]:
import pandas as pd 
import os 
#from utils import Args
import json 
import numpy as np
from sentence_transformers import SentenceTransformer
from topic_model_utils import *
#model_setup,train_topic_model,eval_topic_model,train_and_eval,hyper_param_permutation,Args,pack_update_param

In [2]:
## Global switches and file locations shared by the rest of the notebook.
# LOAD_EMB: True -> read the cached embeddings/docs from disk; False -> encode and save them.
LOAD_EMB = True
# Folder holding the scraped-news artifacts; every path below is built under it.
res_folder = '/data/chuang/news_scrape/data/news_search_res'
news_output_p, emb_path, docs_path, topic_model_out_path = (
    os.path.join(res_folder, leaf)
    for leaf in (
        'news_merged.csv',          # merged news table read as the raw data
        'sentence_embeddings.npy',  # cached sentence embeddings
        'docs.npy',                 # cached documents stored next to the embeddings
        'topic_model',              # target of topic_model.save(...)
    )
)

## Evaluate raw data 

In [3]:
## Raw data exploration: read the merged news table and preview it.
df = pd.read_csv(news_output_p)
# Only the article body is kept as the document text.
docs = df.loc[:, 'body'].to_list()
df.head()


Unnamed: 0,newspaper_name,time_start,time_end,year,month,day,title,link,body,body_length
0,iwnsvg.com,2010-05-01,2010-06-01,2010,5,9,Opposition leader: Taiwanese letting Vincies o...,https://news.google.com/rss/articles/CBMiSWh0d...,ST. VINCENT: – Stakeholders in the agricultura...,2960
1,iwnsvg.com,2010-05-01,2010-06-01,2010,5,10,Three artistes bag early Vincy Mas wins – iWit...,https://news.google.com/rss/articles/CBMiTmh0d...,ST. VINCENT: – Three artistes left the launch ...,3133
2,iwnsvg.com,2010-05-01,2010-06-01,2010,5,11,"U.S. to reengage Caribbean, diplomats says – i...",https://news.google.com/rss/articles/CBMiS2h0d...,CARIBBEAN: – The Barack Obama administration i...,3624
3,iwnsvg.com,2010-05-01,2010-06-01,2010,5,25,Vincies safe as 27 die in Jamaica shootouts – ...,https://news.google.com/rss/articles/CBMiTmh0d...,ST. VINCENT:- Vincentian students at the Unive...,3338
4,iwnsvg.com,2013-09-01,2013-10-01,2013,9,1,Concerns about approved BLA capitalization pla...,https://news.google.com/rss/articles/CBMiUmh0d...,Some members of the Building and Loan Associat...,5857


## Prepare data for topic modeling

In [4]:
## Build the argument object from the JSON file and list every setting it holds.
args = Args('./args/train_args.json')
# vars(args) is the instance attribute dict, so each attribute is printed once.
for attr, value in vars(args).items():
    print(f"{attr} = {value}")

TUNE = False
verbose = False
model_name = sentence-transformers/all-MiniLM-L6-v2
model_checkpoint = sentence-transformers/all-MiniLM-L6-v2
n_neighbors = 15
n_components = 5
min_cluster_size = 5
min_samples = 5
min_df = 5
nr_topics = auto
metric = euclidean
calculate_probabilities = False
top_n_words = 10


In [5]:
#### Load the sentence-embedding model and encode docs, or reuse the cached arrays.
# The cache is used only when LOAD_EMB is set AND both cache files exist. Otherwise
# the documents are encoded and the cache is (re)written, so the notebook still runs
# on a machine where the .npy files have not been produced yet.
cache_found = os.path.exists(emb_path) and os.path.exists(docs_path)
if LOAD_EMB and not cache_found:
    print('Cached embeddings not found at {}; encoding docs instead'.format(emb_path))

if LOAD_EMB and cache_found:
    print('Load embeding from {}'.format(emb_path))
    embeddings = np.load(emb_path)
    docs = np.load(docs_path)
    assert len(docs)==len(embeddings)
else:
    print('use model : {}'.format(args.model_name))
    sentence_model = SentenceTransformer(args.model_name)
    embeddings = sentence_model.encode(docs, show_progress_bar=True) ## encode sentences
    # Check alignment before anything is written, so a bad cache is never saved.
    assert len(docs)==len(embeddings)
    embeddings = np.array(embeddings)
    docs = np.array(docs)
    np.save(emb_path,embeddings)
    np.save(docs_path,docs)

# Reported for both branches so the output always states the corpus size.
print('Number of docs: {}'.format(len(docs)))

Load embeding from /data/chuang/news_scrape/data/news_search_res/sentence_embeddings.npy
Number of docs: 571


### Set up the parameter search space

In [6]:
## Candidate values per hyper-parameter; hyper_param_permutation turns this grid
## into the list of parameter combinations iterated over in the tuning loop.
train_args_inputs = dict(
    n_neighbors=[5, 10],
    n_components=[3, 5],
    min_cluster_size=[5, 10],
    min_samples=[5],
    metric=['euclidean'],
    top_n_words=[10],
)
train_args_space = hyper_param_permutation(train_args_inputs)

### Iterate through all parameter combinations

In [7]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"  ## set it to false to avoid warning message
# Train and score one topic model per parameter combination. A failing run is
# reported and recorded with empty metrics so the remaining combinations still run.
all_res = []
for param in tqdm(train_args_space):
    # Overlay this run's values on the shared args object.
    vars(args).update(param)
    try:
        topics, probabilities, topic_model = train_topic_model(args, docs, embeddings)
        coherence_scores, outlier_percent, n_topics, diversity_score = eval_topic_model(
            docs, topics, probabilities, topic_model, n_workers=16)
    except Exception as e:
        # repr(e) keeps the exception type visible even when its message is empty.
        print('-- Error -- \n{}\n{!r}'.format(param, e))
        coherence_scores, outlier_percent, n_topics, diversity_score = (None, None, None, None)
    res_dict = pack_update_param(param, coherence_scores, outlier_percent, n_topics, diversity_score)
    all_res.append(res_dict)
    if args.verbose:
        print(res_dict)
    

100%|██████████| 8/8 [14:10<00:00, 106.37s/it]


In [8]:
## Rank the runs by coherence (best first); the top row becomes the winning setting.
res_df = pd.DataFrame(all_res).sort_values(by='coherence', ascending=False)
# The first row after sorting, as a plain dict (parameters and metrics together).
best_param = res_df.iloc[0].to_dict()
print(best_param)
res_df.head()

{'n_neighbors': 10, 'n_components': 5, 'min_cluster_size': 10, 'min_samples': 5, 'metric': 'euclidean', 'top_n_words': 10, 'coherence': 0.5073137990175771, 'diversity': 0.9, 'outlier': 0.20665499124343256, 'number_topics': 6}


Unnamed: 0,n_neighbors,n_components,min_cluster_size,min_samples,metric,top_n_words,coherence,diversity,outlier,number_topics
7,10,5,10,5,euclidean,10,0.507314,0.9,0.206655,6
6,10,5,5,5,euclidean,10,0.505834,0.74,0.197898,5
0,5,3,5,5,euclidean,10,0.480776,0.9,0.246935,16
4,10,3,5,5,euclidean,10,0.463478,0.833333,0.227671,15
2,5,5,5,5,euclidean,10,0.422394,0.85,0.178634,14


### Retrain the model with the best parameters

In [9]:
## Retrain once with the winning hyper-parameters and save the resulting model.
# best_param also carries the evaluation columns (coherence, diversity, outlier,
# number_topics). Only keys that belong to the search grid are copied, so those
# metrics are not written onto args as if they were settings.
best_hyper_params = {k: v for k, v in best_param.items() if k in train_args_inputs}
vars(args).update(best_hyper_params)
args.verbose=True
args.TUNE=False
topics,probabilities,topic_model = train_topic_model(args,docs,embeddings)
topic_model.save(topic_model_out_path, save_embedding_model=True)
print()

use sentence-transformers/all-MiniLM-L6-v2 as embeding model


2023-11-19 16:11:16,926 - BERTopic - Reduced dimensionality
2023-11-19 16:11:16,945 - BERTopic - Clustered reduced embeddings
2023-11-19 16:12:19,430 - BERTopic - Reduced number of topics from 16 to 6






### Topic model visual evaluation 

In [10]:
# Show the bar-chart view of the retrained topic_model as this cell's output.
# NOTE(review): presumably BERTopic's per-topic top-word chart — confirm against the library docs.
topic_model.visualize_barchart()

In [11]:
# Show the topic overview plot of the retrained topic_model as this cell's output.
# NOTE(review): presumably BERTopic's inter-topic distance map — confirm against the library docs.
topic_model.visualize_topics()