In [21]:
####        \\      Load Relevant Libraries      //      ### 
from bertopic import BERTopic
import json
import pandas as pd
import pathlib

In [22]:
print(pathlib.Path().absolute())

/Users/jenskoning/Documents/Python_projects/dsr_media_nlp/dsr_media_nlp/Topic_model


In [23]:
with open("all_news_items.json", "r", encoding="utf-8") as f:
    docs = json.load(f)
len(docs)

4492

In [24]:
pd_docs = pd.DataFrame(docs)

# List of words to check in the column names
words_to_remove = ['bitcoin', 'Bitcoin', 'crypto', 'cryptocurrency', 'museum', 'macclesfield', 'coins', 'fashion', 'ftx', 'dunhuang', 'heritage', 'archaeological', 'abortion', 'republican', 'hearings',
                   'liverpool', 'temple', 'art', 'cultural', 'nike', 'caves', 'murals','relics', 'exhibition', 'hotel', 'textile', 'ciff', 'migrants', 'meloni', 'gold',
                   'weather', 'meteorological', 'autograph', 'columbus', 'cashmere', 'xifeng', 'wuliangye', 'liquor', 'grottoes', 'ancient', 'furniture', 'tourism', 'tourists',
                   'cancer', 'blockchain', 'isis', 'covid', 'polyamide', 'wine', 'Deodorants', 'cbdc', 'bloodstock', 'ecological', 'festival']

# Find columns containing the specified words and remove them
pd_docs = pd_docs[~pd_docs['title'].str.lower().str.contains('|'.join(words_to_remove))]
len(pd_docs)

3478

In [25]:
# Remmove missing values from target column 
column = pd.DataFrame(pd_docs, columns=['summary'])
column = [text for text in pd_docs['summary'] if text is not None]
cleaned_text_data = column

In [26]:
# Display cleaned text
cleaned_text_data

["As part of China's massive Belt and Road Initiative (BRI), the biggest infrastructure undertaking in the world, Beijing has launched the Digital Silk Road (DSR). Announced in 2015 with a loose mandate, the DSR has become a significant part of Beijing's overall BRI strategy, under which China provides aid, political support, and other assistance to recipient states. DSR also provides support to Chinese exporters, including many well-known Chinese technology companies, such as Huawei. The DSR assistance goes toward improving recipients' telecommunications networks, artificial intelligence capabilities, cloud computing, e-commerce and mobile payment systems, surveillance technology, smart cities, and other high-tech areas.\nChina has already signed agreements on DSR cooperation with, or provided DSR-related investment to, at least sixteen countries. But the true number of agreements and investments is likely much larger, because many of these go unreported: memoranda of understanding (M

In [27]:
#### Load embedding model ####
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")


# Reduce influence of stop words / Improves the representation of topics
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model)

In [28]:
topics, probs = topic_model.fit_transform(cleaned_text_data)

In [29]:
topics = topic_model.get_topic_info()
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,812,-1_terrorist_2016_counterterrorism_attacks,"[terrorist, 2016, counterterrorism, attacks, s...","[WILLIAMSBURG, Va., April 19, 2021 /PRNewswire..."
1,0,225,0_huawei_5g_data_internet,"[huawei, 5g, data, internet, dsr, digital, hua...","[The ""The State of 4G & 5G Pricing in Latin Am..."
2,1,153,1_land_festival_agricultural_film,"[land, festival, agricultural, film, dunhuang,...",[By WANG KAIHAO | China Daily | Updated: 2021-...
3,2,84,2_b3w_g7_labour_infrastructure,"[b3w, g7, labour, infrastructure, carbis, comm...",[G7 rivals China with grand infrastructure pla...
4,3,79,3_ap_biden_g7_mobilize,"[ap, biden, g7, mobilize, england, involves, f...",[By Associated Press \n2022/06/29 02:35 \n \n\...
...,...,...,...,...,...
91,90,12,90_kazakhstan_kazakh_kazakhstans_almaty,"[kazakhstan, kazakh, kazakhstans, almaty, step...","[Xinhua \n19th May 2023, 14:18 GMT+10\n© Provi..."
92,91,12,91_russia_eurasian_murg_spief,"[russia, eurasian, murg, spief, russian, store...",[PHNOM PENH — \nRussia is attempting to expand...
93,92,11,92_arctic_chen_chunjiang_polar,"[arctic, chen, chunjiang, polar, 5713, 421000,...","[BEIJING, March 2 (Xinhua) -- Economic and tra..."
94,93,11,93_tajani_italy_envoys_draghi,"[tajani, italy, envoys, draghi, origins, lijia...","[Ministry of Foreign Affairs, the People's Rep..."


In [30]:
topic_model.visualize_heatmap(n_clusters=15)

In [31]:
topic_model.get_topic_info().head(15)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,812,-1_terrorist_2016_counterterrorism_attacks,"[terrorist, 2016, counterterrorism, attacks, s...","[WILLIAMSBURG, Va., April 19, 2021 /PRNewswire..."
1,0,225,0_huawei_5g_data_internet,"[huawei, 5g, data, internet, dsr, digital, hua...","[The ""The State of 4G & 5G Pricing in Latin Am..."
2,1,153,1_land_festival_agricultural_film,"[land, festival, agricultural, film, dunhuang,...",[By WANG KAIHAO | China Daily | Updated: 2021-...
3,2,84,2_b3w_g7_labour_infrastructure,"[b3w, g7, labour, infrastructure, carbis, comm...",[G7 rivals China with grand infrastructure pla...
4,3,79,3_ap_biden_g7_mobilize,"[ap, biden, g7, mobilize, england, involves, f...",[By Associated Press \n2022/06/29 02:35 \n \n\...
5,4,70,4_asean_chinaasean_rcep_asiapacific,"[asean, chinaasean, rcep, asiapacific, brunei,...","[The 43rd ASEAN Summit, to be held in Jakarta ..."
6,5,69,5_ap_biden_republican_g7,"[ap, biden, republican, g7, ulbricht, court, m...",[WASHINGTON (AP) — The House Jan. 6 panel is c...
7,6,52,6_market_ultrasonic_cleaner_shaking,"[market, ultrasonic, cleaner, shaking, stents,...","[The ""Global Mosquito Repellent Market 2021-20..."
8,7,51,7_gcc_chinagcc_gulf_saudi,"[gcc, chinagcc, gulf, saudi, arabia, oil, riya...","[Posted on December 18, 2022 by martyrashrakat..."
9,8,50,8_party_deng_xis_renaissance,"[party, deng, xis, renaissance, ccp, communist...",[Marx. Lenin. Mao. Deng. Xi. Late last week in...


In [32]:
topic_model.get_topic_info(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,3,79,3_ap_biden_g7_mobilize,"[ap, biden, g7, mobilize, england, involves, f...",[By Associated Press \n2022/06/29 02:35 \n \n\...


In [33]:
topic_model.visualize_barchart(
    topics=list(range(1, 10)),
    title='Most Common Topics (n = 3487)' 
)

In [34]:
topic_model.visualize_hierarchy()

In [35]:
#### Remove irrelevant topics? Reduce the number of topics ####
#reduced_topics = topic_model.reduce_topics(cleaned_text_data, nr_topics=50)

In [36]:
#reduced_topics.get_topic_info()

In [37]:
# Things to try out

# Reduce the number of topics
#topic.model.reduce_topics(xxx, nr_topics=25)

# Find specific topics

topic_model.find_topics("Digital Silk Road", top_n=1)



([0], [0.58502394])