In [146]:
####        \\      Load Relevant Libraries      //      ### 
from bertopic import BERTopic
import json
import pandas as pd
import pathlib

In [147]:
print(pathlib.Path().absolute())

/Users/jenskoning/Documents/Python_projects/dsr_media_nlp/dsr_media_nlp/Topic_model


In [148]:
with open("all_news_items.json", "r", encoding="utf-8") as f:
    docs = json.load(f)
len(docs)

4492

In [149]:
pd_docs = pd.DataFrame(docs)

# List of words to check in the column names
words_to_remove = ['bitcoin', 'crypto', 'cryptocurrency', 'museum', 'macclesfield', 'coins', 'fashion', 'ftx', 'dunhuang', 'archaeological', 'abortion', 'republican', 'hearings',
                   'liverpool', 'temple', 'art', 'cultural', 'nike', 'caves', 'murals','relics', 'exhibition', 'hotel', 'textile', 'ciff', 'migrants', 'meloni', 'gold',
                   'weather', 'meteorological', 'autograph', 'columbus', 'cashmere', 'xifeng', 'wuliangye', 'liquor', 'grottoes', 'ancient', 'furniture', 'tourism', 'tourists',
                   'cancer']

# Find columns containing the specified words and remove them
pd_docs = pd_docs[~pd_docs['title'].str.lower().str.contains('|'.join(words_to_remove))]
len(pd_docs)

3565

In [150]:
# Remmove missing values from target column 
column = pd.DataFrame(pd_docs, columns=['summary'])
column = [text for text in pd_docs['summary'] if text is not None]

In [151]:
#### Clean stop words from test data ####
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

stop_words = set(stopwords.words("english"))


cleaned_text_data = [
    " ".join([word for word in sentence.split() if word.lower() not in stop_words])
    for sentence in column
]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jenskoning/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [152]:
# Display cleaned text
cleaned_text_data

["part China's massive Belt Road Initiative (BRI), biggest infrastructure undertaking world, Beijing launched Digital Silk Road (DSR). Announced 2015 loose mandate, DSR become significant part Beijing's overall BRI strategy, China provides aid, political support, assistance recipient states. DSR also provides support Chinese exporters, including many well-known Chinese technology companies, Huawei. DSR assistance goes toward improving recipients' telecommunications networks, artificial intelligence capabilities, cloud computing, e-commerce mobile payment systems, surveillance technology, smart cities, high-tech areas. China already signed agreements DSR cooperation with, provided DSR-related investment to, least sixteen countries. true number agreements investments likely much larger, many go unreported: memoranda understanding (MOUs) necessarily show whether China another country embarked upon close cooperation digital sphere. estimates suggest one-third countries participating BRI—13

In [153]:
#### Load embedding model ####
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [154]:
topics, probs = topic_model.fit_transform(cleaned_text_data)

In [121]:
topics = topic_model.get_topic_info()
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,962,-1_china_security_terrorist_chinese,"[china, security, terrorist, chinese, new, als...","[amazing 2020, electric vehicle (EV) stocks fi..."
1,0,97,0_bri_cooperation_road_belt,"[bri, cooperation, road, belt, countries, deve...","[2021/06/23 June 23, 2021, State Councilor For..."
2,1,74,1_cultural_dunhuang_art_archaeological,"[cultural, dunhuang, art, archaeological, arch...",[* submersible secured 200 cultural relics dat...
3,2,73,2_g7_b3w_infrastructure_bri,"[g7, b3w, infrastructure, bri, initiative, pla...",[Group Seven richest democracies sought Saturd...
4,3,63,3_asean_rcep_china_trade,"[asean, rcep, china, trade, cooperation, econo...","[NANNING, Nov. 27 (Xinhua) -- Chinese Presiden..."
...,...,...,...,...,...
114,113,11,113_zohir_forum_saidzoda_bds,"[zohir, forum, saidzoda, bds, gansu, central, ...","[""Strengthen Solidarity Cooperation Building C..."
115,114,11,114_lianyungang_sco_logistics_table,"[lianyungang, sco, logistics, table, jiangsu, ...","[BEIJING, Aug. 18, 2023 /PRNewswire/ -- 2023 C..."
116,115,11,115_furniture_xianzuo_craftsmanship_classical,"[furniture, xianzuo, craftsmanship, classical,...","[BEIJING, Oct. 20, 2021 /PRNewswire/ -- furnit..."
117,116,11,116_uzbekistan_tashkent_uzbek_temirgali,"[uzbekistan, tashkent, uzbek, temirgali, ferga...",[◀ Back List article Writer Uzbekistan Preside...


In [84]:
topic_model.visualize_heatmap(n_clusters=20)

In [145]:
topic_model.get_topic_info().head(15)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,913,-1_terrorist_china_security_also,"[terrorist, china, security, also, government,...","[Toronto, Ontario--(Newsfile Corp. - March 8, ..."
1,0,106,0_covid_19_health_pandemic,"[covid, 19, health, pandemic, china, africa, v...",[Human civilizations experienced various sever...
2,1,103,1_bri_cooperation_road_belt,"[bri, cooperation, road, belt, countries, init...","[2021/06/23 June 23, 2021, State Councilor For..."
3,2,80,2_asean_china_trade_rcep,"[asean, china, trade, rcep, economic, malaysia...",[(Remarks Chinese Ambassador Huang Xilian Mani...
4,3,74,3_g7_b3w_infrastructure_bri,"[g7, b3w, infrastructure, bri, initiative, pla...",[Group Seven richest democracies sought Saturd...
5,4,66,4_kazakhstan_two_tokayev_cooperation,"[kazakhstan, two, tokayev, cooperation, sides,...","[(Xinhua) 08:17, September 15, 2022 Chinese Pr..."
6,5,60,5_china_nato_chinese_russia,"[china, nato, chinese, russia, us, military, u...","[Like immediate predecessor, Joe Biden committ..."
7,6,56,6_central_asian_asia_summit,"[central, asian, asia, summit, countries, coop...",[Chinese President Xi Jinping chairs first Chi...
8,7,54,7_market_polyamide_report_inc,"[market, polyamide, report, inc, growth, analy...","[""North America Deodorants Market - Growth, Tr..."
9,8,49,8_ap_biden_said_republican,"[ap, biden, said, republican, g7, president, t...",[Associated Press 2022/06/28 22:54 WASHINGTON ...


In [58]:
#### Remove irrelevant topics? Reduce the number of topics ####
reduced_topics = topic_model.reduce_topics(cleaned_text_data, nr_topics=25)

In [59]:
reduced_topics.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,880,-1_china_said_new_global,"[china, said, new, global, chinese, digital, c...","[New York, NY, April 10, 2021 (GLOBE NEWSWIRE)..."
1,0,1482,0_china_cooperation_countries_development,"[china, cooperation, countries, development, r...","[XI'AN, May 19 (Xinhua) -- Chinese President X..."
2,1,556,1_cultural_city_digital_development,"[cultural, city, digital, development, china, ...","[BEIJING, Jan. 3, 2023 /PRNewswire/ -- ongoing..."
3,2,101,2_china_covid_19_year,"[china, covid, 19, year, economy, health, afri...",[© Provided Xinhua huge potential Chinese econ...
4,3,77,3_bitcoin_bank_said_digital,"[bitcoin, bank, said, digital, money, bloomber...","[BRASILIA, March 8 (Reuters) - Brazil Presiden..."
5,4,75,4_india_us_indo_pacific,"[india, us, indo, pacific, afghanistan, china,...","[ANI | Updated: Mar 24, 2022 15:00 IST London ..."
6,5,72,5_brand_xifeng_wuliangye_liquor,"[brand, xifeng, wuliangye, liquor, brands, wor...","[BEIJING , May 12, 2023 /PRNewswire/ -- high-q..."
7,6,72,6_year_bloomberg_company_the,"[year, bloomberg, company, the, said, arm, inc...","[NEW YORK, April 10, 2021 (GLOBE NEWSWIRE) -- ..."
8,7,72,7_2022_prnewswire_new_company,"[2022, prnewswire, new, company, digital, plat...","[BAODING, China, Oct. 13, 2022 /PRNewswire/ --..."
9,8,71,8_asia_2022_asian_chinese,"[asia, 2022, asian, chinese, prnewswire, singa...","[BEIJING , June 8, 2023 /PRNewswire/ -- survey..."


In [None]:
# Things to try out

# Reduce the number of topics
#topic.model.reduce_topics(xxx, nr_topics=25)

# Find specific topics
#topic_model.find_topics("DSR", top_n=1)

