# BERTopic
BERTopic is a topic modeling technique that leverages 🤗 transformers and a custom class-based TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions (Grootendorst, 2022).

In [14]:
import pandas as pd
import re, os, string, json
from numpy import array
from pprint import pprint
import codecs
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


from wordcloud import WordCloud

import matplotlib
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import SyllableTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import plotly.figure_factory as ff
import umap.umap_ as UMAP
from bertopic import BERTopic

In [2]:
# Load the ESG report corpus.
# The CSV location can be overridden with the ESG_CORPUS_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original absolute path is used unchanged.
corpus_csv_path = os.environ.get(
    "ESG_CORPUS_CSV",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/temp/esg_corpus_lda.csv",
)
corpus = pd.read_csv(corpus_csv_path)

In [3]:
# Display the column labels of the loaded corpus.
corpus.columns

Index(['release_time', 'identifier', 'fiscal_year', 'stock_code', 'en_name',
       'cn_name', 'ind_code1', 'ind_name1', 'ind_code2', 'ind_name2',
       'ind_code3', 'ind_name3', 'esg_report'],
      dtype='object')

# Full sample

In [19]:
# English stop-word list from NLTK.
# NOTE(review): `sw` is not referenced by any other cell visible in this notebook.
sw = stopwords.words("english")

# Full sample: every corpus row that actually has ESG report text.
full_sample = corpus[corpus["esg_report"].notna()]

In [21]:
# Plain Python list of report texts, the input format passed to fit_transform below.
docs = full_sample["esg_report"].tolist()

In [24]:
# NOTE(review): the original comment here says the stopwords have already been
# removed; no cell shown in this notebook removes them, so presumably that was
# done when the input CSV was prepared -- confirm.

# UMAP configuration handed to BERTopic as its `umap_model`; random_state is
# fixed so repeated runs use the same seed.
umap_model = UMAP.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.01,
    metric='cosine',
    random_state=1,
)

# Fit on the full sample with min_topic_size=10.
topic_model = BERTopic(
    umap_model=umap_model,
    min_topic_size=10,
    verbose=True,
)
topics, probs = topic_model.fit_transform(docs)

2023-04-14 18:47:02,703 : INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-04-14 18:47:03,640 : INFO : Use pytorch device: cpu
Batches: 100%|██████████| 189/189 [24:45<00:00,  7.86s/it]
2023-04-14 19:12:50,670 - BERTopic - Transformed documents to Embeddings
2023-04-14 19:14:25,281 - BERTopic - Reduced dimensionality
2023-04-14 19:14:26,682 - BERTopic - Clustered reduced embeddings


In [75]:
# First 22 rows of the topic summary table (row 0 is the -1 outlier topic).
topic_model.get_topic_info().head(22)

Unnamed: 0,Topic,Count,Name
0,-1,2844,-1_group_environmental_social_report
1,0,132,0_bank_financial_credit_banking
2,1,127,1_pharmaceutical_drug_medical_shanghai
3,2,126,2_food_group_hei_safety
4,3,112,3_kingdom_group_labour_linen
5,4,95,4_company_china_leasing_financial
6,5,86,5_production_group_environmental_management
7,6,83,6_power_wind_energy_new
8,7,80,7_group_management_environmental_governance
9,8,63,8_property_group_service_social


In [81]:
# Bar charts of the top 10 words for the first 20 topics of the full-sample model.
topic_model.visualize_barchart(top_n_topics=20, n_words=10)

In [36]:
# Topic heatmap for the full-sample model.
# NOTE: rendering this figure in the notebook needs nbformat>=4.2.0 installed;
# the recorded run of this cell failed with
# "ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed".
topic_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [71]:
# Reload the persisted full-sample model instead of refitting it.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
# NOTE(review): in notebook order this load comes before the cell that saves
# "full_sample_bertopic_100", so a fresh top-to-bottom run needs that file to
# exist already.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model = BERTopic.load(os.path.join(bertopic_model_dir, "full_sample_bertopic_100"))

In [72]:
# Hierarchy plot limited to the first 50 topics of the full-sample model.
topic_model.visualize_hierarchy(top_n_topics=50)

In [45]:
# Persist the full-sample model so it can be reloaded without refitting.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model.save(os.path.join(bertopic_model_dir, "full_sample_bertopic_100"))

Try different parameters

In [37]:
# Same UMAP settings as the first full-sample run.
umap_model = UMAP.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.01,
    metric='cosine',
    random_state=1,
)

# Second full-sample model: a larger minimum topic size (30) and
# 10 words kept per topic.
topic_model_2 = BERTopic(
    umap_model=umap_model,
    min_topic_size=30,
    top_n_words=10,
    verbose=True,
)
topics_2, probs_2 = topic_model_2.fit_transform(docs)

2023-04-14 19:37:43,242 : INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-04-14 19:37:45,829 : INFO : Use pytorch device: cpu
Batches: 100%|██████████| 189/189 [20:13<00:00,  6.42s/it]
2023-04-14 19:58:32,559 - BERTopic - Transformed documents to Embeddings
2023-04-14 19:58:50,956 - BERTopic - Reduced dimensionality
2023-04-14 19:58:51,452 - BERTopic - Clustered reduced embeddings


In [42]:
# Topic summary table for the min_topic_size=30 model.
topic_model_2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2492,-1_group_management_environmental_report
1,0,749,0_group_environmental_social_report
2,1,629,1_group_environmental_social_management
3,2,278,2_management_development_report_company
4,3,276,3_company_china_management_insurance
5,4,219,4_pharmaceutical_group_management_medical
6,5,216,5_group_environmental_social_waste
7,6,165,6_power_energy_management_development
8,7,150,7_food_group_safety_environmental
9,8,119,8_bank_financial_credit_banking


- min_topic_size = 10 yields 100 topics
- min_topic_size = 30 yields 22 topics (x) — not good

In [85]:
# Reload the persisted min_topic_size=30 model and plot its top words.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
# NOTE(review): in notebook order this load comes before the cell that saves
# "full_sample_bertopic_20", so a fresh top-to-bottom run needs that file to
# exist already.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model_2 = BERTopic.load(os.path.join(bertopic_model_dir, "full_sample_bertopic_20"))
topic_model_2.visualize_barchart(top_n_topics=20, n_words=20)

In [86]:
# Hierarchy plot limited to the first 20 topics of the min_topic_size=30 model.
topic_model_2.visualize_hierarchy(top_n_topics=20)

In [87]:
# Topic heatmap for the min_topic_size=30 model.
topic_model_2.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [51]:
# Persist the min_topic_size=30 model so it can be reloaded without refitting.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model_2.save(os.path.join(bertopic_model_dir, "full_sample_bertopic_20"))

What if `min_topic_size` is left unset?

In [57]:
# Same UMAP settings as the earlier full-sample runs.
umap_model = UMAP.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.01,
    metric='cosine',
    random_state=1,
)

# Third full-sample model: min_topic_size is not passed, so BERTopic's own
# default applies.
topic_model_3 = BERTopic(
    umap_model=umap_model,
    top_n_words=10,
    verbose=True,
)
topics_3, probs_3 = topic_model_3.fit_transform(docs)

2023-04-14 20:21:49,465 : INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-04-14 20:21:50,220 : INFO : Use pytorch device: cpu
Batches: 100%|██████████| 189/189 [13:28<00:00,  4.28s/it]
2023-04-14 20:35:42,422 - BERTopic - Transformed documents to Embeddings
2023-04-14 20:36:08,315 - BERTopic - Reduced dimensionality
2023-04-14 20:36:08,993 - BERTopic - Clustered reduced embeddings


In [58]:
# Topic summary table for the model fitted without an explicit min_topic_size.
topic_model_3.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2844,-1_group_environmental_social_report
1,0,132,0_bank_financial_credit_banking
2,1,127,1_pharmaceutical_drug_medical_shanghai
3,2,126,2_food_group_hei_safety
4,3,112,3_kingdom_group_labour_linen
...,...,...,...
96,95,11,95_kin_wing_construction_alliance
97,96,11,96_offshore_office_na_oil
98,97,11,97_group_aspect_relevant_governance
99,98,11,98_group_ordinance_environmental_social


Similar results to the min_topic_size=10 run (10 is BERTopic's default value for `min_topic_size`).

In [59]:
# Persist the default-min_topic_size model so it can be reloaded without refitting.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model_3.save(os.path.join(bertopic_model_dir, "full_sample_bertopic_100_2"))

# By industry

In [4]:
# Number of corpus rows per top-level industry name.
corpus["ind_name1"].value_counts()

ind_name1
非必需性消費    1533
地產建築      1038
工業         810
金融         548
資訊科技       424
醫療保健       384
原材料        372
必需性消費      312
公用事業       292
能源         223
電訊          53
綜合企業        42
Name: count, dtype: int64

## Finance

In [5]:
# Finance subsample: rows whose top-level industry name is "金融" and that
# have ESG report text.
fina_sample = corpus.loc[
    (corpus["ind_name1"] == "金融") & corpus["esg_report"].notna()
]

In [6]:
# Plain Python list of the finance report texts for BERTopic.
fina_docs = fina_sample["esg_report"].tolist()

In [45]:
# UMAP for the finance subsample; n_neighbors is 8 here (15 in the
# full-sample runs), the other settings are unchanged.
umap_model = UMAP.UMAP(
    n_neighbors=8,
    n_components=5,
    min_dist=0.01,
    metric='cosine',
    random_state=1,
)

# Finance model: minimum topic size 5, 10 words kept per topic.
topic_model_fina = BERTopic(
    umap_model=umap_model,
    min_topic_size=5,
    top_n_words=10,
    verbose=True,
)
topics_fina, probs_fina = topic_model_fina.fit_transform(fina_docs)

2023-04-15 01:37:37,464 : INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-04-15 01:37:38,524 : INFO : Use pytorch device: cpu


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

2023-04-15 01:38:52,079 - BERTopic - Transformed documents to Embeddings
2023-04-15 01:38:57,858 - BERTopic - Reduced dimensionality
2023-04-15 01:38:57,898 - BERTopic - Clustered reduced embeddings


In [46]:
# Topic summary table for the finance model.
topic_model_fina.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,140,-1_management_company_business_group
1,0,57,0_bank_management_financial_green
2,1,28,1_insurance_china_life_company
3,2,28,2_group_report_governance_environmental
4,3,21,3_group_governance_environmental_social
5,4,20,4_group_environmental_governance_social
6,5,19,5_group_social_governance_environmental
7,6,17,6_hong_report_development_performance
8,7,14,7_group_environmental_hong_social
9,8,13,8_china_company_management_asset


In [78]:
# Hierarchy plot limited to the first 33 topics of the finance model.
topic_model_fina.visualize_hierarchy(top_n_topics=33)

In [77]:
# Bar charts of the top 10 words for the first 20 finance topics.
topic_model_fina.visualize_barchart(top_n_topics=20, n_words=10)

In [82]:
# Persist the finance model so it can be reloaded without refitting.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model_fina.save(os.path.join(bertopic_model_dir, "fina_bertopic"))

## Real Estate

In [37]:
# Real-estate subsample: rows whose top-level industry name is "地產建築" and
# that have ESG report text.
re_sample = corpus.loc[
    (corpus["ind_name1"] == "地產建築") & corpus["esg_report"].notna()
]

# Plain Python list of the report texts for BERTopic.
re_doc = re_sample["esg_report"].tolist()

# UMAP for this subsample; n_neighbors is 5 here, the other settings match
# the full-sample runs.
umap_model = UMAP.UMAP(
    n_neighbors=5,
    n_components=5,
    min_dist=0.01,
    metric='cosine',
    random_state=1,
)

# Real-estate model: minimum topic size 10, 10 words kept per topic.
topic_model_re = BERTopic(
    umap_model=umap_model,
    min_topic_size=10,
    top_n_words=10,
    verbose=True,
)
topics_re, probs_re = topic_model_re.fit_transform(re_doc)

2023-04-15 01:24:10,675 : INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-04-15 01:24:11,261 : INFO : Use pytorch device: cpu


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

2023-04-15 01:26:18,330 - BERTopic - Transformed documents to Embeddings
2023-04-15 01:26:27,234 - BERTopic - Reduced dimensionality
2023-04-15 01:26:27,348 - BERTopic - Clustered reduced embeddings


In [38]:
# Topic summary table for the real-estate model.
topic_model_re.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,225,-1_group_management_environmental_social
1,0,336,0_group_management_environmental_development
2,1,88,1_group_management_property_environmental
3,2,50,2_group_safety_environmental_construction
4,3,48,3_group_environmental_social_governance
5,4,37,4_group_social_environmental_governance
6,5,36,5_hong_land_management_report
7,6,27,6_management_development_responsibility_china
8,7,26,7_cement_production_china_company
9,8,24,8_service_management_group_social


In [79]:
# Hierarchy plot limited to the first 18 topics of the real-estate model.
topic_model_re.visualize_hierarchy(top_n_topics=18)

In [80]:
# Bar charts of the top 10 words for the first 19 real-estate topics.
topic_model_re.visualize_barchart(top_n_topics=19, n_words=10)

In [83]:
# Persist the real-estate model so it can be reloaded without refitting.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model_re.save(os.path.join(bertopic_model_dir, "re_bertopic"))

## IT

In [64]:
# Information technology subsample: rows whose top-level industry name is
# "資訊科技" and that have ESG report text.
it_sample = corpus.loc[
    (corpus["ind_name1"] == "資訊科技") & corpus["esg_report"].notna()
]

# Plain Python list of the report texts for BERTopic.
it_doc = it_sample["esg_report"].tolist()

# UMAP for this subsample; the other settings match the full-sample runs.
# NOTE(review): n_neighbors=2 is the smallest neighbourhood used anywhere in
# this notebook (15 / 8 / 5 elsewhere) -- confirm this is intentional.
umap_model = UMAP.UMAP(
    n_neighbors=2,
    n_components=5,
    min_dist=0.01,
    metric='cosine',
    random_state=1,
)

# IT model: minimum topic size 7, 10 words kept per topic.
topic_model_it = BERTopic(
    umap_model=umap_model,
    min_topic_size=7,
    top_n_words=10,
    verbose=True,
)
topics_it, probs_it = topic_model_it.fit_transform(it_doc)


2023-04-15 02:13:00,039 : INFO : Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-04-15 02:13:01,144 : INFO : Use pytorch device: cpu


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

2023-04-15 02:13:51,002 - BERTopic - Transformed documents to Embeddings
2023-04-15 02:13:53,322 - BERTopic - Reduced dimensionality
2023-04-15 02:13:53,368 - BERTopic - Clustered reduced embeddings


In [65]:
# Topic summary table for the IT model.
topic_model_it.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,166,-1_management_group_environmental_report
1,0,30,0_group_social_environmental_governance
2,1,21,1_group_social_environmental_governance
3,2,18,2_company_cloud_management_group
4,3,16,3_group_social_environmental_business
5,4,16,4_management_data_compliance_business
6,5,15,5_group_social_environmental_report
7,6,13,6_supply_global_chain_metrics
8,7,13,7_group_hong_financial_report
9,8,11,8_group_social_governance_report


In [66]:
# Hierarchy plot limited to the first 20 topics of the IT model.
topic_model_it.visualize_hierarchy(top_n_topics=20)

In [67]:
# Bar charts of the top 10 words for the first 19 IT topics.
topic_model_it.visualize_barchart(top_n_topics=19, n_words=10)

In [84]:
# Persist the IT model so it can be reloaded without refitting.
# The model directory can be overridden with the ESG_MODEL_DIR environment
# variable; when it is unset, the original absolute directory is used unchanged.
bertopic_model_dir = os.environ.get(
    "ESG_MODEL_DIR",
    "/Users/irene/PycharmProjects/webscraping/ESG/ESG/model/bertopic",
)
topic_model_it.save(os.path.join(bertopic_model_dir, "it_bertopic"))

All the trained models are saved locally for faster access. Just call BERTopic.load("model_path").