In [51]:
import pandas as pd
import numpy as np

from umap import UMAP

# Plotting library
import matplotlib.pyplot as plt

# Topic modelling
from bertopic import BERTopic
from hdbscan import HDBSCAN

# Removing stop words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

In [52]:
# Load the IPCC-mentions paragraphs; every `text` value is cast to str before
# the column is collected into the document list passed to the topic model.
df_ipcc_mentions_body = pd.read_csv('df_ipccmentions_body.csv').assign(
    text=lambda frame: frame['text'].astype('str')
)
mentions = df_ipcc_mentions_body['text'].tolist()
len(mentions)

18848

In [53]:
# Load the IPCC report paragraphs.
df_ipccreport = pd.read_csv('df_ipccreport_body.csv')
# Rows whose document name contains "Summary for Policymakers".
# NOTE(review): this subset is not referenced again in the visible cells;
# `report` below is built from the full frame -- confirm that is intended.
df_ipccreport_sfp = df_ipccreport.loc[
    df_ipccreport['document_name'].str.contains("Summary for Policymakers")
]
report = df_ipccreport['text'].tolist()
len(report)

16666

In [54]:
# Corpus-specific words removed in addition to sklearn's built-in English
# stop-word list (text.ENGLISH_STOP_WORDS).
my_additional_stop_words = {
    'et', 'al', 'institute', 'university', 'climate', 'change', 'box',
    'figure', 'table', 'ipcc', 'emission', 'emissions', 'guidelines',
}
stop_words_custom = [*text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)]

In [55]:
# UMAP instance handed to every BERTopic model built in create_topics; the
# random_state is pinned. See https://maartengr.github.io/BERTopic/faq.html
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [56]:
def create_topics(no_clusters, stopword_removal, text):

    # Note: min_cluster_size = the minimum size a final cluster can be; min_samples = the minimum number of neighbours to a core point
    hdbscan_model = HDBSCAN(min_cluster_size=100, metric='euclidean',
                        cluster_selection_method='eom', prediction_data=True, min_samples=5)

    match stopword_removal:
        case "CountVectorizer":
            vectorizer_model = CountVectorizer(stop_words=stop_words_custom)
            model = BERTopic(verbose=True,
                             nr_topics=no_clusters,
                             vectorizer_model=vectorizer_model,
                             hdbscan_model=hdbscan_model,
                             calculate_probabilities=True,
                             umap_model=umap_model)

        case "ClassTfidfTransformer":
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            model = BERTopic(ctfidf_model=ctfidf_model,
                             nr_topics=no_clusters,
                             hdbscan_model=hdbscan_model,
                             calculate_probabilities=True,
                             umap_model=umap_model)

        case "KeyBERTInspired":
            representation_model = KeyBERTInspired()
            model30_nostopwords3 = BERTopic(representation_model=representation_model,
                                            nr_topics=no_clusters,
                                            hdbscan_model=hdbscan_model,
                                            calculate_probabilities=True,
                                            umap_model=umap_model)

    topics, probs = model.fit_transform(text)
    return topics, probs, model

In [82]:
def add_topics_to_df(topics, probs, model, df_text, text_list):
    """Reassign outlier documents and attach topic ids/names to ``df_text``.

    Parameters
    ----------
    topics : list
        Topic id per document, as returned by ``model.fit_transform``.
    probs : array-like
        Passed to ``model.reduce_outliers`` as ``probabilities``.
    model : fitted topic model
        Must provide ``reduce_outliers`` and ``get_topic_info``.
    df_text : pandas.DataFrame
        One row per document, in the same order as ``text_list``.
    text_list : list of str
        The documents the model was fitted on.

    Returns
    -------
    tuple
        ``(topics_old, topics_new, df_full)``: value counts of the topic ids
        before and after outlier reduction, and ``df_text`` extended with the
        columns ``text_dup``, ``Topic`` and ``Name``.

    Raises
    ------
    ValueError
        If ``df_text`` and ``text_list`` do not have the same length.
    """
    if len(df_text) != len(text_list):
        raise ValueError(
            f"df_text has {len(df_text)} rows but text_list has "
            f"{len(text_list)} documents; they must correspond one-to-one"
        )

    # Reduce outliers by using probabilities to assign them to topics
    new_topics = model.reduce_outliers(text_list, topics, probabilities=probs, strategy="probabilities")
    topics_old = pd.DataFrame(topics).value_counts()
    topics_new = pd.DataFrame(new_topics).value_counts()

    # Create new topic dataframe and attach it to the full df. Rows are matched
    # by position: df_text's own index is dropped first so that a filtered or
    # otherwise non-0..n-1 index cannot drop or misalign documents. The merge
    # below produces a fresh index anyway, so no index information is lost.
    df_doctopic = pd.DataFrame({"text_dup": text_list, "Topic": new_topics})
    df_full = pd.concat([df_text.reset_index(drop=True), df_doctopic], axis=1, join='inner')

    # Merge in topic name into full df
    df_topicnames = model.get_topic_info()[['Topic', 'Name']]
    df_full = df_full.merge(df_topicnames, on='Topic', how='left')
    return topics_old, topics_new, df_full


## IPCC mentions body

In [None]:
# Fit on the mentions corpus with nr_topics="auto" and the custom stop words.
topics, probs, model = create_topics(
    no_clusters="auto",
    stopword_removal="CountVectorizer",
    text=mentions,
)
model.visualize_barchart()

In [None]:
# Representative documents as returned by the fitted model; the bare name on
# the last line makes the notebook display them.
representative_docs = model.get_representative_docs()
representative_docs

In [None]:
# Export the representative documents; each dict key becomes a column.
pd.DataFrame(data=representative_docs).to_csv(
    path_or_buf="mentions_topics_final_reps_fixed_20240521.csv"
)

In [None]:
# Reassign outliers and attach topic ids/names to the mentions frame.
topics_old, topics_new, df_full_wtopics = add_topics_to_df(
    topics=topics,
    probs=probs,
    model=model,
    df_text=df_ipcc_mentions_body,
    text_list=mentions,
)

In [None]:
# Topic sizes before and after outlier reassignment, joined on topic id.
topics_oldandnew = pd.merge(
    left=topics_old,
    right=topics_new,
    left_index=True,
    right_index=True,
    how='left',
)
topics_oldandnew.to_csv(path_or_buf='topic_dist_20240521.csv')

In [None]:
# Persist the per-document topic assignments for the mentions corpus.
df_full_wtopics.to_csv(path_or_buf="mentions_topics_final_fixed_20240521.csv")

## IPCC report

In [91]:
# Fit on the IPCC report corpus with nr_topics="auto".
# NOTE(review): rebinds topics/probs/model from the mentions section.
topics, probs, model = create_topics(
    no_clusters="auto",
    stopword_removal="CountVectorizer",
    text=report,
)
model.visualize_barchart()

2024-05-21 08:11:17,284 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/521 [00:00<?, ?it/s]

2024-05-21 08:16:14,845 - BERTopic - Embedding - Completed ✓
2024-05-21 08:16:14,848 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-21 08:16:28,473 - BERTopic - Dimensionality - Completed ✓
2024-05-21 08:16:28,476 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [92]:
# Representative documents of the report model; the displayed dict is keyed by
# topic id (-1 is the first key shown in the output).
representative_docs = model.get_representative_docs()
representative_docs

{-1: ['confidence). The number of people whose livelihood depends on degraded lands has been estimated to be about 1.5 billion worldwide (very low confidence). People in degraded areas who directly depend on natural resources for subsistence, food security and income, including women and youth with limited adaptation options, are especially vulnerable to land degradation and dimate change (high confidence). Land degradation reduces land productivity and increases the workload of managing the land, affecting women disproportionally in some regions. Land degradation and climate change act as threat multipliers for already precarious livelihoods (very high confidence), leaving them highly sensitive to extreme climatic events, with consequences such as poverty and food insecurity (high confidence) and, in some cases, migration, conflict and loss of cultural heritage (low confidence). Changes in vegetation cover and distribution due to climate change increase the risk of land degradation in

In [93]:
# Export the report model's representative documents; each dict key becomes a column.
pd.DataFrame(data=representative_docs).to_csv(
    path_or_buf="ipcc_topics_final_reps_fixed_20240521.csv"
)

In [95]:
# Reassign outliers and attach topic ids/names to the report frame.
topics_old, topics_new, df_full_wtopics = add_topics_to_df(
    topics=topics,
    probs=probs,
    model=model,
    df_text=df_ipccreport,
    text_list=report,
)
# Topic sizes before and after outlier reassignment, joined on topic id.
topics_oldandnew = pd.merge(
    left=topics_old,
    right=topics_new,
    left_index=True,
    right_index=True,
    how='left',
)
topics_oldandnew.to_csv(path_or_buf='ipcc_topic_dist_20240521.csv')

In [96]:
# Persist the per-document topic assignments for the report corpus.
df_full_wtopics.to_csv(path_or_buf="ipcc_topics_final_fixed_20240521.csv")

## Combined corpus

In [58]:
# Stack report and mentions rows into one frame with a fresh 0..n-1 index, then
# label every row whose types0 is not "IPCC Report" as a country policy report.
df_comb = pd.concat(
    [df_ipccreport, df_ipcc_mentions_body],
    ignore_index=True,
    sort=False,
)
df_comb["doc_type_major"] = df_comb["types0"].where(
    df_comb["types0"] == "IPCC Report",
    other='Country Policy Reports',
)

In [59]:
# Documents of the combined corpus, in df_comb row order.
list_comb = df_comb["text"].tolist()

In [60]:
# Fit on the combined corpus with nr_topics="auto".
topics, probs, model = create_topics(
    no_clusters="auto",
    stopword_removal="CountVectorizer",
    text=list_comb,
)
model.visualize_barchart()

2024-05-21 06:57:42,141 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1110 [00:00<?, ?it/s]

2024-05-21 07:06:33,305 - BERTopic - Embedding - Completed ✓
2024-05-21 07:06:33,308 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-21 07:07:00,597 - BERTopic - Dimensionality - Completed ✓
2024-05-21 07:07:00,600 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [83]:
# Reassign outliers and attach topic ids/names to the combined frame.
topics_old, topics_new, df_full_comb = add_topics_to_df(
    topics=topics,
    probs=probs,
    model=model,
    df_text=df_comb,
    text_list=list_comb,
)
# Disabled diagnostic (stacked bar of topic counts by document type):
# df_full_comb["doc_type_major"] = df_full_comb["types0"].where(df_full_comb["types0"] == "IPCC Report", other='Country Policy Reports')
# df_full_comb.groupby(['Name','doc_type_major'])['text'].size().unstack().plot(kind='bar', stacked=True)
# df_full_comb

In [63]:
# Topic sizes before and after outlier reassignment, joined on topic id.
topics_oldandnew = pd.merge(
    left=topics_old,
    right=topics_new,
    left_index=True,
    right_index=True,
    how='left',
)
topics_oldandnew.to_csv(path_or_buf='combined_topic_dist_20240521.csv')

In [None]:
# Fetch and export the combined model's representative documents. (A bare name
# in the middle of a cell displays nothing, so it is not repeated here.)
representative_docs = model.get_representative_docs()
pd.DataFrame(data=representative_docs).to_csv(
    path_or_buf="combined_topics_final_reps_fixed_20240521.csv"
)

In [80]:
# Persist the per-document topic assignments for the combined corpus.
df_full_comb.to_csv(path_or_buf="combined_topics_final_fixed_20240521.csv")

In [86]:
# Refit on the combined corpus with nr_topics fixed at 29.
# NOTE(review): rebinds topics/probs/model from the "auto" run on the same corpus.
topics, probs, model = create_topics(
    no_clusters=29,
    stopword_removal="CountVectorizer",
    text=list_comb,
)
model.visualize_barchart()

2024-05-21 07:41:45,401 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1110 [00:00<?, ?it/s]

2024-05-21 07:50:52,053 - BERTopic - Embedding - Completed ✓
2024-05-21 07:50:52,055 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-21 07:51:19,572 - BERTopic - Dimensionality - Completed ✓
2024-05-21 07:51:19,575 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [87]:
# Reassign outliers and attach topic ids/names for the 29-topic model.
# NOTE(review): rebinds df_full_comb produced by the "auto" run.
topics_old, topics_new, df_full_comb = add_topics_to_df(
    topics=topics,
    probs=probs,
    model=model,
    df_text=df_comb,
    text_list=list_comb,
)
# Disabled diagnostic (stacked bar of topic counts by document type):
# df_full_comb["doc_type_major"] = df_full_comb["types0"].where(df_full_comb["types0"] == "IPCC Report", other='Country Policy Reports')
# df_full_comb.groupby(['Name','doc_type_major'])['text'].size().unstack().plot(kind='bar', stacked=True)
# df_full_comb

In [88]:
# Topic sizes before and after outlier reassignment, joined on topic id.
# NOTE(review): same file name as the "auto" run's export, so that file is
# overwritten -- confirm this is intended.
topics_oldandnew = pd.merge(
    left=topics_old,
    right=topics_new,
    left_index=True,
    right_index=True,
    how='left',
)
topics_oldandnew.to_csv(path_or_buf='combined_topic_dist_20240521.csv')

In [89]:
# Fetch and export the 29-topic model's representative documents. (A bare name
# in the middle of a cell displays nothing, so it is not repeated here.)
# NOTE(review): same file name as the "auto" run's export, so that file is
# overwritten -- confirm this is intended.
representative_docs = model.get_representative_docs()
pd.DataFrame(data=representative_docs).to_csv(
    path_or_buf="combined_topics_final_reps_fixed_20240521.csv"
)

In [90]:
# Persist the 29-topic assignments for the combined corpus.
# NOTE(review): same file name as the "auto" run's export, so that file is
# overwritten -- confirm this is intended.
df_full_comb.to_csv(path_or_buf="combined_topics_final_fixed_20240521.csv")

### STM
- Times run: 1
- Number of topics: 30
- Stopword removal: Yes
- Results: Most minor topics also finely skewed
- Note: For topics that are similar to those from the BERTopic models, the distribution is just as skewed

In [None]:
# Pull in STM results from 5000 samples: per-document topics and topic labels.
topics_STM, topics_STM_labels = (
    pd.read_csv(csv_name)
    for csv_name in ("topics_STM.csv", "topics_STM_labels.csv")
)

In [None]:
# Check the document-type distribution in the sample.
topics_STM.loc[:, 'doc_type_major'].value_counts()

In [None]:
# Give the label table usable column names, then attach the label to each row.
# NOTE(review): both names are rebound in place, so re-running this cell merges
# the labels a second time.
topics_STM_labels = topics_STM_labels.rename(
    columns={'Unnamed: 0': 'topic', 'x': 'topic_name'},
)
topics_STM = pd.merge(topics_STM, topics_STM_labels, how='left', on='topic')

In [None]:
# Names of the 10 topics with the most documents. The group sizes come back
# ordered by topic name, so `.head(10)` would pick the first 10 names
# alphabetically; `.nlargest(10)` picks by size instead.
top10topics = (
    topics_STM.groupby(['topic_name'])['text']
    .size()
    .nlargest(10)
    .index.tolist()
)

In [None]:
# Keep only the rows that belong to the selected topics.
topics_STM = topics_STM.loc[topics_STM['topic_name'].isin(top10topics)]

In [None]:
# Stacked bar chart: row count per STM topic, split by document type.
(
    topics_STM
    .groupby(['topic_name', 'doc_type_major'])['text']
    .size()
    .unstack()
    .plot.bar(stacked=True)
)

In [None]:
# Next:
# Read how other papers process topics
## Manually label all hundreds of topics? Then do % of each?
## Just show top topics

# Create table highlighting findings from topic modelling section
## Table 1: IPCC mentions - topic modelling
## Table 2: IPCC mentions vs. IPCC report - topic modelling and distributions
