In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Plotting library
import matplotlib.pyplot as plt

# Topic modelling
from umap import UMAP
from bertopic import BERTopic
from hdbscan import HDBSCAN

# Removing stop words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

In [None]:
# Read the IPCC-mentions CSV and pull its `text` column out as a list of documents.
# NOTE(review): this is an absolute, machine-specific path — the cell will not run elsewhere as-is.
link1 = r'/Users/jiaminlim/Documents/Research/climate_nlp/IPCCandPolicy/datasets/df_ipccmentions_body.csv'
df_ipcc_mentions_body = pd.read_csv(link1).assign(
    # Cast to str so every entry handed to the topic model is a string.
    text=lambda frame: frame['text'].astype('str')
)
mentions = df_ipcc_mentions_body['text'].tolist()
len(mentions)

In [None]:
# Read the IPCC report CSV and pull its `text` column out as a list of documents.
# NOTE(review): this is an absolute, machine-specific path — the cell will not run elsewhere as-is.
link2 = r'/Users/jiaminlim/Documents/Research/climate_nlp/IPCCandPolicy/datasets/df_ipccreport_body.csv'
df_ipccreport = pd.read_csv(link2)
# Cast to str, as is done for the mentions corpus, so missing values do not
# reach the topic model as float NaN.
df_ipccreport['text'] = df_ipccreport['text'].astype('str')
# na=False: rows with a missing document_name count as non-matches instead of
# putting NaN into the boolean mask, which boolean indexing rejects.
df_ipccreport_sfp = df_ipccreport[
    df_ipccreport['document_name'].str.contains("Summary for Policymakers", na=False)
]
# The topic model is fitted on the full report, not on the SfP subset above.
report = df_ipccreport.text.to_list()
len(report)

In [None]:
# Corpus-specific words to drop in addition to scikit-learn's English stop-word list.
my_additional_stop_words = {
    'et', 'al', 'institute', 'university', 'climate', 'change', 'box',
    'figure', 'table', 'ipcc', 'emission', 'emissions', 'guidelines',
}
# CountVectorizer is given this as a plain list.
stop_words_custom = list(text.ENGLISH_STOP_WORDS | my_additional_stop_words)

In [None]:
# Shared UMAP dimensionality-reduction step, reused by every BERTopic model below.
# random_state is fixed following the BERTopic FAQ:
# https://maartengr.github.io/BERTopic/faq.html
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [None]:
def create_topics(no_clusters, stopword_removal, text):

    hdbscan_model = HDBSCAN(min_cluster_size=100, metric='euclidean',
                        cluster_selection_method='eom', prediction_data=True, min_samples=5)

    match stopword_removal:
        case "CountVectorizer":
            vectorizer_model = CountVectorizer(stop_words=stop_words_custom)
            model = BERTopic(verbose=True,
                             nr_topics=no_clusters,
                             vectorizer_model=vectorizer_model,
                             hdbscan_model=hdbscan_model,
                             calculate_probabilities=True,
                             umap_model=umap_model)

        case "ClassTfidfTransformer":
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
            model = BERTopic(ctfidf_model=ctfidf_model,
                             nr_topics=no_clusters,
                             hdbscan_model=hdbscan_model,
                             calculate_probabilities=True,
                             umap_model=umap_model)

        case "KeyBERTInspired":
            representation_model = KeyBERTInspired()
            model30_nostopwords3 = BERTopic(representation_model=representation_model,
                                            nr_topics=no_clusters,
                                            hdbscan_model=hdbscan_model,
                                            calculate_probabilities=True,
                                            umap_model=umap_model)

    topics, probs = model.fit_transform(text)
    return topics, probs, model

In [None]:
def add_topics_to_df(topics, probs, model, df_text, text_list):
    """Reassign outlier documents and attach topic ids and names to each row.

    Parameters
    ----------
    topics
        Topic assignment per document, as returned by ``fit_transform``.
    probs
        Topic probabilities, passed to ``model.reduce_outliers``.
    model
        Fitted topic model providing ``reduce_outliers`` and ``get_topic_info``.
    df_text : pandas.DataFrame
        Source frame, one row per document, in the same order as ``text_list``.
    text_list : list
        The documents the model was fitted on.

    Returns
    -------
    tuple
        ``(topics_old, topics_new, df_full)``: ``value_counts`` of the topic
        assignments before and after outlier reassignment, and ``df_text``
        with ``text_dup``, ``Topic`` and ``Name`` columns added.

    Raises
    ------
    ValueError
        If ``text_list`` / the reassigned topics do not have one entry per row
        of ``df_text``.
    """
    # Reduce outliers by using probabilities to assign them to topics
    new_topics = model.reduce_outliers(text_list, topics, probabilities=probs, strategy="probabilities")
    topics_old = pd.DataFrame(topics).value_counts()
    topics_new = pd.DataFrame(new_topics).value_counts()

    # Build the per-document topic frame on df_text's own index: concat aligns
    # on index labels, so a fresh 0..n-1 index would silently drop or mispair
    # rows whenever df_text has been filtered or re-indexed.
    df_doctopic = pd.DataFrame(
        {"text_dup": text_list, "Topic": new_topics},
        index=df_text.index,
    )
    df_full = pd.concat([df_text, df_doctopic], axis=1, join='inner')

    # Merge in topic name into full df
    df_topicnames = model.get_topic_info()[['Topic', 'Name']]
    df_full = df_full.merge(df_topicnames, on='Topic', how='left')
    return topics_old, topics_new, df_full

## IPCC mentions body

In [None]:
# Fit the topic model on the mentions corpus and show the top terms per topic
topics, probs, model = create_topics(
    no_clusters="auto",
    stopword_removal="CountVectorizer",
    text=mentions,
)
model.visualize_barchart()

In [None]:
# Get representative documents from the model fitted on the mentions corpus;
# the bare name on the last line displays them as the cell output.
representative_docs = model.get_representative_docs()
representative_docs

In [None]:
# Save the representative documents of the mentions model.
pd.DataFrame(data=representative_docs).to_csv(
    path_or_buf="mentions_topics_final_reps_fixed.csv"
)

In [None]:
# Attach a topic id and topic name to every row of the mentions dataframe
topics_old, topics_new, df_full_wtopics = add_topics_to_df(
    topics=topics,
    probs=probs,
    model=model,
    df_text=df_ipcc_mentions_body,
    text_list=mentions,
)

In [None]:
# Save topic counts side by side: before vs. after outlier reclassification
topics_oldandnew = pd.merge(
    left=topics_old,
    right=topics_new,
    how='left',
    left_index=True,
    right_index=True,
)
topics_oldandnew.to_csv(path_or_buf='topic_dist.csv')

In [None]:
# Save the mentions dataframe with its topic labels.
df_full_wtopics.to_csv(path_or_buf="mentions_topics_final_fixed.csv")

## IPCC report

In [None]:
# Fit the topic model on the IPCC report corpus and show the top terms per topic
# (rebinds `topics`, `probs` and `model` from the mentions section).
topics, probs, model = create_topics(
    no_clusters="auto",
    stopword_removal="CountVectorizer",
    text=report,
)
model.visualize_barchart()

In [None]:
# Get representative docs from the model fitted on the IPCC report corpus;
# the bare name on the last line displays them as the cell output.
representative_docs = model.get_representative_docs()
representative_docs

In [None]:
# Save the representative documents of the report model.
pd.DataFrame(data=representative_docs).to_csv(
    path_or_buf="ipcc_topics_final_reps_fixed.csv"
)

In [None]:
# Label every report row with its topic, then save the topic counts from
# before and after outlier reassignment side by side.
topics_old, topics_new, df_full_wtopics = add_topics_to_df(
    topics=topics,
    probs=probs,
    model=model,
    df_text=df_ipccreport,
    text_list=report,
)
topics_oldandnew = pd.merge(
    left=topics_old,
    right=topics_new,
    how='left',
    left_index=True,
    right_index=True,
)
topics_oldandnew.to_csv(path_or_buf='ipcc_topic_dist.csv')

In [None]:
# Save the report dataframe with its topic labels.
df_full_wtopics.to_csv(path_or_buf="ipcc_topics_final_fixed.csv")

## Combined corpus

In [None]:
# Stack the report rows on top of the mentions rows under a fresh 0..n-1 index
df_comb = pd.concat(
    [df_ipccreport, df_ipcc_mentions_body],
    ignore_index=True,
    sort=False,
)
# Two-way document type: keep "IPCC Report" as is, relabel everything else.
df_comb["doc_type_major"] = df_comb["types0"].mask(
    df_comb["types0"] != "IPCC Report",
    other='Country Policy Reports',
)

In [None]:
# Documents of the combined corpus, in row order.
list_comb = df_comb["text"].tolist()

In [None]:
# Fit the topic model on the combined corpus and show the top terms per topic
# (rebinds `topics`, `probs` and `model` from the previous section).
topics, probs, model = create_topics(
    no_clusters="auto",
    stopword_removal="CountVectorizer",
    text=list_comb,
)
model.visualize_barchart()

In [None]:
# Attach a topic id and topic name to every row of the combined dataframe
topics_old, topics_new, df_full_comb = add_topics_to_df(
    topics=topics,
    probs=probs,
    model=model,
    df_text=df_comb,
    text_list=list_comb,
)

In [None]:
# Save topic counts side by side: before vs. after outlier reclassification
topics_oldandnew = pd.merge(
    left=topics_old,
    right=topics_new,
    how='left',
    left_index=True,
    right_index=True,
)
topics_oldandnew.to_csv(path_or_buf='combined_topic_dist_20240521.csv')

In [None]:
# Get the representative documents of the combined-corpus model and save them.
# (The bare `representative_docs` line that sat mid-cell was removed: only a
# cell's last expression is displayed, so it had no effect.)
representative_docs = model.get_representative_docs()
pd.DataFrame(representative_docs).to_csv("combined_topics_final_reps_fixed.csv")

In [None]:
# Save the combined dataframe with its topic labels.
df_full_comb.to_csv(path_or_buf="combined_topics_final_fixed.csv")

### STM
- Analysing the output from STM and comparing it to BERTopic

In [None]:
# Load the STM results (from 5000 samples) and the table of STM topic labels
topics_STM = pd.read_csv(filepath_or_buffer="topics_STM.csv")
topics_STM_labels = pd.read_csv(filepath_or_buffer="topics_STM_labels.csv")

In [None]:
# Number of STM rows per document type.
topics_STM.loc[:, 'doc_type_major'].value_counts()

In [None]:
# Rename the label table's columns so it can be joined on `topic`, then attach
# a topic name to every STM row.
# NOTE: both names are rebound in place, so re-running this cell merges again.
topics_STM_labels = topics_STM_labels.rename(
    columns={'Unnamed: 0': 'topic', 'x': 'topic_name'}
)
topics_STM = pd.merge(topics_STM, topics_STM_labels, how='left', on='topic')

In [None]:
# Names of the ten STM topics with the most rows.
# groupby() orders groups by key, so `.size().head(10)` returned the first ten
# topic names in sorted order rather than the ten largest; nlargest(10) ranks
# by group size instead.
top10topics = (
    topics_STM.groupby(['topic_name'])['text']
    .size()
    .nlargest(10)
    .index
    .tolist()
)

In [None]:
# Keep only the rows whose topic is in `top10topics` (rebinds `topics_STM`).
topics_STM = topics_STM.loc[topics_STM['topic_name'].isin(top10topics)]

In [None]:
# Stacked bar chart: one bar per topic, split by document type.
(
    topics_STM
    .groupby(['topic_name', 'doc_type_major'])['text']
    .size()
    .unstack()
    .plot(kind='bar', stacked=True)
)