In [1]:
import pandas as pd
import re

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
import nltk

# Download the NLTK resources requested by this notebook (tokenizer data and
# the stop-word corpus); nltk reports "already up-to-date" when they exist.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Casa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Casa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Start from NLTK's English stop-word list, de-duplicated into a set so that
# extra entries can be merged in afterwards.
english_stop_words = stopwords.words('english')
my_stop_words = set(english_stop_words)

In [7]:
# Domain-specific terms (years and the survey topic name itself) to exclude
# in addition to the generic English stop words.
additional_words = [
    "2020", "2021", "2022", "2023", "2042", "2010",
    "non-state", "conflicts", "conflict",
    "non state conflicts", "non state",
]

# Rebuild the set before merging so this cell stays re-runnable even after
# my_stop_words has been rebound to a list further down the notebook
# (list has no .update(), which made a second execution fail).
my_stop_words = set(my_stop_words)
my_stop_words.update(additional_words)

In [8]:
# Materialise the stop-word set as a plain list.
my_stop_words = [*my_stop_words]

In [9]:
# Load the raw survey comments; the preview below shows the columns
# "Unnamed: 0", "Topic" and "Comment".
comments = pd.read_csv(filepath_or_buffer='data/input/comments.csv')

In [10]:
# Preview the first rows to check the columns that were loaded.
comments.head()

Unnamed: 0,Topic,Comment
0,Non.State.Conflicts_comment_1,Each year the average goes up slightly. Going...
1,Non.State.Conflicts_comment_1,Honestly I was just trying to base an opinion...
2,Non.State.Conflicts_comment_1,Based on the ranges of the chart above it is a...
3,Non.State.Conflicts_comment_1,I just observed how the graph went up and down...
4,Non.State.Conflicts_comment_1,I was looking at the average of the entries.


In [11]:
# Topic-name terms that could be stripped from the comments.
# NOTE(review): in the cells visible here this list is only referenced by
# commented-out code in preprocess() - confirm whether it is still needed.
words_to_remove = [
    'non-state',
    'conflicts',
    'conflict',
]

In [12]:
def preprocess(text):
    """Replace every http/https URL in ``text`` with the placeholder ``URL``.

    Parameters
    ----------
    text : str
        A single comment.

    Returns
    -------
    str
        The comment with each matched URL substituted by the literal
        string ``'URL'``; text without URLs is returned unchanged.
    """
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        flags=re.MULTILINE,
    )
    return url_pattern.sub('URL', text)

In [13]:
# Run the URL replacement over every comment and store the result back in
# the same column.
comments['Comment'] = comments['Comment'].map(preprocess)

In [14]:
# Keep only the rows belonging to the non-state-conflicts comment field.
is_non_state = comments['Topic'] == 'Non.State.Conflicts_comment_1'
df_ns = comments.loc[is_non_state]

In [15]:
# Documents for embedding and topic modelling, as a plain list of strings.
# A pandas Series would carry the row labels of the filtered frame; passing a
# list avoids label-based lookups when the downstream libraries index the
# documents by position.
comments_lst = df_ns.Comment.tolist()

In [16]:
# Pre-calculate the document embeddings once so they can be handed to
# BERTopic instead of being computed during fitting.
# Alternative model tried previously: "all-mpnet-base-v2".
embedding_model = SentenceTransformer("sentence-transformers/sentence-t5-base")
embeddings = embedding_model.encode(
    sentences=comments_lst,
    show_progress_bar=True,
)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

In [17]:
from bertopic.vectorizers import ClassTfidfTransformer

# c-TF-IDF weighting with BM25 and frequent-word reduction switched on.
# NOTE(review): the ctfidf_model argument is commented out where BERTopic is
# built below, so this object is not used in the cells visible here.
ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=True,
    reduce_frequent_words=True,
)


In [18]:
from umap import UMAP

# Dimensionality reduction applied to the embeddings before clustering:
# 5 output components, cosine metric, and a fixed random_state so repeated
# runs use the same seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0,
    metric='cosine',
    random_state=42,
)

In [19]:
from hdbscan import HDBSCAN

# Density-based clustering of the reduced embeddings. prediction_data=True
# is kept so the fitted model stores the data needed for later predictions.
hdbscan_model = HDBSCAN(
    min_cluster_size=7,
    min_samples=1,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
    gen_min_span_tree=True,
)


In [20]:
from sklearn.cluster import KMeans

# Fixed-size alternative to HDBSCAN with 20 clusters.
# NOTE(review): cluster_model is not passed to BERTopic in the cells visible
# here (hdbscan_model is used), and no random_state is set - confirm before
# wiring it in.
cluster_model = KMeans(
    n_clusters=20,
)

In [21]:
# Bag-of-words model used to build the topic representations: uni- to
# tri-grams, dropping terms found in more than 90% or fewer than 5 documents.
# The custom stop-word list built at the top of the notebook is now passed in;
# it was prepared (and converted to a list) but never handed to any model.
# CountVectorizer matches stop words against single tokens, so the hyphenated
# and multi-word entries in my_stop_words will not match as written and
# scikit-learn may warn about them.
vectorizer_model = CountVectorizer(
    stop_words=my_stop_words,
    ngram_range=(1, 3),
    max_df=0.9,
    min_df=5,
)

In [22]:
import os

import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
# SECURITY: the API key must never be stored in the notebook. It is read from
# the OPENAI_API_KEY environment variable instead. The key that was previously
# hardcoded in this cell has been exposed and should be revoked and rotated.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
client = openai.OpenAI(api_key=openai_api_key)

# Prompt template; [DOCUMENTS] and [KEYWORDS] are placeholders that BERTopic
# fills in for each topic.
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models; each key becomes an additional topic
# representation alongside the default one.
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,  # Comment out to skip the OpenAI labels
    "MMR": mmr_model,
    "POS": pos_model
}


In [None]:

# Collect the BERTopic settings in one place, then build the model from them.
bertopic_settings = dict(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=5,
    verbose=True,
)
topic_model = BERTopic(**bertopic_settings)

# Train the model on the comments, reusing the pre-computed embeddings.
topics, probs = topic_model.fit_transform(comments_lst, embeddings=embeddings)

2024-02-11 20:12:10,325 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-11 20:12:17,913 - BERTopic - Dimensionality - Completed ✓
2024-02-11 20:12:17,914 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-11 20:12:17,937 - BERTopic - Cluster - Completed ✓
2024-02-11 20:12:17,941 - BERTopic - Representation - Extracting topics from clusters using representation models.
 81%|████████████████████████████████████████████████████████████████▎              | 22/27 [2:29:59<28:46, 345.36s/it]

In [None]:
# Summary table of the fitted topics, kept for display and export below.
topic_info = topic_model.get_topic_info()

In [None]:
# Display the topic summary table.
topic_info

In [None]:
# Export the topic summary. index=False matches the document-level export at
# the end of the notebook and keeps the DataFrame index out of the file
# (otherwise it reappears as an "Unnamed: 0" column when the CSV is read back).
topic_info.to_csv(
    'data/output/nonstate_topic_info_sentence_t5_hdbscan_large.csv',
    index=False,
)

In [None]:
# Plot the documents coloured by topic. Passing the pre-computed embeddings
# stops BERTopic from re-embedding every comment just for the plot.
topic_model.visualize_documents(comments_lst, embeddings=embeddings)

In [None]:
# Per-document table from the fitted model, exported at the end of the notebook.
doc_info = topic_model.get_document_info(docs=comments_lst)

In [None]:
# Render BERTopic's built-in topic visualization for the fitted model.
topic_model.visualize_topics()

In [None]:
# Export the per-document table without the DataFrame index.
doc_info.to_csv(
    path_or_buf='data/output/non_state_conflicts_sentence_t5_hdbscan_large.csv',
    index=False,
)