In [187]:
import re

import nltk
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Download the NLTK "stopwords" and "punkt" resources (a no-op when already up to date).
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from bertopic.representation import KeyBERTInspired
from better_profanity import profanity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to C:\Users\Yi
[nltk_data]     Jing\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Yi
[nltk_data]     Jing\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the input table; the path is resolved relative to the notebook's working directory.
df1 = pd.read_csv(filepath_or_buffer="../full_data_with_topic_-1.csv")

In [None]:
# Extra, corpus-specific words to discard on top of the generic English list.
custom_stop_words = [
    "hes",
    "shes",
    "singapore",
    "singaporean",
    "sg",
    "singaporeans",
    "man",
    "woman",
]

# NLTK's built-in English stop words.
nltk_stop_words = stopwords.words("english")

# Union of both lists as a set, for fast membership tests.
# NOTE(review): title words come from URL slugs, which carry no apostrophes, so
# NLTK contractions such as "don't" will not match "dont" -- confirm whether
# further custom entries are wanted.
combined_stop_words = set(nltk_stop_words).union(custom_stop_words)

In [190]:
def extract_title_words(link):
    """Return the words of the post-title slug embedded in a Reddit permalink.

    The slug is the path segment that follows ``/comments/<post_id>/``. Its
    underscores are replaced by spaces and the result is split on whitespace.

    Args:
        link: Permalink such as ``/r/sub/comments/abc123/some_title/xyz/``.
            Non-string values (``None``, or the float ``NaN`` pandas uses for a
            missing cell) are accepted and treated as having no title.

    Returns:
        A list of title words, or an empty list when ``link`` is not a string
        or does not contain a ``/comments/<id>/<slug>/`` segment.
    """
    # re.search raises TypeError on non-string input, which would abort a whole
    # Series.apply over a column that contains missing values.
    if not isinstance(link, str):
        return []

    match = re.search(r"/comments/[^/]+/([^/]+)/", link)
    if match is None:
        return []

    # Slug words are separated by underscores.
    return match.group(1).replace("_", " ").split()


# Derive the title words of every row from its 'link' value and store them
# in the new 'Title_Words' column.
title_words_per_row = df1["link"].apply(extract_title_words)
df1["Title_Words"] = title_words_per_row

# Preview the first rows to verify the extraction, then save the frame.
preview_columns = ["link", "Title_Words"]
print(df1[preview_columns].head())
df1.to_csv("subset_df_with_reddit.csv", index=False)

                                                link  \
0  /r/singapore/comments/gyxf42/the_myth_of_syste...   
1  /r/SingaporeRaw/comments/swdnv3/its_funny_beca...   
2  /r/singapore/comments/rotpqm/moh_debunks_bloom...   
3  /r/singapore/comments/rpixyb/spore_must_expect...   
4  /r/singapore/comments/oi613v/man_to_be_charged...   

                                         Title_Words  
0     [the, myth, of, systemic, police, racism, wsj]  
1                   [its, funny, because, its, true]  
2     [moh, debunks, bloombergs, claim, that, spore]  
3  [spore, must, expect, new, wave, of, covid19, ...  
4  [man, to, be, charged, for, criminal, trespass...  


In [None]:
# Filter out rows whose Title_Words is exactly one of these boilerplate titles
# (by their wording: a recurring discussion thread and user-deleted posts).
EXCLUDED_TITLES = [
    ["rsingapore", "random", "discussion", "and", "small", "questions"],
    ["deleted", "by", "user"],
]
is_excluded = df1["Title_Words"].apply(lambda words: words in EXCLUDED_TITLES)

# .copy() makes df1 an independent frame, so the column assignments performed on
# it afterwards do not raise pandas' SettingWithCopyWarning for a filtered slice.
df1 = df1[~is_excluded].copy()


def clean_title_words(title_words):
    """Lowercase the title words and discard those in ``combined_stop_words``.

    Args:
        title_words: Iterable of word strings.

    Returns:
        A new list holding the surviving words, lowercased, in their original
        order.
    """
    kept_words = []
    for word in title_words:
        lowered = word.lower()
        if lowered not in combined_stop_words:
            kept_words.append(lowered)
    return kept_words


# 1. Clean the title words and join them back into one string per row; these
#    strings are the documents given to the topic model.
df1["Title_Words"] = df1["Title_Words"].apply(clean_title_words)
df1["title_words_str"] = df1["Title_Words"].apply(" ".join)
texts = df1["title_words_str"].tolist()

# 2. Build the topic-modelling pipeline:
#    sentence embeddings -> UMAP reduction -> HDBSCAN clustering -> KeyBERT-inspired keywords.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # embedding

umap_params = {
    "n_neighbors": 10,
    "n_components": 3,
    "min_dist": 0.1,
    "metric": "cosine",
    "random_state": 31,  # fixed seed for the reduction step
}
# TODO (original author's note): can experiment with bigger clusters.
umap_model = UMAP(**umap_params)

hdbscan_params = {
    "min_cluster_size": 20,
    "min_samples": 1,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
    "prediction_data": True,
}
hdbscan_model = HDBSCAN(**hdbscan_params)

representation_model = KeyBERTInspired()

# 3. Train the BERTopic model on the title strings; nr_topics is set to 30.
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    nr_topics=30,
)
topics, _ = topic_model.fit_transform(texts)

In [214]:
# Display the topic overview table, limited to its first 40 rows.
topic_model.get_topic_info().iloc[:40]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3384,-1_dining_polling_group_elections,"[dining, polling, group, elections, 2020, resu...","[covid19 diningin group size back 2 groups 5, ..."
1,0,1626,0_singapores_malaysians_chinese_malaysian,"[singapores, malaysians, chinese, malaysian, t...","[chinese formed 75 singapores, chinese formed ..."
2,1,1278,1_unpopular_opinion_discussion_wrong,"[unpopular, opinion, discussion, wrong, though...","[unpopular opinion good thing, unpopular opini..."
3,2,1006,2_jail_jailed_arrest_detained,"[jail, jailed, arrest, detained, probation, ar...","[6 months jail police officer took, gets jail ..."
4,3,501,3_ns_serve_women_females,"[ns, serve, women, females, ladies, gender, me...","[real reason women serve ns, dont women serve ..."
5,4,493,4_fairprice_delivery_bags_ikea,"[fairprice, delivery, bags, ikea, food, shoppi...","[fairprice starts selling plastic bags 4, fair..."
6,5,474,5_sporean_sporeans_spore_race,"[sporean, sporeans, spore, race, spores, racia...","[sporean rejected renting due race, sporean re..."
7,6,407,6_pap_paps_candidate_2020,"[pap, paps, candidate, 2020, ge2020, ge, polit...",[pap ge 2020 candidate says people misundersto...
8,7,405,7_covid19_coronavirus_covid_outbreak,"[covid19, coronavirus, covid, outbreak, pandem...",[coronavirus record 120 new covid19 cases spor...
9,8,393,8_racism_racist_racial_racists,"[racism, racist, racial, racists, discriminati...","[racism, racism, racism]"


In [215]:
## input topics to merge here
# Each inner list is one group of topic ids that is fused into a single topic.
# The groups are disjoint: every topic id appears in exactly one group, so the
# result cannot depend on the order in which the groups are processed.
topics_to_merge = [
    [6, 11, 16, 24, 20],
    [23, 22],
    [27, 25, 18, 7],
    [12, 2],
    [8, 0],
]

# NOTE: the ids above refer to the topics of the freshly fitted model. Topic ids
# change after merging, so re-running this cell without refitting would merge
# different topics.
topic_model.merge_topics(texts, topics_to_merge)

# check topics after merging
topic_keywords = {}
for topic_num in sorted(topic_model.get_topics()):
    word_scores = topic_model.get_topic(topic_num)
    if not word_scores:
        # No (word, score) pairs are available for this topic.
        print(f"No words found for Topic {topic_num}, skipping...")
        topic_keywords[topic_num] = "No relevant words"  # Handling topics without words
        continue

    # Keep only the words; the scores are not needed for this overview.
    words = [word for word, _score in word_scores]
    topic_keywords[topic_num] = ", ".join(words)
    print(f"Topic {topic_num}: {', '.join(words)}")

Topic -1: dining, group, people, social, cut, society, think, public, workers, men
Topic 0: racism, racist, racial, discrimination, interracial, asian, xenophobia, chinese, foreigners, white
Topic 1: jail, jailed, arrest, detained, arrested, rape, offender, assault, raping, raped
Topic 2: unpopular, opinion, discussion, parents, thoughts, wrong, parental, say, commentary, think
Topic 3: raeesah, khan, singh, committee, privileges, mp, mps, candidate, statement, incident
Topic 4: covid19, coronavirus, covid, outbreak, pandemic, virus, viral, infected, endemic, infection
Topic 5: ns, women, serve, females, ladies, gender, men, female, sex, national
Topic 6: food, foodpanda, meal, ikea, mcdonalds, snacks, delivery, trays, rice, diners
Topic 7: sporean, spore, sporeans, spores, church, influenced, ad, rsingapore, grabfood, race
Topic 8: salary, salaries, 8000, wage, ends, hiring, pay, hr, jobs, careers
Topic 9: mrt, trains, train, commuters, buses, fares, bus, transport, station, passenger

In [204]:
# Predict a topic for every title string with the current topic model.
# NOTE(review): this re-runs inference on the same texts the model was fitted
# on -- confirm that this is preferred over reusing the fitted assignments.
prediction = topic_model.transform(texts)
updated_topics, _ = prediction

# Store the predicted topic of each row in df1.
df1["Updated_Topic"] = updated_topics

In [205]:
# Snapshot the current frame to CSV (without the index column).
df1.to_csv(path_or_buf="to_see.csv", index=False)

In [None]:
# Model topic id -> (final topic id, final topic name).
# Final ids 2 ("Religion"), 3 ("Generational") and 4 ("LGBTQ+") currently have
# no model topic assigned to them.
topic_mapping = {
    0: (1, "Racism"),
    8: (5, "Work"),
    4: (6, "COVID-19"),
    5: (7, "Gender"),
    3: (8, "Government"),
    1: (9, "Crimes"),
    10: (10, "Housing"),
    9: (11, "Transportation"),
    11: (12, "Education"),
}

# Value returned for model topics that have no entry in topic_mapping.
_UNMAPPED_TOPIC = (None, "None")


def map_final_topics(updated_topic):
    """Translate a model topic id into its ``(final id, final name)`` pair.

    Args:
        updated_topic: Topic id to look up in ``topic_mapping``.

    Returns:
        The mapped ``(final_topic_id, final_topic_name)`` tuple, or
        ``(None, "None")`` when the id has no entry in ``topic_mapping``.
    """
    return topic_mapping.get(updated_topic, _UNMAPPED_TOPIC)


# Look up the (final id, final name) pair of every row ...
final_topic_pairs = df1["Updated_Topic"].apply(map_final_topics)
# ... expand each pair into two columns ...
final_topic_columns = final_topic_pairs.apply(pd.Series)
# ... and attach them to df1 under their final names.
df1[["Final Topic", "Final Topic Name"]] = final_topic_columns

print(df1.head())

                                                text            timestamp  \
0  It's a claim that is well supported by various...       6/8/2020 11:37   
1  Why come all the way here and complain dipshit...       20/2/2022 2:21   
2  You mean like their 'right' to lie and malign ...     12/26/2021 13:33   
3  Here we go again with the fear mongering. Can'...     27/12/2021 15:29   
4  Idiot. Fucking lucky the rhino did not just ch...  2021-07-11 15:12:52   

            username                                               link  \
0  Talkingtomytoilet  /r/singapore/comments/gyxf42/the_myth_of_syste...   
1           viviseca  /r/SingaporeRaw/comments/swdnv3/its_funny_beca...   
2               sec5  /r/singapore/comments/rotpqm/moh_debunks_bloom...   
3  Typical_Leave1988  /r/singapore/comments/rpixyb/spore_must_expect...   
4          LaZZyBird  /r/singapore/comments/oi613v/man_to_be_charged...   

     link_id   parent_id       id subreddit_id  \
0  t3_gyxf42  t1_ftcyjev  ftczbxh   

In [217]:
# Remove the intermediate title-processing columns.
# errors="ignore" keeps this cell re-runnable: df1 is rebound to the result, so
# a second execution would otherwise raise KeyError for the already-dropped columns.
df1 = df1.drop(columns=["Title_Words", "title_words_str"], errors="ignore")

# Optional: Display the updated DataFrame to confirm the columns are removed
print(df1.head())

                                                text            timestamp  \
0  It's a claim that is well supported by various...       6/8/2020 11:37   
1  Why come all the way here and complain dipshit...       20/2/2022 2:21   
2  You mean like their 'right' to lie and malign ...     12/26/2021 13:33   
3  Here we go again with the fear mongering. Can'...     27/12/2021 15:29   
4  Idiot. Fucking lucky the rhino did not just ch...  2021-07-11 15:12:52   

            username                                               link  \
0  Talkingtomytoilet  /r/singapore/comments/gyxf42/the_myth_of_syste...   
1           viviseca  /r/SingaporeRaw/comments/swdnv3/its_funny_beca...   
2               sec5  /r/singapore/comments/rotpqm/moh_debunks_bloom...   
3  Typical_Leave1988  /r/singapore/comments/rpixyb/spore_must_expect...   
4          LaZZyBird  /r/singapore/comments/oi613v/man_to_be_charged...   

     link_id   parent_id       id subreddit_id  \
0  t3_gyxf42  t1_ftcyjev  ftczbxh   

In [218]:
# Export the resulting table to CSV (without the index column).
df1.to_csv(path_or_buf="topics_-1_redistribute.csv", index=False)