In [1]:
import os
import warnings
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

# Suppress every warning globally for the rest of the session.
warnings.filterwarnings("ignore")

# Plot styling: seaborn "ticks" theme with the "muted" palette, plus larger
# axis labels/titles and a bold font weight for all matplotlib figures.
sns.set_theme(style="ticks", palette="muted", color_codes=True)

plt.rcParams.update({
    'axes.labelsize': 15,
    'axes.titlesize': 20,
    'font.weight': 'bold',
})

from wordcloud import WordCloud

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# WordNetLemmatizer (imported above) reads the WordNet corpus, so fetch it too;
# otherwise lemmatization raises LookupError on a machine without nltk_data.
# quiet=True keeps this cell's visible output limited to the tagger download.
# NOTE(review): recent NLTK releases appear to look up
# 'averaged_perceptron_tagger_eng' for nltk.pos_tag -- confirm against the
# installed NLTK version.
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Relative path of the CSV holding the Reddit posts together with their
# topic, keyword and sentiment columns.
dataset = "datasets/reddit_posts_with_topics_keywords_sentiments.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset)

# Preview the first five rows.
df.head()

Unnamed: 0,post_id,title,timestamp,body,body_type,topic_name,cleaned_text,keywords,sentiment_label,sentiment_score
0,1b0m6c1,r/CasualConversation is looking for new modera...,2024-02-26 16:48:14,"Hello everyone,It's that time again, [we're lo...",post,22_subs_subreddits_trolls_subreddit,hello everyone time look new mod join team doc...,"modmail courteous professional,grow strict quo...",positive,0.592145
1,1b5h6x7,Deleting social media was one of the best thin...,2024-03-03 13:29:12,I know that technically reddit is a social med...,post,1_fb_deleted_instagram_facebook,know technically reddit social medium not nega...,"consider delete social,instagram tiktok,health...",positive,0.817773
2,1b5lu9j,Lonlieness is not about gender! My Opinion.,2024-03-03 16:56:09,Okay so I just wanted to get this off of my mi...,post,359_loneliness_lonely_epidemic_intiate,okay want get mind keep see people post man lo...,"man loneliness epidemic,understand gender thin...",negative,0.736416
3,1b5jo0m,Have you gave up any hobbies?,2024-03-03 15:23:08,I used to be a doll collector. Then when I was...,post,-1_breakfast_cream_listening_bus,use doll collector tell weird old donate every...,"use doll collector,throw away hobby,day happy ...",negative,0.63878
4,1b5c8ah,Is it me or has the world just stopped moving ...,2024-03-03 08:24:06,"This is strange, but I feel like:1) I don’t re...",post,-1_breakfast_cream_listening_bus,strange feel like not remember anything happen...,"good year covid,like remember happen,people tr...",negative,0.756849


In [3]:
# Row counts of the 25 most frequent raw topic labels.
df['topic_name'].value_counts().nlargest(n=25)

topic_name
-1_breakfast_cream_listening_bus        52156
0_attacks_prescribed_symptoms_benzos     2555
1_fb_deleted_instagram_facebook          1490
2_nicotine_cigarette_smoked_smoker        953
3_bday_birthdays_25th_belated             940
4_masks_vaccinated_mask_wearing           867
5_cats_cat_kitty_kitties                  740
6_pete_stories_casconvo_sharing           648
7_removed_backspaced_bailing_erased       572
8_grammar_native_english_language         565
9_dreamt_dreaming_dreamed_dreams          554
10_crying_tear_cry_tears                  515
11_bully_bullied_bullies_bullying         496
12_upvote_upvotes_upvoted_downvotes       481
13_bachelors_bachelor_diploma_ged         481
14_channel_subreddit_sub_lego             469
15_teens_30s_20s_aging                    461
16_savings_invest_401k_saving             444
17_pregnancy_adoption_pregnant_adopt      412
18_bald_shaved_shave_shaving              405
19_dads_fathers_father_dadding            399
20_cloudy_gloomy_rain_s

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag to the corresponding WordNet POS constant.

    Only the first letter of the tag is inspected:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag (including an empty string) falls back
    to 'n', i.e. the word is treated as a noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps the empty-tag case on the default.
    return first_letter_to_pos.get(treebank_tag[:1], 'n')

def lemmatize_string(s):
    """Lemmatize each underscore-separated word of ``s`` and drop repeated lemmas.

    The string is split on "_", the resulting words are POS-tagged with
    ``nltk.pos_tag``, and every word is lemmatized by ``WordNetLemmatizer``
    using the WordNet POS that ``get_wordnet_pos`` returns for its tag.
    Duplicate lemmas are removed (the first occurrence keeps its position)
    and the remaining lemmas are joined back together with "_".
    """
    wnl = WordNetLemmatizer()

    # Tag all words in one call so the tagger sees them as one sequence.
    tagged_tokens = nltk.pos_tag(s.split("_"))

    # A dict keeps insertion order, so it doubles as an ordered set of lemmas.
    seen_lemmas = {}
    for token, tag in tagged_tokens:
        lemma = wnl.lemmatize(token, get_wordnet_pos(tag))
        seen_lemmas.setdefault(lemma, None)

    return "_".join(seen_lemmas)

In [None]:
# Build a lookup from each distinct raw topic label to its lemmatized form:
# the leading topic number is kept as-is and the keyword part is passed
# through lemmatize_string, giving "<topic number>_<lemmatized keywords>".
lemmatized_topics_dict = (
    df['topic_name']
    .drop_duplicates(ignore_index=True)
    .to_frame('topics')
    .assign(
        # Leading (possibly negative) integer before the first underscore.
        topic_num=lambda t: t['topics'].str.extract(r'^(-?\d+)_', expand=False),
        # Everything after that first underscore.
        topic_name=lambda t: t['topics'].str.extract(r'^-?\d+_(.+)$', expand=False),
    )
    .assign(
        lema_topic_name=lambda t: t['topic_num'] + "_" + t['topic_name'].map(lemmatize_string)
    )
    .set_index("topics")['lema_topic_name']
    .to_dict()
)

# Show only the first 3 key-value pairs
{key: lemmatized_topics_dict[key] for key in list(lemmatized_topics_dict)[:3]}

In [None]:
# Add a "topics" column holding the lemmatized label of every row, then drop
# the raw "topic_name" column it was derived from.
# NOTE: this rebinds `df` without "topic_name", so re-running this cell (or any
# earlier cell that reads "topic_name") raises KeyError until the CSV is reloaded.
df = (
    df.assign(topics=df['topic_name'].map(lemmatized_topics_dict))
    .drop(columns='topic_name')
)

df.head()

In [None]:
# Row counts of the 25 most frequent lemmatized topic labels.
df['topics'].value_counts().nlargest(n=25)

In [None]:
# Number of the topic whose keywords are drawn as a word cloud.
i = 5

# Collect the keywords of every row whose topic label starts with "<i>_".
# Spaces inside a keyword phrase become "_" so each phrase stays one token,
# and the commas separating phrases become spaces; all rows are then joined
# into a single space-separated string for WordCloud.
text_data = (
    df.loc[lambda x: x['topics'].str.contains(fr'^{i}_'), 'keywords']
    .replace(" ", "_", regex=True)
    .replace(",", " ", regex=True)
    .str.cat(sep=" ")
)

fig, ax = plt.subplots(tight_layout=True)

wordcloud = WordCloud(width=1200,
                      height=800,
                      colormap='YlGnBu',
                      background_color="black").generate(text_data)

ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
# grid(None) *toggles* the grid's visibility instead of disabling it;
# pass False to switch the grid off explicitly.
ax.grid(False);