In [1]:
!pip install contractions
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
pd.set_option("max_colwidth", None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import json
import torch
import contractions 
import re
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(123)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

#### **Data Preprocessing methods**

In [3]:
def get_datasets():
    
    # common: post_title, post_content, username
    
    ######## non-throwaway vs throwaway
    # title	author	selftext	throwaway	subreddit
    dataset_non_throwaway = pd.read_csv('data_non_throwaway.csv')
    dataset_non_throwaway = dataset_non_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_non_throwaway = dataset_non_throwaway.drop('throwaway', axis=1)

    
    dataset_throwaway = pd.read_csv('data_throwaway.csv')
    dataset_throwaway = dataset_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_throwaway = dataset_throwaway.drop('throwaway', axis=1)
    
    
    ######## gender comparision
    # title	author	selftext	throwaway	op_age	op_gender	sp_age	sp_gender	subreddit 
    # op: original poster , sp: second person
    dataset_gender_non_throwaway = pd.read_csv('data_with_gender_non_throwaway.csv')
    dataset_gender_non_throwaway = dataset_gender_non_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_gender_non_throwaway = dataset_gender_non_throwaway.drop(['throwaway', 'op_age', 'sp_age'], axis=1)
    
    dataset_gender_throwaway = pd.read_csv('data_with_gender_throwaway.csv')
    dataset_gender_throwaway = dataset_gender_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_gender_throwaway = dataset_gender_throwaway.drop(['throwaway', 'op_age', 'sp_age'], axis=1)

    
    
    ######### combined dataset
    dataset_large = pd.concat([dataset_non_throwaway, dataset_throwaway], axis=0)
    dataset_combined = pd.concat([dataset_non_throwaway, dataset_throwaway], axis=0)
    
    return dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway

In [4]:
def text_preprocessing(text):
    # Lowercase each word
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
      
    # Remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub('[^\w\s,]', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\b\d{2}[f|m]\b', '', text)
    text = re.sub(r'\b[f|m]\d{2}\b', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize each word
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['husband', 'wife','dating','wedding','marriage', 'marry', 'engaged', 'engage',
                       'marrying', 'married', 'advice', 'tl', 'dr', 'spouse', 'spouses', 'boyfriend','girlfriend','bf','gf'])
    filtered_tokens = [token for token in tokens if token not in set(stop_words)]
    
    # Join the tokens back into a string
    preprocessed_text = " ".join(filtered_tokens)
    
    return preprocessed_text

# GET  Datasets

In [5]:
dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway = get_datasets()

print ("BEFORE PREPROCESS:")
print(f"dataset_large: {dataset_large.shape}")
print(f"dataset_combined: {dataset_combined.shape}")
print(f"dataset_non_throwaway: {dataset_non_throwaway.shape}")
print(f"dataset_throwaway: {dataset_throwaway.shape}")
print(f"dataset_gender_non_throwaway: {dataset_gender_non_throwaway.shape}")
print(f"dataset_gender_throwaway: {dataset_gender_throwaway.shape}")

BEFORE PREPROCESS:
dataset_large: (1449, 4)
dataset_combined: (1449, 4)
dataset_non_throwaway: (1343, 4)
dataset_throwaway: (106, 4)
dataset_gender_non_throwaway: (283, 6)
dataset_gender_throwaway: (42, 6)


## REMOVE NON-MARRIED POSTS

In [6]:
def drop_duplicates_and_non_marriage(dataset):
    dataset = dataset.dropna()
    dataset = dataset.drop_duplicates()
    #non_marriage = dataset[dataset['post_content'].str.contains('(?i)girlfriend|boyfriend|fiancee|fiance|fiancé|fiancée|bf|gf') & ~dataset['post_content'].str.contains('(?i)wife|husband|spouse|married')]
    #dataset = dataset.drop(non_marriage.index)
    return dataset 

dataset_large = drop_duplicates_and_non_marriage(dataset_large)
dataset_combined = drop_duplicates_and_non_marriage(dataset_combined)
dataset_non_throwaway = drop_duplicates_and_non_marriage(dataset_non_throwaway)
dataset_throwaway = drop_duplicates_and_non_marriage(dataset_throwaway)
dataset_gender_non_throwaway = drop_duplicates_and_non_marriage(dataset_gender_non_throwaway)
dataset_gender_throwaway = drop_duplicates_and_non_marriage(dataset_gender_throwaway)

print("AFTER REMOVING NON MARRIED POSTS:")
print(f"dataset_combined: {dataset_large.shape}")
print(f"dataset_combined: {dataset_combined.shape}")
print(f"dataset_non_throwaway: {dataset_non_throwaway.shape}")
print(f"dataset_throwaway: {dataset_throwaway.shape}")
print(f"dataset_gender_non_throwaway: {dataset_gender_non_throwaway.shape}")
print(f"dataset_gender_throwaway: {dataset_gender_throwaway.shape}")

AFTER REMOVING NON MARRIED POSTS:
dataset_combined: (1449, 4)
dataset_combined: (1449, 4)
dataset_non_throwaway: (1343, 4)
dataset_throwaway: (106, 4)
dataset_gender_non_throwaway: (283, 6)
dataset_gender_throwaway: (42, 6)


#### **Tokenization, lemmetalization, and stopword removal**

In [7]:
def text_preprocess_titles_and_content(dataset):
    dataset['post_title_clean']   = dataset.post_title.apply(text_preprocessing)
    dataset['post_content_clean'] = dataset.post_content.apply(text_preprocessing)
    dataset['post_title_content'] = dataset.apply(lambda row: row['post_title_clean'] + ' ' + row['post_content_clean'], axis=1)
    return dataset

In [8]:
#dataset_combined = pd.read_csv('dataset_combined_summarized.csv')
dataset_large = text_preprocess_titles_and_content(dataset_large)
dataset_combined = text_preprocess_titles_and_content(dataset_combined)
dataset_non_throwaway = text_preprocess_titles_and_content(dataset_non_throwaway)
dataset_throwaway = text_preprocess_titles_and_content(dataset_throwaway)
dataset_gender_non_throwaway = text_preprocess_titles_and_content(dataset_gender_non_throwaway)
dataset_gender_throwaway = text_preprocess_titles_and_content(dataset_gender_throwaway)

print("AFTER TEXT PREPROCESSING:")
print(f"dataset_large: {dataset_large.shape}")
print(f"dataset_combined: {dataset_combined.shape}")
print(f"dataset_non_throwaway: {dataset_non_throwaway.shape}")
print(f"dataset_throwaway: {dataset_throwaway.shape}")
print(f"dataset_gender_non_throwaway: {dataset_gender_non_throwaway.shape}")
print(f"dataset_gender_throwaway: {dataset_gender_throwaway.shape}")

AFTER TEXT PREPROCESSING:
dataset_large: (1449, 7)
dataset_combined: (1449, 7)
dataset_non_throwaway: (1343, 7)
dataset_throwaway: (106, 7)
dataset_gender_non_throwaway: (283, 9)
dataset_gender_throwaway: (42, 9)


# **BERTtopic on entire dataset**

In [9]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [10]:
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=68)

hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True, min_samples=10)

topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, min_topic_size=10)

topics, probs = topic_model.fit_transform(dataset_large.post_title_content.to_list())

display(topic_model.get_topic_info())

Unnamed: 0,Topic,Count,Name
0,-1,607,-1_like_would_time_feel
1,0,148,0_job_work_money_would
2,1,122,1_sex_feel_like_years
3,2,102,2_phone_cheating_know_said
4,3,99,3_love_like_years_want
5,4,81,4_like_something_get_things
6,5,51,5_time_like_kids_get
7,6,50,6_family_parents_mom_dad
8,7,36,7_divorce_years_time_feel
9,8,29,8_friend_friends_like_feel


In [11]:
topics_identified = pd.DataFrame(columns=["topic_number", "topics"])

no_of_topics = len(topic_model.topic_representations_)
for i in range(-1, no_of_topics-1):
    topics_n = [topic[0] for topic in topic_model.get_topic(i)]
    row = {"topic_number": i, "topics": ", ".join(topics_n)}
    topics_identified = topics_identified.append(row, ignore_index=True)
    
display(topics_identified)

Unnamed: 0,topic_number,topics
0,-1,"like, would, time, feel, know, want, get, things, years, said"
1,0,"job, work, money, would, time, home, house, like, get, want"
2,1,"sex, feel, like, years, would, want, sexual, really, get, time"
3,2,"phone, cheating, know, said, would, found, time, trust, years, cheated"
4,3,"love, like, years, want, know, relationship, really, together, feel, would"
5,4,"like, something, get, things, say, feel, way, said, know, told"
6,5,"time, like, kids, get, would, feel, baby, know, work, said"
7,6,"family, parents, mom, dad, would, like, sister, want, brother, us"
8,7,"divorce, years, time, feel, still, separated, love, like, want, relationship"
9,8,"friend, friends, like, feel, told, best, relationship, said, one, want"


In [12]:
topic_model.merge_topics(dataset_combined.post_title_content.to_list(), [[9,10],[5,14],[3,13]])
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,607,-1_like_would_time_feel
1,0,148,0_job_work_money_would
2,1,122,1_sex_like_feel_years
3,2,118,2_like_love_feel_know
4,3,102,3_phone_cheating_know_said
5,4,81,4_like_something_get_things
6,5,62,5_time_like_kids_get
7,6,52,6_therapy_like_feel_things
8,7,50,7_family_parents_mom_dad
9,8,36,8_divorce_years_time_feel


In [13]:
topics_identified = pd.DataFrame(columns=["topic_number", "topics"])

no_of_topics = len(topic_model.topic_representations_)
for i in range(-1, no_of_topics-1):
    topics_n = [topic[0] for topic in topic_model.get_topic(i)]
    row = {"topic_number": i, "topics": ", ".join(topics_n)}
    topics_identified = topics_identified.append(row, ignore_index=True)
    
display(topics_identified)

Unnamed: 0,topic_number,topics
0,-1,"like, would, time, feel, know, want, get, things, years, said"
1,0,"job, work, money, would, time, home, house, like, get, want"
2,1,"sex, like, feel, years, would, want, sexual, really, time, get"
3,2,"like, love, feel, know, want, years, together, relationship, really, would"
4,3,"phone, cheating, know, said, would, found, time, years, trust, friend"
5,4,"like, something, get, things, feel, say, said, way, know, told"
6,5,"time, like, kids, get, would, said, work, know, want, feel"
7,6,"therapy, like, feel, things, want, mental, know, help, time, really"
8,7,"family, parents, mom, dad, would, like, want, sister, us, brother"
9,8,"divorce, years, time, feel, still, like, love, separated, want, relationship"


In [14]:
#display(topic_model.visualize_topics())

In [15]:
hierarchical_topics = topic_model.hierarchical_topics(dataset_large.post_title_content.to_list())

100%|██████████| 11/11 [00:00<00:00, 92.74it/s]


In [16]:
display(topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics))

In [17]:
display(topic_model.visualize_documents(dataset_large.post_title_content.tolist(), hide_document_hover=True))

In [18]:
display(topic_model.visualize_barchart(top_n_topics=no_of_topics, n_words=8))

# Evaluation

In [19]:
import gensim

# create a dictionary and corpus of the documents
docs = dataset_large['post_title_content'].apply(word_tokenize)
id2word = gensim.corpora.Dictionary(docs)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

# create a list of the topics
topics_top_n = topics_identified.topics.str.split(', ').tolist()[1:]

coherence_model = gensim.models.CoherenceModel(topics=topics_top_n, 
                                       texts=docs, 
                                       dictionary=id2word,
                                       coherence='c_v')

coherence_score = coherence_model.get_coherence()

print()
print("############## COHERENCE ##########")
print()
print(f"The coherence score of the topics generated by BERTopic is {coherence_score:.4f}")


############## COHERENCE ##########

The coherence score of the topics generated by BERTopic is 0.4018
