In [1]:
!pip install contractions
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
pd.set_option("max_colwidth", None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import json
import torch
import contractions 
import re
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(123)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

#### **Data Preprocessing methods**

In [3]:
def get_datasets():
    
    # common: post_title, post_content, username
    
    ######## non-throwaway vs throwaway
    # title	author	selftext	throwaway	subreddit
    dataset_non_throwaway = pd.read_csv('data_non_throwaway.csv')
    dataset_non_throwaway = dataset_non_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_non_throwaway = dataset_non_throwaway.drop('throwaway', axis=1)

    
    dataset_throwaway = pd.read_csv('data_throwaway.csv')
    dataset_throwaway = dataset_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_throwaway = dataset_throwaway.drop('throwaway', axis=1)
    
    
    ######## gender comparision
    # title	author	selftext	throwaway	op_age	op_gender	sp_age	sp_gender	subreddit 
    # op: original poster , sp: second person
    dataset_gender_non_throwaway = pd.read_csv('data_with_gender_non_throwaway.csv')
    dataset_gender_non_throwaway = dataset_gender_non_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_gender_non_throwaway = dataset_gender_non_throwaway.drop(['throwaway', 'op_age', 'sp_age'], axis=1)
    
    dataset_gender_throwaway = pd.read_csv('data_with_gender_throwaway.csv')
    dataset_gender_throwaway = dataset_gender_throwaway.rename(columns={'title': 'post_title',
                                                                  'author': 'username',
                                                                  'selftext': 'post_content'})
    dataset_gender_throwaway = dataset_gender_throwaway.drop(['throwaway', 'op_age', 'sp_age'], axis=1)

    
    
    ######### combined dataset
    dataset_large = pd.concat([dataset_non_throwaway, dataset_throwaway], axis=0)
    dataset_combined = pd.concat([dataset_non_throwaway, dataset_throwaway], axis=0)
    
    return dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway

In [4]:
def text_preprocessing(text):
    # Lowercase each word
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
      
    # Remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub('[^\w\s,]', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\b\d{2}[f|m]\b', '', text)
    text = re.sub(r'\b[f|m]\d{2}\b', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize each word
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['husband', 'wife','dating','wedding','marriage', 'marry', 'engaged', 'engage',
                       'marrying', 'married', 'advice', 'tl', 'dr', 'spouse', 'spouses', 'boyfriend','girlfriend','bf','gf'])
    filtered_tokens = [token for token in tokens if token not in set(stop_words)]
    
    # Join the tokens back into a string
    preprocessed_text = " ".join(filtered_tokens)
    
    return preprocessed_text

# GET  Datasets

In [5]:
dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway = get_datasets()

print ("BEFORE PREPROCESS:")
print(f"dataset_large: {dataset_large.shape}")
print(f"dataset_combined: {dataset_combined.shape}")
print(f"dataset_non_throwaway: {dataset_non_throwaway.shape}")
print(f"dataset_throwaway: {dataset_throwaway.shape}")
print(f"dataset_gender_non_throwaway: {dataset_gender_non_throwaway.shape}")
print(f"dataset_gender_throwaway: {dataset_gender_throwaway.shape}")

BEFORE PREPROCESS:
dataset_large: (1449, 4)
dataset_combined: (1449, 4)
dataset_non_throwaway: (1343, 4)
dataset_throwaway: (106, 4)
dataset_gender_non_throwaway: (283, 6)
dataset_gender_throwaway: (42, 6)


## REMOVE NON-MARRIED POSTS

In [6]:
def drop_duplicates_and_non_marriage(dataset):
    dataset = dataset.dropna()
    dataset = dataset.drop_duplicates()
    non_marriage = dataset[dataset['post_content'].str.contains('(?i)girlfriend|boyfriend|fiancee|fiance|fiancé|fiancée|bf|gf') & ~dataset['post_content'].str.contains('(?i)wife|husband|spouse|married')]
    dataset = dataset.drop(non_marriage.index)
    return dataset 

dataset_large = drop_duplicates_and_non_marriage(dataset_large)
dataset_combined = drop_duplicates_and_non_marriage(dataset_combined)
dataset_non_throwaway = drop_duplicates_and_non_marriage(dataset_non_throwaway)
dataset_throwaway = drop_duplicates_and_non_marriage(dataset_throwaway)
dataset_gender_non_throwaway = drop_duplicates_and_non_marriage(dataset_gender_non_throwaway)
dataset_gender_throwaway = drop_duplicates_and_non_marriage(dataset_gender_throwaway)

print("AFTER PREPROCESS:")
print(f"dataset_combined: {dataset_combined.shape}")
print(f"dataset_non_throwaway: {dataset_non_throwaway.shape}")
print(f"dataset_throwaway: {dataset_throwaway.shape}")
print(f"dataset_gender_non_throwaway: {dataset_gender_non_throwaway.shape}")
print(f"dataset_gender_throwaway: {dataset_gender_throwaway.shape}")

AFTER PREPROCESS:
dataset_combined: (1409, 4)
dataset_non_throwaway: (1317, 4)
dataset_throwaway: (102, 4)
dataset_gender_non_throwaway: (274, 6)
dataset_gender_throwaway: (40, 6)


## SELECT RANDOM 100

In [7]:
def select_random_100(dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway):
    
    if dataset_non_throwaway.shape[0] > 100:
        dataset_non_throwaway = dataset_non_throwaway.sample(n=100, random_state=42)
    
    if dataset_throwaway.shape[0] > 100:
        dataset_throwaway = dataset_throwaway.sample(n = 100, random_state=42)
    
    
    dataset_gender_non_throwaway = pd.merge(dataset_gender_non_throwaway, dataset_non_throwaway, on=['post_title', 'post_content', 'username'], how='inner')
    dataset_gender_throwaway = pd.merge(dataset_gender_throwaway, dataset_throwaway, on=['post_title', 'post_content', 'username'], how='inner')
    
    dataset_combined = pd.concat([dataset_non_throwaway, dataset_throwaway], axis=0)
    
    
    ####### REST INDICES:
    dataset_large = dataset_large.reset_index(drop=True)
    dataset_non_throwaway = dataset_non_throwaway.reset_index(drop=True)
    dataset_throwaway = dataset_throwaway.reset_index(drop=True)
    dataset_gender_non_throwaway = dataset_gender_non_throwaway.reset_index(drop=True)
    dataset_gender_throwaway = dataset_gender_throwaway.reset_index(drop=True)
    dataset_combined = dataset_combined.reset_index(drop=True)
    
    
    return dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway

dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway = select_random_100(dataset_large, dataset_combined, dataset_non_throwaway, dataset_throwaway, dataset_gender_non_throwaway, dataset_gender_throwaway)


print("AFTER RANDOM 100 selection:")
print(f"dataset_large: {dataset_large.shape}")
print(f"dataset_combined: {dataset_combined.shape}")
print(f"dataset_non_throwaway: {dataset_non_throwaway.shape}")
print(f"dataset_throwaway: {dataset_throwaway.shape}")
print(f"dataset_gender_non_throwaway: {dataset_gender_non_throwaway.shape}")
print(f"dataset_gender_throwaway: {dataset_gender_throwaway.shape}")

AFTER RANDOM 100 selection:
dataset_large: (1409, 4)
dataset_combined: (200, 4)
dataset_non_throwaway: (100, 4)
dataset_throwaway: (100, 4)
dataset_gender_non_throwaway: (24, 7)
dataset_gender_throwaway: (40, 7)


In [8]:
selected_throwaway = dataset_throwaway.loc[:, ['post_title', 'username', 'post_content']].rename(columns={'post_title': 'title', 'username': 'author', 'post_content': 'selftext'})
selected_non_throwaway = dataset_non_throwaway.loc[:, ['post_title', 'username', 'post_content']].rename(columns={'post_title': 'title', 'username': 'author', 'post_content': 'selftext'})

selected_gender_throwaway = dataset_gender_throwaway.loc[:, ['post_title', 'username', 'post_content', 'op_gender', 'sp_gender']].rename(columns={'post_title': 'title', 'username': 'author', 'post_content': 'selftext'})
selected_gender_non_throwaway = dataset_gender_non_throwaway.loc[:, ['post_title', 'username', 'post_content', 'op_gender', 'sp_gender']].rename(columns={'post_title': 'title', 'username': 'author', 'post_content': 'selftext'})


#selected_throwaway.to_csv('selected_throwaway.csv', index=False)
#selected_non_throwaway.to_csv('selected_non_throwaway.csv', index=False)
#selected_gender_throwaway.to_csv('selected_gender_throwaway.csv', index=False)
#selected_gender_non_throwaway.to_csv('selected_gender_non_throwaway.csv', index=False)

#### **Tokenization, lemmetalization, and stopword removal**

In [9]:
def text_preprocess_titles_and_content(dataset):
    dataset['post_title_clean']   = dataset.post_title.apply(text_preprocessing)
    dataset['post_content_clean'] = dataset.post_content.apply(text_preprocessing)
    dataset['post_title_content'] = dataset.apply(lambda row: row['post_title_clean'] + ' ' + row['post_content_clean'], axis=1)
    return dataset

In [10]:
#dataset_combined = pd.read_csv('dataset_combined_summarized.csv')
dataset_large = text_preprocess_titles_and_content(dataset_large)
dataset_combined = text_preprocess_titles_and_content(dataset_combined)
dataset_non_throwaway = text_preprocess_titles_and_content(dataset_non_throwaway)
dataset_throwaway = text_preprocess_titles_and_content(dataset_throwaway)
dataset_gender_non_throwaway = text_preprocess_titles_and_content(dataset_gender_non_throwaway)
dataset_gender_throwaway = text_preprocess_titles_and_content(dataset_gender_throwaway)

print(dataset_combined.shape)

(200, 7)


# **BERTtopic on selected dataset**

In [11]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [12]:
umap_model_combined = UMAP(n_neighbors=18, n_components=5, min_dist=0.0, metric='cosine', random_state=68)

hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True, min_samples=3)

topic_model_combined = BERTopic(umap_model=umap_model_combined, hdbscan_model=hdbscan_model, calculate_probabilities=True, min_topic_size=3)

topics_combined, probs_combined = topic_model_combined.fit_transform(dataset_combined.post_title_content.to_list())

topic_model_combined.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,52,-1_would_like_years_want
1,0,22,0_friend_said_saw_phone
2,1,16,1_kids_parents_time_would
3,2,14,2_sex_really_like_feel
4,3,12,3_know_things_like_wrong
5,4,10,4_like_feel_want_years
6,5,9,5_love_years_trip_together
7,6,9,6_divorce_daughter_need_years
8,7,8,7_relationship_friend_friends_one
9,8,7,8_time_day_told_alice


In [13]:
topics_identified_combined = pd.DataFrame(columns=["topic_number", "topics"])

no_of_topics = len(topic_model_combined.topic_representations_)
for i in range(-1, no_of_topics-1):
    topics = [topic[0] for topic in topic_model_combined.get_topic(i)]
    row = {"topic_number": i, "topics": ", ".join(topics)}
    topics_identified_combined = topics_identified_combined.append(row, ignore_index=True)
    
display(topics_identified_combined)

Unnamed: 0,topic_number,topics
0,-1,"would, like, years, want, time, relationship, feel, know, together, also"
1,0,"friend, said, saw, phone, know, texts, cheating, talking, cheated, want"
2,1,"kids, parents, time, would, family, like, get, job, feel, work"
3,2,"sex, really, like, feel, well, years, want, would, even, could"
4,3,"know, things, like, wrong, get, issues, need, door, feel, time"
5,4,"like, feel, want, years, never, child, kids, relationship, wants, time"
6,5,"love, years, trip, together, vegas, feel, time, want, current, sam"
7,6,"divorce, daughter, need, years, pack, say, like, want, last, joint"
8,7,"relationship, friend, friends, one, things, love, would, know, even, used"
9,8,"time, day, told, alice, home, son, would, money, garage, work"


In [14]:
topic_model_combined.merge_topics(dataset_combined.post_title_content.to_list(), [[0,7,9],[2,10],[3,4],[6,13],[11,16],[8,12]])
topic_model_combined.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,52,-1_would_like_want_years
1,0,36,0_friend_relationship_know_would
2,1,22,1_like_know_feel_things
3,2,20,2_sex_really_like_found
4,3,16,3_kids_time_would_parents
5,4,14,4_divorce_years_separated_want
6,5,12,5_time_day_home_would
7,6,10,6_talk_know_compliments_also
8,7,9,7_love_years_trip_together
9,8,5,8_smoking_weed_feel_leave


In [15]:
topics_identified_combined = pd.DataFrame(columns=["topic_number", "topics"])

no_of_topics = len(topic_model_combined.topic_representations_)
for i in range(-1, no_of_topics-1):
    topics = [topic[0] for topic in topic_model_combined.get_topic(i)]
    row = {"topic_number": i, "topics": ", ".join(topics)}
    topics_identified_combined = topics_identified_combined.append(row, ignore_index=True)
    
display(topics_identified_combined)

Unnamed: 0,topic_number,topics
0,-1,"would, like, want, years, time, relationship, know, feel, together, also"
1,0,"friend, relationship, know, would, friends, said, things, want, ex, got"
2,1,"like, know, feel, things, years, time, never, get, want, issues"
3,2,"sex, really, like, found, porn, would, said, feel, years, even"
4,3,"kids, time, would, parents, like, family, get, feel, job, want"
5,4,"divorce, years, separated, want, together, daughter, last, like, need, say"
6,5,"time, day, home, would, told, know, work, get, also, alice"
7,6,"talk, know, compliments, also, like, hair, looks, something, never, time"
8,7,"love, years, trip, together, feel, vegas, time, want, would, break"
9,8,"smoking, weed, feel, leave, sober, like, get, scared, want, mean"


In [16]:
display(topic_model_combined.visualize_topics())

In [17]:
hierarchical_topics = topic_model_combined.hierarchical_topics(dataset_combined.post_title_content.to_list())

100%|██████████| 9/9 [00:00<00:00, 144.19it/s]


In [18]:
display(topic_model_combined.visualize_hierarchy(hierarchical_topics=hierarchical_topics))

In [19]:
display(topic_model_combined.visualize_documents(dataset_combined.post_title_content.tolist(), hide_document_hover=True))

In [20]:
display(topic_model_combined.visualize_barchart(top_n_topics=no_of_topics, n_words=8))

## **Evaluation Metrics**

In [21]:
import gensim

# create a dictionary and corpus of the documents
docs = dataset_combined['post_title_content'].apply(word_tokenize)
id2word = gensim.corpora.Dictionary(docs)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

# create a list of the topics
topics_top_n = topics_identified_combined.topics.str.split(', ').tolist()

coherence_model = gensim.models.CoherenceModel(topics=topics_top_n, 
                                       texts=docs, 
                                       dictionary=id2word,
                                       topn = 5,  
                                       coherence='c_v')

coherence_score = coherence_model.get_coherence()

print()
print("############## COHERENCE ##########")
print()
print(f"The coherence score of the topics generated by BERTopic is {coherence_score:.4f}")


############## COHERENCE ##########

The coherence score of the topics generated by BERTopic is 0.4802


## **Export the clustered sampled data**

In [22]:
classification_labels = topic_model_combined.get_document_info(dataset_combined.post_title_content.to_list())
#display(classification_labels.head())

final_classification_dataset = dataset_combined.copy()
final_classification_dataset = final_classification_dataset.drop(['post_title_clean', 'post_content_clean', 'post_title_content'], axis=1)
final_classification_dataset['topic_identified'] = classification_labels['Topic']
final_classification_dataset['topic_name'] = classification_labels['Name']
final_classification_dataset['top_n_words'] = classification_labels['Top_n_words']
final_classification_dataset['probability'] = classification_labels['Probability']

merged = pd.merge(final_classification_dataset, dataset_throwaway.drop(['subreddit'], axis=1), on=['post_title', 'username','post_content'], how='left', indicator=True)
final_classification_dataset['throwaway'] = merged['_merge'] == 'both'

gender_combined = pd.concat([dataset_throwaway, dataset_gender_non_throwaway])
gender_combined = gender_combined.drop(['post_title_clean', 'post_content_clean', 'post_title_content', 'subreddit'], axis=1)
final_classification_dataset = pd.merge(final_classification_dataset, gender_combined, on=['post_title', 'username','post_content'], how='left')
final_classification_dataset.shape

(200, 13)

In [23]:
final_classification_dataset.to_csv('labelled_topics_dataset.csv', index=False)