In [None]:
import string

%pip install nltk
%pip install pandas
%pip install scikit-learn

import pandas as pd
import nltk
from nltk.corpus import wordnet

# Fetch the Punkt tokenizer models; text_cleaning tokenizes with nltk.word_tokenize.
nltk.download('punkt')


from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

# Corpora and models used by the pre-processing step.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
# NLTK >= 3.9 looks the tokenizer and tagger models up under new resource ids.
# Because `%pip install nltk` above is unpinned, a fresh environment gets such
# a release and word_tokenize / pos_tag raise LookupError without these two.
# On older releases the extra downloads are simply unused.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')



## Pre-processing

In [None]:
# Shared NLP helpers, built once and reused by the cells below.
porter_stemmer = PorterStemmer()      # stemmer applied as the last step of text_cleaning
lemmatizer = WordNetLemmatizer()      # lemmatizer used with POS tags in text_cleaning
sw = stopwords.words('english')       # NLTK English stop-word list (consulted by remove_sw)


def text_cleaning(text):
    """Normalise one raw text value into a space-separated string of stemmed tokens.

    Steps, in order: drop non-printable characters, turn every run of
    non-alphanumeric characters into a single space, lowercase, tokenize,
    lemmatize each token using its POS tag, then Porter-stem each lemma.
    Stop words are not removed here.

    Args:
        text: Any value; it is converted with ``str``. ``None`` and float NaN
            are treated as missing.

    Returns:
        The cleaned text, or ``''`` when the input is missing.
    """
    # A missing value must not be stringified: str(nan) == "nan" would become
    # a real token and make all empty rows look alike to the vectorizer.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # Keep only ASCII printable characters. Anything else (NUL bytes, accented
    # letters, typographic quotes, ...) is deleted outright, not replaced by a
    # space. NOTE(review): this fuses words separated only by a non-ASCII
    # character such as a non-breaking space - confirm that is acceptable.
    printable = set(string.printable)
    text = ''.join(ch for ch in text if ch in printable)

    # Every run of characters outside [0-9a-zA-Z] - whitespace, CR/LF and
    # punctuation included - collapses to one space, so no separate newline
    # handling is needed.
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    text = text.lower()  # Lowercasing (happens before POS tagging)

    # Map the first letter of the Penn Treebank tag to the WordNet POS constant.
    wn_tags = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}

    # Lemmatize each word with its POS tag; words with any other tag pass through unchanged.
    lemmatized_words = []
    for word, pos in nltk.pos_tag(nltk.word_tokenize(text)):
        wn_tag = wn_tags.get(pos[0])
        if wn_tag is not None:
            lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))
        else:
            lemmatized_words.append(word)

    # Stemming is applied on top of the lemmas.
    stemmed_words = [porter_stemmer.stem(word=word) for word in lemmatized_words]

    return ' '.join(stemmed_words).strip()


In [None]:
# Try the cleaner on a sample sentence containing a contraction, punctuation and version numbers.
text_cleaning("I'm getting the same error, on 2.9.0 but I reproduced it in 2.8.0 and 2.9.1 too.")

In [None]:
# Load the dataset; the following cells read its "Comment" and "Emotion Causes" columns.
df = pd.read_csv("anger_causes.csv")

In [None]:
# Original comment text per row; exported next to the cluster label in clusters.csv.
comments = df["Comment"]
comments

In [None]:
# The column that is cleaned, vectorised and clustered below.
corpus = df["Emotion Causes"]
corpus

In [None]:
# Apply text_cleaning to every entry of the corpus.
processed_corpus = corpus.apply(text_cleaning)
processed_corpus

In [None]:
# Unigram TF-IDF over the cleaned text. The built-in English stop-word list is
# applied to tokens that text_cleaning has already lemmatized and stemmed.
vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,1))
X = vectorizer.fit_transform(processed_corpus)

In [None]:
# Dense, standardised copy of the TF-IDF matrix.
# Note: DBSCAN below is fitted on the sparse `X`, not on `X_norm`; `X_norm` is only
# used by the `len(X_norm)` cell and by the commented-out t-SNE cells.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X.toarray())

In [None]:
# Number of rows (documents) in the feature matrix.
len(X_norm)

## Clustering (DBSCAN)

In [None]:
# Density-based clustering of the sparse TF-IDF matrix using cosine distance.
db = DBSCAN(metric='cosine', min_samples=3, eps=0.45).fit(X)
labels = db.labels_

# DBSCAN assigns the label -1 to noise points. That label is not a cluster,
# so it must be excluded from the cluster count.
no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
no_noise = np.sum(labels == -1)

print('Estimated num of clusters: %d' % no_clusters)
print('Estimated num of noise points: %d' % no_noise)

In [None]:
# Count how many documents received each label (-1 is DBSCAN's noise label).
unique, counts = np.unique(db.labels_, return_counts=True)
print(db.labels_)
print(dict(zip(unique, counts)))

In [None]:
# Map each label to its size, then list the (label, size) pairs from largest to smallest.
cluster_freq = dict(zip(unique, counts))
print(type(cluster_freq))
by_size = sorted(cluster_freq.items(), key=lambda pair: pair[1], reverse=True)
print(by_size)

In [None]:
# Obtain a representative "text point" for each sufficiently large cluster:
# the member document whose TF-IDF vector is most similar to the cluster centroid.
# All members of those clusters are also collected as [label, comment, cause] rows.
text_points = []
unique_labels = set(db.labels_)
cluster_ids = []

# Iterate in sorted order: a set has no guaranteed iteration order, and this
# order determines both the printed report and the row order of cluster_ids.
for label in sorted(unique_labels):
    # Skip noise (-1) and clusters with fewer than 10 members.
    if cluster_freq[label] < 10 or label == -1:
        continue
    cluster_indices = np.where(db.labels_ == label)[0]
    # np.where returns row positions, so the Series are indexed positionally
    # with .iloc; a plain `corpus[j]` is a label lookup that only happens to
    # agree with positions when the index is the default 0..n-1 range.
    cluster_docs = corpus.iloc[cluster_indices].tolist()
    cluster_vecs = X[cluster_indices]
    centroid_vec = np.mean(cluster_vecs.toarray(), axis=0)
    similarity_scores = cosine_similarity(cluster_vecs, [centroid_vec])
    # One similarity row per member; argmax picks the member closest to the centroid.
    text_point_index = cluster_indices[np.argmax(similarity_scores)]
    text_point = corpus.iloc[text_point_index]
    text_points.append(text_point)
    print(f"Cluster #{label} text point: {text_point}")

    for idx in cluster_indices:
        cluster_ids.append([label, comments.iloc[idx], corpus.iloc[idx]])



In [None]:
import pandas as pd 
import numpy as np

# Write the collected [label, comment, cause] rows to clusters.csv.
arr = np.asarray(cluster_ids)
clusters_frame = pd.DataFrame(arr)
clusters_frame.to_csv('clusters.csv', index_label="Index")

## Top features

In [None]:
from collections import Counter

def remove_sw(word_list):
    """Return the words from ``word_list`` that are not in the stop-word list ``sw``.

    Order and duplicates of the remaining words are preserved.
    """
    return [word for word in word_list if word not in sw]


# Report the most frequent cleaned tokens for each sufficiently large cluster.
unique_labels = set(db.labels_)
top_n = 5 # Number of top TF-IDF features; only used by the commented-out variant below
feature_names = np.array(vectorizer.get_feature_names_out())
# Sorted iteration keeps the report order reproducible (a set has no guaranteed order).
for label in sorted(unique_labels):
    # Skip noise (-1) and clusters with fewer than 10 members.
    if cluster_freq[label] < 10 or label == -1:
        continue
    indices = np.where(db.labels_ == label)[0]
    indices = indices.tolist()
    # `indices` are row positions, so select positionally with .iloc rather
    # than by index label.
    sentences = processed_corpus.iloc[indices].tolist()
    words = []

    for sentence in sentences:
        # Convert the sentence to lowercase and split it into words
        words += sentence.lower().split()
    # words = remove_sw(words)
    # Count the frequency of each word using the Counter class
    word_counts = Counter(words)

    # Get the 15 most common words
    top_words = word_counts.most_common(15)

    # Include the cluster label so each list can be attributed to its cluster.
    print(f"Cluster #{label} top words: {top_words}")

    # cluster_vecs = X[indices].toarray()
    # centroid_vec = np.mean(cluster_vecs, axis=0)
    # # print(np.max(centroid_vec))
    # # print(np.sort(centroid_vec))
    # centroid_vec = np.argsort(centroid_vec)
    # # print(centroid_vec)
    # top_features_indices = centroid_vec[::-1][:top_n]
    # top_features = feature_names[top_features_indices]
    # # print(top_features)
    # print(f"Cluster #{label} top features: {', '.join(top_features)}")


In [None]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# import seaborn as sns

# # # Apply t-SNE to reduce the dimensionality of the data to 2D
# # tsne = TSNE(n_components=2, random_state=42)
# # X_tsne = tsne.fit_transform(X_norm)

# # # Plot the clusters
# # plt.scatter(X_tsne[:,0], X_tsne[:,1], c=db.labels_)
# # plt.title("DBSCAN Clustering of Emotional Cause Dataset")
# # plt.show()


In [None]:
# X_norm_filtered = []
# labels_filtered = []
# idx = 0
# for idx in range(len(X_norm)):
#     if db.labels_[idx] < 2:
#         continue
#     # print (db.labels_[idx], X_norm[idx])
#     X_norm_filtered.append(X_norm[idx])
#     labels_filtered.append(db.labels_[idx])

# X_norm_filtered = np.array(X_norm_filtered)

In [None]:
# # Apply t-SNE to reduce the dimensionality of the data to 2D
# tsne = TSNE(n_components=2, random_state=42)
# X_tsne = tsne.fit_transform(X_norm_filtered)

# # # Plot the clusters
# # plt.scatter(X_tsne[:,0], X_tsne[:,1], c=labels_filtered)
# # plt.title("DBSCAN Clustering of Emotional Cause Dataset")
# # plt.show()

# # Define a list of colors with one color per scatter plot
# num_scatter_plots = len(set(labels_filtered))
# colors = [
#     '#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#00FFFF', 
#     '#FF00FF', '#800000', '#008000', '#000080', '#808000', 
#     '#800080', '#008080', '#C00000', '#00C000', '#0000C0', 
#     '#C0C000', '#C000C0', '#00C0C0', '#400000', '#004000', 
#     '#000040', '#404000', '#400040', '#004040', '#200000', 
#     '#002000', '#000020', '#202000', '#200020', '#002020', 
#     '#600000', '#006000', '#000060', '#606000', '#600060', 
#     '#006060', '#A00000', '#00A000', '#0000A0', '#A0A000', 
#     '#A000A0', '#00A0A0', '#E00000', '#00E000', '#0000E0', 
#     '#E0E000', '#E000E0', '#00E0E0', '#100000', '#001000', 
#     '#000010', '#101000', '#100010', '#001010', '#500000', 
#     '#005000', '#000050', '#505000', '#500050', '#005050', 
#     '#900000', '#009000', '#000090', '#909000', '#900090', 
#     '#009090', '#D00000', '#00D000', '#0000D0', '#D0D000', 
#     '#D000D0', '#00D0D0', '#300000', '#003000', '#000030', 
#     '#303000', '#300030', '#003030', '#700000', '#007000', 
#     '#000070', '#707000', '#700070', '#007070', '#B00000', 
#     '#00B000', '#0000B0', '#B0B000', '#B000B0', '#00B0B0', 
#     '#F00000', '#00F000', '#0000F0', '#F0F000', '#F000F0', 
#     '#00F0F0'
# ]
# print(colors)

# # Plot the scatter plots with different colors based on their index
# fig, ax = plt.subplots()
# for i, label in enumerate(set(labels_filtered)):
#     X_plot = X_tsne[labels_filtered == label]
#     ax.scatter(X_plot[:, 0], X_plot[:, 1], label=label, c=colors[i])

# # Add a legend and title to the plot
# ax.set_title("DBSCAN Clustering of Emotional Cause Dataset")
# # ax.legend()
# plt.show()

## Topic Modeling

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation


# # Perform topic modeling with LDA
# lda = LatentDirichletAllocation(n_components=5, random_state=0)
# lda.fit(X)

# # Print the top words for each topic
# feature_names = vectorizer.get_feature_names_out()
# for topic_idx, topic in enumerate(lda.components_):
#     top_words_indices = topic.argsort()[:-6:-1]
#     top_words = [feature_names[i] for i in top_words_indices]
#     print(f"Topic #{topic_idx}: {' '.join(top_words)}")


In [None]:
# %pip install pyLDAvis

In [None]:
# import pyLDAvis
# import pyLDAvis.gensim_models as gensimvis
# import gensim
# from gensim import corpora

In [None]:
# text_clean=[]

# for text in processed_corpus:
#     text_clean.append(text.split())


# dictionary = corpora.Dictionary(text_clean)
# text_term_matrix = [dictionary.doc2bow(text) for text in text_clean]

# number_of_topics = 73

# LDA = gensim.models.ldamodel.LdaModel
# ldamodel = LDA(text_term_matrix, num_topics=number_of_topics, id2word = dictionary, passes=10)

# # Show Topics
# display(ldamodel.show_topics(formatted=True))

In [None]:
# pyLDAvis.enable_notebook()
# vis = gensimvis.prepare(ldamodel, text_term_matrix, dictionary)
# vis