In [None]:
import string

%pip install nltk
%pip install pandas
%pip install scikit-learn

import pandas as pd
import nltk
from nltk.corpus import wordnet

# Punkt tokenizer data, needed by nltk.word_tokenize in text_cleaning below.
nltk.download('punkt')


from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

# NLTK data used by this notebook.
# Recent NLTK releases (3.9+) look up the tokenizer and POS-tagger models under
# the names 'punkt_tab' and 'averaged_perceptron_tagger_eng', while older
# releases use 'punkt' and 'averaged_perceptron_tagger'. The `%pip install nltk`
# above is unpinned, so fetch both variants to keep the notebook runnable on a
# fresh environment with either generation of the library.
for resource in (
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
):
    nltk.download(resource)



## Pre-processing

In [None]:
# Shared NLP helpers, built once and reused by text_cleaning.
porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# NLTK's English stop-word list.
sw = stopwords.words('english')


def text_cleaning(text):
    """Normalise free text into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. drop characters that are not in ``string.printable``;
      2. replace every run of non-alphanumeric characters with one space;
      3. lowercase;
      4. tokenize, POS-tag and lemmatize each token with WordNet
         (noun / verb / adverb / adjective tags only, other tokens are kept);
      5. apply the Porter stemmer to every lemma.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing and yield ``''``
            instead of being turned into the literal tokens "none" / "nan".

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when nothing
        alphanumeric remains.
    """
    # Missing values would otherwise be stringified ("None" / "nan") and end up
    # in the corpus as if they were real words. `text != text` is the NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # Characters outside string.printable are removed outright (not replaced by
    # a space), so e.g. accented letters do not split a word in two.
    printable = set(string.printable)
    text = ''.join(ch for ch in text if ch in printable)

    # Anything that is not a letter or digit (punctuation, newlines, NULs, ...)
    # becomes a single separating space.
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    text = text.lower().strip()

    # Nothing left to tokenize; skip the NLTK pipeline entirely.
    if not text:
        return ''

    # Map the first letter of the Penn Treebank tag to the WordNet POS constant.
    wn_tags = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}

    stemmed_words = []
    for word, pos in nltk.pos_tag(nltk.word_tokenize(text)):
        wn_tag = wn_tags.get(pos[0])
        # Lemmatize only when the tag maps onto a WordNet part of speech.
        lemma = lemmatizer.lemmatize(word, wn_tag) if wn_tag is not None else word
        stemmed_words.append(porter_stemmer.stem(word=str(lemma)))

    return ' '.join(stemmed_words).strip()


In [None]:
# Smoke test: run text_cleaning on a sample sentence containing punctuation and version numbers.
text_cleaning("I'm getting the same error, on 2.9.0 but I reproduced it in 2.8.0 and 2.9.1 too.")

In [None]:
# Load the dataset; the cells below read its "Comment" and "Emotion Causes" columns.
df = pd.read_csv("anger_causes.csv")

In [None]:
# The "Comment" column of the dataset; stored alongside each clustered text further down.
comments = df["Comment"]
comments

In [None]:
# The "Emotion Causes" column is the text that gets cleaned, vectorised and clustered below.
corpus = df["Emotion Causes"]
corpus

In [None]:
# Run the cleaning pipeline over every entry of the corpus.
processed_corpus = corpus.map(text_cleaning)
processed_corpus

In [None]:
# TF-IDF over unigrams, using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words="english",
)
X = vectorizer.fit_transform(processed_corpus)

In [None]:
# Standardise a dense copy of the TF-IDF matrix.
# NOTE(review): the DBSCAN cell below is fit on X, not on X_norm — confirm
# whether this standardised matrix is still needed.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X.toarray())

In [None]:
# Number of rows in the standardised matrix.
X_norm.shape[0]

## Clustering (DBSCAN)

In [None]:
# Cluster the TF-IDF vectors with DBSCAN using cosine distance.
db = DBSCAN(metric='cosine', min_samples=3, eps=0.45).fit(X)
labels = db.labels_

# DBSCAN assigns the label -1 to noise points. That label is not a cluster, so
# it has to be left out of the cluster count.
no_noise = int(np.sum(labels == -1))
no_clusters = len(np.unique(labels)) - (1 if no_noise > 0 else 0)

print('Estimated num of clusters: %d' % no_clusters)
print('Estimated num of noise points: %d' % no_noise)

In [None]:
# Raw label per document, followed by the number of documents per label.
print(db.labels_)
unique, counts = np.unique(db.labels_, return_counts=True)
print({label: count for label, count in zip(unique, counts)})

In [None]:
# Label -> member count, then the same pairs ordered from largest to smallest.
cluster_freq = dict(zip(unique, counts))
print(type(cluster_freq))
print(
    sorted(
        cluster_freq.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

In [None]:
# Obtain a representative "text point" for each sufficiently large cluster:
# the document whose TF-IDF vector has the highest cosine similarity to the
# cluster centroid.
MIN_CLUSTER_SIZE = 10  # clusters with fewer members are skipped

text_points = []
unique_labels = set(db.labels_)
cluster_ids = []  # rows of [cluster label, comment, emotion-cause text]

# Sorted so clusters are always processed (and printed) in label order.
for label in sorted(unique_labels):
    # -1 is DBSCAN's noise label, not a real cluster.
    if label == -1 or cluster_freq[label] < MIN_CLUSTER_SIZE:
        continue

    # np.where returns row *positions*, so every Series lookup below uses
    # .iloc; plain corpus[j] is label-based and only matches positions while
    # the DataFrame keeps its default RangeIndex.
    cluster_indices = np.where(db.labels_ == label)[0]
    cluster_docs = corpus.iloc[cluster_indices].tolist()  # not used further in this cell

    cluster_vecs = X[cluster_indices]
    centroid_vec = np.mean(cluster_vecs.toarray(), axis=0)
    similarity_scores = cosine_similarity(cluster_vecs, [centroid_vec])

    # argmax is relative to the cluster; map it back to a corpus row position.
    text_point_index = cluster_indices[np.argmax(similarity_scores)]
    text_point = corpus.iloc[text_point_index]
    text_points.append(text_point)
    print(f"Cluster #{label} text point: {text_point}")

    for idx in cluster_indices:
        cluster_ids.append([label, comments.iloc[idx], corpus.iloc[idx]])
