In [None]:
import string

# %pip install nltk
# %pip install pandas
# %pip install scikit-learn

import pandas as pd
import nltk
from nltk.corpus import wordnet



from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')

## Pre-processing

In [None]:
# Shared NLTK helpers for the pre-processing step.
# `sw` (English stop words) and `porter_stemmer` are not used by any active
# code visible in this section of the notebook; only `lemmatizer` is called
# by text_cleaning.
porter_stemmer = PorterStemmer()
sw = stopwords.words('english')

lemmatizer = WordNetLemmatizer()


# Characters kept by text_cleaning; built once instead of on every call.
_PRINTABLE_CHARS = frozenset(string.printable)

# An http(s) URL runs up to the next whitespace character. The previous
# pattern used the character class `[$-_@.&+]`, whose `$-_` is a range that
# happens to exclude `#`, `~`, `{`, `|` and `}`; URLs containing those were
# only partly removed and their tails leaked into the cleaned text.
_URL_RE = re.compile(r'https?://\S+')

# Any run of characters other than ASCII letters and digits.
_NON_ALNUM_RE = re.compile(r'[^0-9a-zA-Z]+')


def text_cleaning(text):
    """Normalise a piece of free text and lemmatise it.

    Steps, in order:
      1. Missing values (``None`` or float NaN) yield ``''`` instead of the
         literal words "none"/"nan".
      2. NUL, CR and LF are turned into spaces, then every character outside
         ``string.printable`` is dropped.
      3. http(s) URLs are removed.
      4. Every run of non-alphanumeric characters becomes a single space and
         the text is lower-cased.
      5. The text is tokenised and POS-tagged with NLTK, and each token whose
         tag starts with N, V, R or J is lemmatised with the module-level
         ``lemmatizer`` using the matching WordNet part of speech. Other
         tokens are kept unchanged.

    Stop-word removal and Porter stemming are not applied.

    Args:
        text: Any object; it is converted with ``str()`` unless it is a
            missing value.

    Returns:
        str: The cleaned, space-separated, lemmatised text. Empty when the
        input is missing or contains no ASCII letters/digits outside URLs.
    """
    # Missing CSV cells arrive as float NaN; str() would turn them into real
    # tokens ("nan") that end up in the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # Replace separators with spaces *before* the printable filter: NUL is not
    # in string.printable, so filtering first would delete it and glue the
    # neighbouring words together.
    for separator in ('\x00', '\r', '\n'):
        text = text.replace(separator, ' ')
    text = ''.join(ch for ch in text if ch in _PRINTABLE_CHARS)

    text = _URL_RE.sub('', text)
    text = _NON_ALNUM_RE.sub(' ', text).lower()

    # Nothing left to tokenise; the result is '' either way, so skip NLTK.
    if not text.strip():
        return ''

    # First letter of the tagger's POS tag -> WordNet POS constant.
    wn_tags = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}

    lemmatized_words = []
    for word, pos in nltk.pos_tag(nltk.word_tokenize(text)):
        wn_tag = wn_tags.get(pos[0])
        if wn_tag is None:
            # No WordNet equivalent for this tag: keep the token as is.
            lemmatized_words.append(word)
        else:
            lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))

    return ' '.join(lemmatized_words).strip()


In [None]:
# Quick manual check of the cleaner on a sample comment with version numbers.
sample_comment = "I'm getting the same error, on 2.9.0 but I reproduced it in 2.8.0 and 2.9.1 too."
text_cleaning(sample_comment)

In [None]:
# Load the input table; later cells read its "Comment" and "Emotion Causes"
# columns. The bare `df` on the last line displays the frame as cell output.
df = pd.read_csv(filepath_or_buffer="filtered_contributors.csv")
df

In [None]:
# Original comment text, kept alongside the causes for the final export.
comments = df.loc[:, "Comment"]

In [None]:
# The "Emotion Causes" column is the text that gets cleaned, embedded and clustered.
corpus = df.loc[:, "Emotion Causes"]

In [None]:
# Number of entries in the corpus (shown as the cell output).
corpus.shape[0]

In [None]:
# Run the cleaner over every corpus entry; the result keeps the corpus index.
processed_corpus = corpus.map(text_cleaning)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Embed every cleaned text with a pretrained sentence-transformer model.
model = SentenceTransformer('all-mpnet-base-v2')

# encode() is documented as taking a string or a list of strings. Handing it
# a plain list (instead of the pandas Series) keeps the call independent of
# the Series index; row order is unchanged, so embedding i still belongs to
# the i-th row of processed_corpus.
sentence_embeddings = model.encode(processed_corpus.tolist())

# Square matrix of cosine distances between all embeddings; it is passed to
# DBSCAN as a precomputed metric in the clustering cell.
pairwise_distances = cosine_distances(sentence_embeddings)

## Clustering (DBSCAN)

In [None]:
# Cluster on the precomputed cosine-distance matrix.
# eps is the neighbourhood radius and min_samples the number of neighbours
# needed for a core point (see the scikit-learn DBSCAN documentation).
db = DBSCAN(min_samples=4, eps=0.3, metric='precomputed').fit(pairwise_distances)
labels = db.labels_

# DBSCAN labels noise points -1. That label is not a cluster, so it must be
# left out of the cluster count (it used to inflate the figure by one
# whenever any noise point existed).
no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
no_noise = np.sum(labels == -1)

print('Estimated num of clusters: %d' % no_clusters)
print('Estimated num of noise points: %d' % no_noise)

In [None]:
# Raw DBSCAN label of every corpus row.
print(db.labels_)

# Count how many rows received each label and keep the tally as a dict
# (label -> row count); the export cell below the clustering reuses it.
unique, counts = np.unique(db.labels_, return_counts=True)
cluster_freq = dict(zip(unique, counts))
print(cluster_freq)
print(type(cluster_freq))

# Same tally as (label, count) pairs, most populated label first.
freq_by_size = sorted(cluster_freq.items(), key=lambda pair: pair[1], reverse=True)
print(freq_by_size)

In [None]:
# Print and export the members of every sufficiently large cluster.
text_points = []
unique_labels = set(db.labels_)

# Clusters are exported only when they have more than this many members.
MIN_MEMBERS_EXCLUSIVE = 7

# Running id of exported clusters, starting at 1. The same id is printed and
# written to the CSV; previously the counter was bumped between the two, so
# the printed id was one less than the saved one.
cluster_counter = 0

cluster_rows = []

# sorted() gives a deterministic, ascending label order (set order is not guaranteed).
for label in sorted(unique_labels):
    if label == -1:
        # -1 is DBSCAN's noise label, not a cluster.
        continue

    # Positions (0-based row offsets) of the rows assigned to this label.
    cluster_indices = np.where(db.labels_ == label)[0]
    # Measure the size from the labels themselves rather than from
    # `cluster_freq`, which lives in another cell and may be stale.
    if len(cluster_indices) <= MIN_MEMBERS_EXCLUSIVE:
        continue

    cluster_counter += 1
    print("")
    print(f"Cluster number: {cluster_counter}, member in cluster: {len(cluster_indices)}")
    for ids in cluster_indices:
        # np.where yields positions, so index positionally with .iloc;
        # plain [] is a label lookup and only matches on a default RangeIndex.
        print(corpus.iloc[ids])
        cluster_rows.append([comments.iloc[ids], corpus.iloc[ids], cluster_counter])


# NOTE(review): this rebinds `df`, so re-running the earlier cells that read
# df["Emotion Causes"] fails until the CSV is loaded again. The name is kept
# in case later cells depend on it; consider a dedicated name.
df = pd.DataFrame(cluster_rows, columns=['Comment', 'Cause', 'Cluster'])

# Save the DataFrame to a CSV file
df.to_csv('clusters.csv', index=False)
