In [1]:
import numpy as np
import tomotopy as tp
from scipy.spatial.distance import cdist
import pandas as pd
from sklearn.metrics.cluster import pair_confusion_matrix
from sklearn import metrics
import sklearn.metrics.pairwise as smp
import sklearn.cluster as sc
import sentence_transformers as st
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from dateutil import parser
import csv
import string
import re
from prefixspan import PrefixSpan
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorenchamplin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lorenchamplin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lorenchamplin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def lowercase(input):
  """
  Returns lowercase text
  """
  return input.lower()

def remove_punctuation(input):
  """
  Returns text without punctuation
  """
  return input.translate(str.maketrans('','', string.punctuation+'–’”“—·'))

def remove_whitespaces(input):
  """
  Returns text without extra whitespaces
  """
  return " ".join(input.split())
  
def remove_html_tags(input):
  """
  Returns text without HTML tags
  """
  soup = BeautifulSoup(input, "html.parser")
  stripped_input = soup.get_text(separator=" ")
  return stripped_input

def tokenize(input):
  """
  Returns tokenized version of text
  """
  return word_tokenize(input)

def remove_stop_words(input):
  """
  Returns text without stop words
  """
  input = word_tokenize(input)
  return [word for word in input if word not in stopwords.words('english') or word == "no" or word == "not"]

def lemmatize(input):
  """
  Lemmatizes input using NLTK's WordNetLemmatizer
  """
  lemmatizer=WordNetLemmatizer()
  input_str=word_tokenize(input)
  new_words = []
  for word in input_str:
    new_words.append(lemmatizer.lemmatize(word))
  return ' '.join(new_words)


def nlp_pipeline(input):
  """
  Function that calls all other functions together to perform NLP on a given text
  """
  return lemmatize(' '.join(remove_stop_words(remove_whitespaces(remove_punctuation(remove_html_tags(lowercase(input)))))))

In [3]:
def x_in_y(query, base):
    try:
        l = len(query)
    except TypeError:
        l = 1
        query = type(base)((query,))

    for i in range(len(base) - l + 1):
        if base[i:i+l] == query:
            return True
    return False

In [4]:
sentences = []
cleaned_s = []
with open("../comments_2.csv", newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        s = re.sub('hay[a-z]*','haystaq',row[0].lower())
        s = re.sub("o’bama|o’\nbama","obama",s)
        sentences.append(s)
        cleaned_s.append(nlp_pipeline(s).split())
n = len(sentences)

In [5]:
ps = PrefixSpan(cleaned_s)
ps.minlen = 3
ps.maxlen = 12

In [6]:
common_phrases = ps.frequent(50,closed=True)

In [7]:
def find_prototype(cluster,dist_matrix):
    min_mmd2 = np.inf
    min_proto = cluster[0]
    for c in cluster:
        z_x = 0
        for c_z in cluster:
            z_x = z_x + dist_matrix[c][c_z]
        z_x = (2/len(cluster))*z_x
        
        x_x = 0
        for c_x_i in cluster:
            for c_x_j in cluster:
                x_x = x_x + dist_matrix[c_x_i][c_x_j]
        x_x = (1/(len(cluster)*len(cluster)))*x_x
        mmd2 = x_x - z_x
        if mmd2 < min_mmd2:
            min_proto = c
            min_mmd2 = mmd2
    return min_proto

In [8]:
def cluster_and_evaluate(texts, common_phrases,distances,d1 = 0.2,d2 = 0.1,k=4,p=False):
    clustering = sc.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=d1,
        affinity='precomputed',
        linkage='average')
    clustering.fit(distances)
    clusters = {i: [] for i in set(clustering.labels_)}
    for i, text in zip(clustering.labels_, texts):
        clusters[i].append(text)
    idx_clusters = {i: [] for i in set(clustering.labels_)}
    for i, t in enumerate(clustering.labels_):
        idx_clusters[t].append(i)
    mdl = tp.LDAModel(tw=tp.TermWeight.PMI,k=k,seed=2021)
    data = []
    for i,c in idx_clusters.items():
        p = nlp_pipeline(texts[find_prototype(c,distances)]).split()
        for _,j in common_phrases:
            if x_in_y(j,p):
                p.append(str(j))
        if p:
            mdl.add_doc(p)
            data.append(p)
    for i in range(0, 5000, 10):
        mdl.train(10)
    print(mdl.summary(),"\n")
    samples = []
    for i in data:
        samples.append(mdl.make_doc(i))
    X = mdl.infer(samples,5000)
    X = np.array(X[0])
    pdist = cdist(X,X,'jensenshannon')
    clusteringLDA = sc.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=d2,
        affinity='precomputed',
        linkage='average')
    clusteringLDA.fit(pdist)
    labels = []
    t = []
    for i,l in enumerate(clusteringLDA.labels_):
        for c in idx_clusters[i]:
            labels.append(l)
            t.append(texts[c])
    print("\n")
    print("=== Unsupervised Metric ===\n")
    print("== Silhouette Coefficient [-1,1*]==")
    print(metrics.silhouette_score(pdist, clusteringLDA.labels_, metric='precomputed'),"\n")
    print("\n")
    print("=== Clusters ===\n")
    f_clusters = {i: [] for i in set(labels)}
    print("Number of Clusters: ",len(f_clusters),"\n")
    for i, text in zip(labels, t):
        f_clusters[i].append(text)
    c = []
    per = []
    for i, cluster in f_clusters.items():
        print(f'== Cluster {i} ==',"\n")
        x = 0
        for text in cluster:
            if re.search('haystaq|obama|bernie|sanders',text):
                x = x + 1 
            print(f'* {text}',"\n")
        c.append(i)
        per.append(x/len(cluster))
        print("Percentage of keyword comments: ",x/len(cluster),"\n")
    if p:
        data = {"data": texts,"labels": labels}
        df = pd.DataFrame(data)
        df = df.sort_values(by=['labels'])
        df.to_csv("../sbert_lda_agg_prototype_clustering.csv",index=False)
        kdata = {"cluster": c, "keyword_percentage": per}
        kdf = pd.DataFrame(kdata)
        kdf.to_csv("../sbert_lda_agg_prototype_keyword_mixture.csv",index=False)

In [25]:
print("=== SentenceTransformer ===\n")
model = st.SentenceTransformer('paraphrase-distilroberta-base-v1')
embeddings = model.encode(sentences)
embed_dist = smp.cosine_distances(embeddings)
cluster_and_evaluate(sentences, common_phrases,embed_dist,0.23,0.12,2,True)

=== SentenceTransformer ===

<Basic Info>
| LDAModel (current version: 0.12.2)
| 517 docs, 17143 words
| Total Vocabs: 3132, Used Vocabs: 3132
| Entropy of words: 6.71161
| Entropy of term-weighted words: 7.43148
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 5000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -7.51323
|
<Initial Parameters>
| tw: TermWeight.PMI
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 2 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 2021 (random seed)
| trained in version 0.12.2
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distrib