In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.cluster import pair_confusion_matrix
from sklearn import metrics
import sklearn.metrics.pairwise as smp
import sklearn.cluster as sc
import sentence_transformers as st
import csv
import re

In [2]:
# Load the comments to cluster: the first column of every CSV row, lower-cased.
sentences = []
with open("../coi_comments_clean.csv", newline='') as csvfile:
    reader = csv.reader(csvfile)
    # Discard the first row (presumably the header). The default keeps an
    # empty file from raising StopIteration.
    next(reader, None)
    for row in reader:
        # csv.reader yields an empty list for a blank line; indexing row[0]
        # on it would raise IndexError, so such rows are skipped.
        if row:
            sentences.append(row[0].lower())
# Number of comments that were loaded.
n = len(sentences)

In [3]:
def cluster_and_evaluate(texts, distances, p=False, distance_threshold=0.28,
                         keyword_pattern='haystaq|obama|bernie|sanders',
                         clustering_path="../sbert_agg_clustering.csv",
                         keyword_path="../sbert_agg_keyword_mixture.csv"):
    """Cluster texts from a precomputed distance matrix and print a report.

    Runs average-linkage agglomerative clustering with a distance threshold
    (the number of clusters is not fixed in advance), prints the silhouette
    coefficient, then prints every cluster with its member texts and the
    fraction of members matching ``keyword_pattern``.

    Parameters
    ----------
    texts : iterable of str
        The documents, in the same order as the rows of ``distances``.
    distances : array-like of shape (n_samples, n_samples)
        Precomputed pairwise distances between the documents.
    p : bool, default False
        When true, also write two CSV files: the texts with their cluster
        label (sorted by label) to ``clustering_path``, and the per-cluster
        keyword fraction to ``keyword_path``.
    distance_threshold : float, default 0.28
        Linkage distance at or above which clusters are not merged.
    keyword_pattern : str, default 'haystaq|obama|bernie|sanders'
        Regular expression searched for in each text.
    clustering_path, keyword_path : str
        Output locations used when ``p`` is true.

    Returns
    -------
    None
        Results are printed (and optionally written to disk) only.

    Raises
    ------
    ValueError
        If the number of texts differs from the number of rows in
        ``distances``.

    Notes
    -----
    The value printed as "Percentage of keyword comments" and stored in the
    ``keyword_percentage`` column is a fraction in [0, 1], not a percentage.
    """
    # Materialise once so generators work and the length can be checked.
    texts = list(texts)
    n_rows = np.shape(distances)[0]
    if len(texts) != n_rows:
        # zip() below would silently truncate and misalign texts and labels.
        raise ValueError(
            f"got {len(texts)} texts but a distance matrix with {n_rows} rows")

    shared_kwargs = dict(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage='average')
    try:
        # scikit-learn >= 1.2 names this parameter `metric`.
        clustering = sc.AgglomerativeClustering(metric='precomputed', **shared_kwargs)
    except TypeError:
        # Older scikit-learn only knows the (since removed) `affinity` name.
        clustering = sc.AgglomerativeClustering(affinity='precomputed', **shared_kwargs)
    clustering.fit(distances)
    labels = clustering.labels_
    # np.unique returns the distinct labels in ascending order, which makes
    # the order of the report and of the exported rows explicit.
    cluster_ids = np.unique(labels)

    print("\n")
    print("=== Unsupervised Metric ===\n")
    print("== Silhouette Coefficient [-1,1*]==")
    if 2 <= len(cluster_ids) <= len(labels) - 1:
        print(metrics.silhouette_score(distances, labels, metric='precomputed'),"\n")
    else:
        # silhouette_score raises ValueError outside this range (a single
        # cluster, or every sample in its own cluster).
        print("undefined (needs between 2 and n_samples - 1 clusters)","\n")
    print("\n")
    print("=== Clusters ===\n")
    clusters = {i: [] for i in cluster_ids}
    print("Number of Clusters: ",len(clusters),"\n")
    for i, text in zip(labels, texts):
        clusters[i].append(text)

    # Compile once instead of handing the pattern to re.search per text.
    keyword_re = re.compile(keyword_pattern)
    cluster_column = []
    fraction_column = []
    for i, cluster in clusters.items():
        print(f'== Cluster {i} ==',"\n")
        hits = 0
        for text in cluster:
            if keyword_re.search(text):
                hits += 1
            print(f'* {text}',"\n")
        # Every cluster has at least one member, so the division is safe.
        fraction = hits / len(cluster)
        cluster_column.append(i)
        fraction_column.append(fraction)
        print("Percentage of keyword comments: ",fraction,"\n")

    if p:
        assignments = pd.DataFrame({"data": texts, "labels": labels})
        assignments = assignments.sort_values(by=['labels'])
        assignments.to_csv(clustering_path, index=False)
        keyword_mix = pd.DataFrame(
            {"cluster": cluster_column, "keyword_percentage": fraction_column})
        keyword_mix.to_csv(keyword_path, index=False)

In [4]:
# Driver cell: embed the loaded comments with a pretrained SentenceTransformer
# model, derive cosine distances from the embeddings, then cluster and report.
print("=== SentenceTransformer ===\n")
# Model is looked up by name; presumably downloaded on first use — TODO confirm.
model = st.SentenceTransformer('paraphrase-distilroberta-base-v1')
# `sentences` comes from the CSV-loading cell above.
embeddings = model.encode(sentences)
# Cosine distances computed from the embeddings; handed to the clustering
# function below as its precomputed distance input.
embed_dist = smp.cosine_distances(embeddings)
# Third positional argument is p=True, so the result CSVs are written as well.
cluster_and_evaluate(sentences, embed_dist,True)

=== SentenceTransformer ===



=== Unsupervised Metric ===

== Silhouette Coefficient [-1,1*]==
0.047541816 



=== Clusters ===

Number of Clusters:  800 

== Cluster 0 == 

* my comments are simple and straightforward. it is vitally important to our democracy that the irc follow the provisions of voter approved prop 106 to ensure that the new districts which will impact all of us for the next 10 years are fair & competitive. our democracy cannot survive with one party in power with no obligation to listen to & address the needs of all constituents, not just those who voted for them. further, fair and competitive districts will protect against the extremism that we are currently seeing across the nation which is again, a threat to our democracy.nadditionally, i cannot stress enough how important it is that the voting power of the communities of color be protected pursuant to the voting rights act. over the last 10 years, arizona has become a more diverse state and we must ensure that 


* our community is more east to west from springerville to show low and the whole white mtn area our main concerns  are the forest conditions, wolf predations , hunting, fishing and tourismnour concerns are never addressed and i feel we are poorly represented 

Percentage of keyword comments:  0.0 

== Cluster 249 == 

* i recently moved from mid-town tucson to oro valley, so my community of interest extends beyond my current ov neighborhood as i continue to go to the mid-town and downtown .now living in oro valley, i'm learning about the events, community concerns, shopping and cultural affairs here.   covid and the delta variant are concerns.  most wear masks in public spaces as public safety is valued. the population is homogeneous, mostly middle and upper middle class. average age is about 50, income over $70,000 a year, over 50% have a bachelors degree or higher.  many people go to church. charitable institutions like the food bank are strongly supported.   the police are respect

* anthem is a "corporation" with homes, country club, business and services.  we are most economically and socially connected to the city of phoenix and the northern section of maricopa county.  many of us must travel south on hwy 17 into phoenix to attend medical appointments, to make major purchases (ie., vehicles, furniture, recreational equipment, etc.) and rely on water, power and other utility services from the phoenix area.  part of the anthem community literally has water service from phoenix, and rely on public safety from the phoenix police department.  other parts of our community rely o the maricopa cy sheriff's department for public safety.  we are most closely linked with and associated with phoenix and maricopa county, rather than yavapai county.  the  northern most border for the city of phoenix is approximately one mile south of the southern border of anthem.  it makes sense to include anthem with phoenix for redistricting purposes.n 

Percentage of keyword comments:  

Percentage of keyword comments:  0.0 

== Cluster 677 == 

* public lands issues (mainly coconino national forest, but also multiple nps units), healthcare facilities, transportation (highways, airport, train), watershed and ground water, wildfire, air quality, environmental conditions, natural beauty, outdoor recreation, real estate issues (including viewsheds and natural beauty), affordable housing issues, employers (of all sizes), tourism and related issues (including affordable housing, wildfire, traffic), education (including nau), ranching and agriculture, and culture and history (indigenous, hispanic, caucasian). nflagstaff services not just city residents, but a very large region of the state, including western and southern navajo nation, hopi, small towns like williams, and the vast rural areas for 60 or more miles in any direction. it is the central hub of "northern arizona," and the city and surrounding area are linked by jobs, workers, shopping, healthcare, transportation, 