In [None]:
# read in cluster data

import pandas as pd

time = 3
path = f"data/en_engagement_clusters_time{time}.csv"
#path = "data/en_engagement_clusters_full.csv"
en_chirper_data = pd.read_csv(path, usecols=["chirper", "sample_chirps", "cluster"])
#en_chirper_data = pd.read_csv(path, usecols=["name", "sample_chirps", "label"])
en_chirper_data = en_chirper_data.rename(
    columns={"label": "cluster", "name": "chirper"}
)

# Clean chirps

In [None]:
en_chirper_data.cluster.nunique()

In [None]:
import re

# Remove chinese characters
en_chirper_data["sample_chirps"] = en_chirper_data["sample_chirps"].apply(
    lambda x: re.sub(r"[\u4e00-\u9fff]+", "", str(x))
)
# remove all puctuation
en_chirper_data["sample_chirps"] = en_chirper_data["sample_chirps"].apply(
    lambda x: re.sub(r"[^\w\s]", "", str(x))
)

# Encode chirps and get semantic centroids

In [None]:
# Use `all-MiniLM-L6-v2` to encode each chirper
from sentence_transformers import SentenceTransformer
import pickle

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

embeddings = model.encode(
    en_chirper_data["sample_chirps"].to_list(), show_progress_bar=True
)

#with open(f'data/embeddings/engagement_time{time}_chirper_embeddings.pkl', 'wb') as f:
with open(f"data/embeddings/engagement_full_chirper_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

In [None]:
import pickle

with open(f'data/embeddings/engagement_time{time}_chirper_embeddings.pkl', 'rb') as f:
#with open(f"data/embeddings/engagement_full_chirper_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

cluster_semantic_centroids = pd.DataFrame(
    embeddings
)
cluster_semantic_centroids["cluster"] = en_chirper_data["cluster"].to_list()
cluster_semantic_centroids["chirper"] = en_chirper_data["chirper"].to_list()
cluster_semantic_centroids = cluster_semantic_centroids.groupby("cluster").mean()

global_semantic_centroid = embeddings.mean(axis=0)

all_centroids = pd.concat(
    [cluster_semantic_centroids, pd.DataFrame([global_semantic_centroid])], axis=0
)

all_centroids.index = ["cluster_" + str(i) for i in all_centroids.index[:-1]] + [
    "global"
]

#all_centroids.to_csv(f"data/embeddings/engagement_time{time}_cluster_centroids.csv")
#all_centroids.to_csv(f"data/embeddings/engagement_full_cluster_centroids.csv")

# Get semantic distances

In [None]:
from scipy.spatial.distance import cosine

for idx, chirper in enumerate(en_chirper_data["chirper"].to_list()):
    their_cluster = en_chirper_data.loc[idx, "cluster"]
    en_chirper_data.loc[idx, "semantic_distance_to_cluster"] = cosine(
        embeddings[idx], cluster_semantic_centroids.loc[their_cluster]
    )
    en_chirper_data.loc[idx, "semantic_distance_to_global"] = cosine(
        embeddings[idx], global_semantic_centroid
    )
    if idx % 1000 == 0: print(idx, end=" ")

In [None]:
# save data for analysis in R
#en_chirper_data.to_csv(f"data/networks/en_engagement_clusters_time{time}.csv", index=False)
en_chirper_data.to_csv(f"data/networks/en_engagement_clusters_full.csv", index=False)

# Get visualisations

In [None]:
# make a cluster data frame that has:
# - cluster
# - cluster size
# - cluster key words - join all the chirpers' key words

cluster_data = en_chirper_data.groupby("cluster").agg(
    {
        "chirper": "count",
        "sample_chirps": lambda x: " ".join(x),
    }
)
cluster_data = cluster_data.rename(columns={"chirper": "cluster_size"})

In [None]:
# use tf-idf to get the top 3 words for each cluster, concatenate into a label
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix: csr_matrix = vectorizer.fit_transform(cluster_data["sample_chirps"])
feature_names = vectorizer.get_feature_names_out()


def get_key_words(i, tfidf_matrix, feature_names):
    cluster_document = tfidf_matrix[i]
    non_zero_indices = cluster_document.nonzero()[1]
    tfidf_scores = zip(feature_names[non_zero_indices], cluster_document.data)
    sorted_tfidf_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    key_words = [word for word, score in sorted_tfidf_scores[:3]]
    return " ".join(key_words)


# extract the top 3 words for each cluster
num_clusters = len(cluster_data)
cluster_data["top_3_words"] = [get_key_words(i, tfidf_matrix, feature_names) for i in range(num_clusters)]

#cluster_data.to_csv(f"data/networks/en_engagement_clusters_time{time}_cluster_data.csv")
cluster_data.to_csv(f"data/networks/en_engagement_clusters_full_cluster_data.csv")

In [None]:
### Use `UMAP` for dimensionality reduction to visualise clusters in 2D
import umap

chirper_embeddings_2d = umap.UMAP(
    n_neighbors=15, n_components=2, metric="cosine"
).fit_transform(embeddings)
chirper_embeddings_2d = pd.DataFrame(chirper_embeddings_2d)

chirper_embeddings_2d["cluster"] = en_chirper_data["cluster"].to_list()
chirper_embeddings_2d["chirper"] = en_chirper_data["chirper"].to_list()

In [None]:
# plot clusters in 2D
import matplotlib.pyplot as plt
import seaborn as sns

global_centroid_2d = chirper_embeddings_2d.iloc[:, :-2].mean(axis=0).to_list()

sns.set_style("whitegrid")
plt.figure(figsize=(9, 9))
plt.scatter(
    chirper_embeddings_2d.iloc[:, 0],
    chirper_embeddings_2d.iloc[:, 1],
    # colour by cluster
    c=chirper_embeddings_2d["cluster"],
    cmap="tab20",
    alpha=0.8,
    s=5,
)

plt.title(f"Semantic Distribution at Time {time}")
#plt.title(f"Distribution of full engagement communities' semantic centroids")
plt.savefig(f"nlp_results/engagement_time{time}_semantic_distribution.png")
#plt.savefig(f"nlp_results/engagement_full_semantic_clusters.png")
plt.show()

In [None]:
# produce a word cloud for some chosen clusters
from wordcloud import WordCloud
chosen_clusters = [1, 2, 3, 4]
num_chirpers = len(en_chirper_data)
for cluster in chosen_clusters:
    wordcloud = WordCloud(
        background_color="white",
        width=1000,
        height=500,
        max_words=50,
        contour_width=3,
        contour_color="steelblue",
    ).generate(cluster_data.loc[cluster, "sample_chirps"])
    plt.figure(figsize=(9, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    cluster_size = cluster_data.loc[cluster, "cluster_size"]
    plt.title(
        #f"English Engagement Network T{time} Cluster {cluster} | N = {cluster_size} [{cluster_size/num_chirpers:.2%}]"
        f"English Engagement Network Full Cluster {cluster} | N = {cluster_size} [{cluster_size/num_chirpers:.2%}]"
    )
    #plt.savefig(f"nlp_results/engagement_time{time}_cluster{cluster}_wordcloud.png")
    plt.savefig(f"nlp_results/engagement_full_cluster{cluster}_wordcloud.png")
    plt.show()

# Get cluster distance summary stats

In [None]:
#time = 3
#path = f"data/embeddings/engagement_time{time}_cluster_centroids.csv"
path = "data/embeddings/engagement_full_cluster_centroids.csv"
cluster_centroids = pd.read_csv(path, index_col=0)

In [None]:
# cosine distance from each cluster to the global centroid
from scipy.spatial.distance import cosine
from numpy import mean, std


distances_to_global = cluster_centroids.apply(
    lambda x: cosine(x, cluster_centroids.loc["global"]), axis=1
)
print(mean(distances_to_global), std(distances_to_global))

# get confidence interval
from scipy.stats import t

confidence = 0.95
n = len(distances_to_global)
m = mean(distances_to_global)
std_err = std(distances_to_global) / n ** 0.5
h = std_err * t.ppf((1 + confidence) / 2, n - 1)

print(m - h, m + h)

# do a t-test to see if higher than 0,
from scipy.stats import ttest_1samp

data = distances_to_global
stat, p = ttest_1samp(data, 0)
d = (mean(data) - 0) / (std(data))

print("Statistics=%.3f, p=%.3f" % (stat, p))
print("Cohen's d=%.3f" % (d))

len(distances_to_global)

T1: N(Cluster) = 35
- Mean, SD: 0.06579766958060736 0.03724406952116002
- 95%CI: 0.05300388407331168 0.07859145508790304
- Statistics=10.301, p=0.000
- Cohen's d=1.767

T2: N(Cluster) = 20
- 0.016283563254110643 0.008962746575326903
- 0.012088868735766947 0.02047825777245434
- Statistics=7.919, p=0.000
- Cohen's d=1.817

T3: N(Cluster) = 12
- 0.028265152586819293 0.019004585208362488
- 0.016190215225181194 0.04034008994845739
- Statistics=4.933, p=0.000
- Cohen's d=1.487

Full: N = 4
- 0.03697118019196259 0.019783590353789918
- 0.012406608294450335 0.06153575208947484
- Statistics=3.738, p=0.020
- Cohen's d=1.869