In [None]:
import json
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap.umap_ as umap
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer

# Load the raw tweets: a tab-separated file whose "json_str" column holds one
# JSON-encoded tweet payload per row; the decoded payload goes into "dict".
css_authors = pd.read_csv("path/to/tweet/file.tsv", sep="\t")
css_authors["dict"] = css_authors["json_str"].apply(json.loads)

# Load the account status table (account IDs live in the "Unnamed: 0" column).
fake_accounts = pd.read_csv("path/to/status_file.csv")

# IDs of accounts whose status is "alive", converted to strings for comparison
# with the tweets' "author_id" field (presumably a string -- confirm against
# the payload schema).
alive_ids = [
    str(x) for x in fake_accounts.loc[fake_accounts["status"] == "alive", "Unnamed: 0"]
]
# Membership is tested once per tweet below; a set makes each test O(1)
# instead of a linear scan over the list. `alive_ids` itself stays a list.
alive_id_lookup = set(alive_ids)

# Keep English tweets created in 2023 or later ...
data = [
    x
    for x in css_authors["dict"]
    if (
        datetime.strptime(x["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").year >= 2023
    )
    and x["data"]["lang"] == "en"
]
# ... and of those, only the ones written by accounts that are still alive.
data_available_user = [x for x in data if x["data"]["author_id"] in alive_id_lookup]

In [None]:
# define sentence transformer
model = SentenceTransformer("all-mpnet-base-v2", device="cuda:0")

# extract and pre-process tweet texts
tweets = [x["data"]["text"] for x in data_available_user]
ignore_words = ["rt"]

# Normalise first (lower-case, drop ignored tokens, collapse whitespace) and
# de-duplicate afterwards. De-duplicating before normalisation lets texts that
# only differ by an ignored token or by spacing (e.g. "rt hello" / "hello")
# slip into the corpus twice.
normalized_tweets = (
    " ".join(w for w in t.lower().split() if w not in ignore_words) for t in tweets
)
# dict.fromkeys keeps first-seen order, so sentence indices (and with them the
# cluster numbering) are stable across kernel restarts, unlike iterating a set.
# Texts that are empty after normalisation carry no content and are dropped.
tweets_text_unique = [t for t in dict.fromkeys(normalized_tweets) if t]
corpus_sentences = list(tweets_text_unique)
max_corpus_size = len(corpus_sentences)  # lower this to cap the corpus size

# encode corpus
corpus_sentences = corpus_sentences[0:max_corpus_size]
print("Encode the corpus. This might take a while")
corpus_embeddings = model.encode(
    corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True
)

In [None]:
# Sanity check: size of the first dimension of the embedding tensor, i.e. the
# number of encoded corpus sentences.
len(corpus_embeddings)

In [None]:
print("Start clustering")
# perf_counter is a monotonic clock, so the elapsed time cannot be distorted
# by system clock adjustments.
start_time = time.perf_counter()

# Tunable parameters of the community detection:
# MIN_COMMUNITY_SIZE: only keep communities with at least this many sentences.
# SIMILARITY_THRESHOLD: sentence pairs with a cosine similarity above this
#   value are considered similar.
# CLUSTERING_BATCH_SIZE: forwarded as `batch_size` to community_detection.
# (An earlier configuration used min_community_size=8, threshold=0.4.)
MIN_COMMUNITY_SIZE = 47
SIMILARITY_THRESHOLD = 0.6
CLUSTERING_BATCH_SIZE = 1024

clusters = util.community_detection(
    corpus_embeddings,
    min_community_size=MIN_COMMUNITY_SIZE,
    threshold=SIMILARITY_THRESHOLD,
    batch_size=CLUSTERING_BATCH_SIZE,
)

print("Clustering done after {:.2f} sec".format(time.perf_counter() - start_time))

In [None]:
# For every cluster print a header with its 1-based number and size, then the
# index and text of its first ten members, an ellipsis line, and finally the
# text of its last member.
for cluster_no, members in enumerate(clusters, start=1):
    print("\nCluster {}, #{} Elements ".format(cluster_no, len(members)))
    leading, trailing = members[0:10], members[-1:]
    for idx in leading:
        print(idx)
        print("\t", corpus_sentences[idx])
    print("\t", "...")
    for idx in trailing:
        print("\t", corpus_sentences[idx])

In [None]:
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for a collection of documents.

    Each entry of ``documents`` is treated as one class. Term frequencies are
    normalised by the token count of the document, and the inverse document
    frequency of a term is ``log(m / total count of the term)``.

    Args:
        documents: Sequence of strings, one per class.
        m: Numerator of the IDF ratio (the caller passes the total number of
            individual documents in the corpus).
        ngram_range: Forwarded to ``CountVectorizer``.

    Returns:
        A ``(tf_idf, count)`` tuple: ``tf_idf`` is an array of shape
        ``(n_terms, n_documents)`` and ``count`` is the fitted vectorizer.
    """
    count = CountVectorizer(
        ngram_range=ngram_range, stop_words="english", token_pattern=r"[^\s]+"
    ).fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    # A document without any counted token (empty, or stop words only) has a
    # zero row sum; dividing by it yields NaN, which would then dominate the
    # argsort-based ranking downstream. Its counts are all zero anyway, so
    # dividing by 1 gives the intended all-zero term frequencies.
    w_safe = np.where(w == 0, 1, w)
    tf = np.divide(t.T, w_safe)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count


def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring terms for every topic.

    Args:
        tf_idf: Score array of shape ``(n_terms, n_topics)``.
        count: Fitted vectorizer providing ``get_feature_names_out()``.
        docs_per_topic: Frame whose ``Topic`` column lists the topic labels in
            the same order as the columns of ``tf_idf``.
        n: Number of terms to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(term, score)`` tuples,
        ordered from highest to lowest score.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # Ascending argsort: the last n columns are the best terms; flip them so
    # the strongest term comes first.
    ranked = np.argsort(scores_by_topic, axis=-1)[:, -n:][:, ::-1]

    top_n_words = {}
    for row, topic in enumerate(list(docs_per_topic.Topic)):
        top_n_words[topic] = [
            (vocabulary[col], scores_by_topic[row][col]) for col in ranked[row]
        ]
    return top_n_words


def extract_topic_sizes(df):
    """Count the documents of each topic.

    Args:
        df: Frame with a ``Topic`` column and a ``Doc`` column.

    Returns:
        Frame with columns ``Topic`` and ``Size`` (number of non-null ``Doc``
        values in the topic), sorted by ``Size`` in descending order.
    """
    docs_per_topic = df.groupby(["Topic"])["Doc"].count()
    sizes = docs_per_topic.reset_index().rename(columns={"Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)


def create_representative_table(clusters, top_n_words, n_words=8):
    """Summarise every cluster by its highest-ranked topic words.

    Args:
        clusters: Sequence of clusters; only its length is used. Cluster
            numbers are 1-based positions in this sequence.
        top_n_words: Mapping from cluster number to a list of
            ``(word, score)`` tuples, best word first.
        n_words: How many leading words to include per cluster.

    Returns:
        Frame with one row per cluster, a unique 0-based index, and the
        columns ``cluster`` (1-based cluster number) and ``text`` (the words
        joined by ", ").
    """
    cluster_numbers = list(range(1, len(clusters) + 1))
    # Build all rows up front and create the frame once: concatenating inside
    # a loop is quadratic and leaves every row with the same index value.
    texts = [
        ", ".join([entry[0] for entry in top_n_words[number]][0:n_words])
        for number in cluster_numbers
    ]
    return pd.DataFrame(
        {"cluster": cluster_numbers, "text": texts}, columns=["cluster", "text"]
    )


# determine representative class tokens using TF-IDF
# Give every corpus position a 1-based topic label taken from the order of
# `clusters`; positions that belong to no cluster keep NaN.
lst = [np.nan] * len(corpus_embeddings)
for topic_no, members in enumerate(clusters, start=1):
    for doc_idx in members:
        lst[doc_idx] = topic_no

filter_clust = corpus_embeddings

# One row per sentence with its topic label and positional ID.
docs_df = pd.DataFrame(
    {
        "Doc": corpus_sentences,
        "Topic": lst,
        "Doc_ID": range(len(corpus_sentences)),
    }
)
# Concatenate all sentences of a topic into one long document per topic.
docs_per_topic = docs_df.groupby(["Topic"], as_index=False).agg({"Doc": " ".join})

# Score terms per topic, keep the 20 best per topic, and count topic sizes.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(corpus_sentences))
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df)
# Cell output: total number of sentences that were assigned to a topic.
topic_sizes["Size"].sum()

In [None]:
# Show full cell contents and keep wide frames on one line. The options are
# addressed by their fully qualified names: abbreviated names are resolved by
# pattern matching and can become ambiguous when pandas adds new options.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.expand_frame_repr", False)
# Cell output: topic-word summary of the first 30 clusters.
create_representative_table(clusters, top_n_words)[0:30]

In [None]:
# create UMAP plot
plt.rc("axes", unicode_minus=False)
fig, ax1 = plt.subplots()

# Move the embeddings to host memory once before handing them to UMAP.
filter_clust = corpus_embeddings.cpu()

# UMAP is stochastic; a fixed seed makes the layout reproducible across runs.
UMAP_RANDOM_STATE = 42

# Project the embeddings to 2-D for plotting.
umap_data = umap.UMAP(
    n_neighbors=20,
    n_components=2,
    min_dist=0.4,
    metric="cosine",
    random_state=UMAP_RANDOM_STATE,
).fit_transform(filter_clust)
result = pd.DataFrame(umap_data, columns=["x", "y"])

result["labels"] = lst
res = result

# Sentences outside every cluster are labelled NaN when `lst` is built, so a
# comparison against -1 never matches them: the outlier layer stayed empty and
# the NaN rows ended up in the coloured scatter. Test for NaN explicitly
# (a -1 sentinel is still honoured in case labels are produced that way).
is_outlier = res["labels"].isna() | (res["labels"] == -1)
outliers = res.loc[is_outlier, :]
clustered = res.loc[~is_outlier, :]
ax1.scatter(outliers.x, outliers.y, color="#BDBDBD", s=0.05)
ax1.scatter(
    clustered.x, clustered.y, c=clustered.labels, s=50, cmap="tab20", marker="+"
)
ax1.axis("off")

# Write each cluster number at the median position of its members.
for i in range(1, len(clusters) + 1):
    members = res.loc[res["labels"] == i, ["x", "y"]]
    x = np.median(members["x"])
    y = np.median(members["y"])
    ax1.annotate(i, (x, y), fontsize=14)
plt.show()