In [6]:
import nltk
import os
import re
import ast
import hdbscan
import umap
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn_extra.cluster import KMedoids
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from transformers import BertModel, BertTokenizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from tqdm import tqdm

In [19]:
# Load the preprocessed news articles from disk
df = pd.read_csv('../Data/News Article Preprocessed.csv')

# File name of every article, in row order, built from the 'id' column
txt_order = [f'{article_id}.txt' for article_id in df['id'].tolist()]

# Raw cell values of the 'preprocessed_articles' column
texts = df['preprocessed_articles'].tolist()

# Each raw value is parsed back into a Python object with ast.literal_eval,
# which only accepts literals and therefore never executes arbitrary code.
preprocessed_texts = list(map(ast.literal_eval, texts))
len(preprocessed_texts)

681

In [25]:
# Join each token list into a single whitespace-separated string.
# The isinstance guard keeps this cell idempotent: without it, running the cell
# a second time would call " ".join on an already-joined string and insert a
# space between every character, silently corrupting the TF-IDF input.
preprocessed_texts = [
    text if isinstance(text, str) else " ".join(text)
    for text in preprocessed_texts
]

# Convert texts to TF-IDF features (one row per document)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_texts)

# Perform K-Medoids clustering with a fixed random_state
kmedoids = KMedoids(n_clusters=5, random_state=0).fit(X)

# A medoid is an actual document of the corpus (not an averaged centroid),
# so the representative text can be looked up directly by its row index.
medoid_indices = kmedoids.medoid_indices_
medoids = [preprocessed_texts[index] for index in medoid_indices]

# Get the cluster label assigned to each document
labels = kmedoids.labels_

# Print the medoid document of each cluster (clusters are numbered from 1)
for i, medoid in enumerate(medoids):
    print(f"Cluster {i + 1} medoid: {medoid}")

Cluster 1 medoid: asian_americans fear resolve trethan asian_americans fear resolve journal falcon lib csub asian_american fear resolve check onesearch availability calstate csub rft_val_fmt info ofi fmt kev journal genre article sid proq proq asian_american express resolve journal issn date volume issue spage trethan phaedra journal btitle info info none allison wang cherry hill resident come country generation immigrant believe ideal make culture diversity accounting director healthcare company mother attract talented people asian_american wonder country sign attack asian_american bring misplace blame covid pandemic create sense fear diverse community make culture nationality language history see monolith murder people woman atlanta fuel asian_american jersey sense resolve time act end hate zoom meeting arrange alliance people talk experience discrimination racism fear family express hope way killing george floyd spark reckoning treatment shooting bring recognition support action sto

In [26]:
# Report which source file each cluster's medoid corresponds to
for cluster_number, medoid_row in enumerate(medoid_indices, start=1):
    medoid_file = txt_order[medoid_row]
    print(f"Cluster {cluster_number} medoid: {medoid_file}")

Cluster 1 medoid: 344.txt
Cluster 2 medoid: 625.txt
Cluster 3 medoid: 619.txt
Cluster 4 medoid: 628.txt
Cluster 5 medoid: 649.txt


In [27]:
# Group the documents, and their row indices, by assigned cluster label.
#   tfidf_kmedoid_cluster:     label -> list of document texts
#   tfidf_kmedoid_cluster_idx: label -> list of row indices into preprocessed_texts
tfidf_kmedoid_cluster = {}
tfidf_kmedoid_cluster_idx = {}
labels = kmedoids.labels_

for index, label in enumerate(labels):
    # setdefault creates the empty list the first time a label is seen
    tfidf_kmedoid_cluster.setdefault(label, []).append(preprocessed_texts[index])
    tfidf_kmedoid_cluster_idx.setdefault(label, []).append(index)

# Print the dictionary (clusters appear in order of first occurrence in `labels`).
# The loop variable is deliberately not named `texts`: at notebook level that
# would overwrite the `texts` list created when the CSV was loaded.
for cluster_id, cluster_texts in tfidf_kmedoid_cluster.items():
    print(f"Cluster {cluster_id + 1} has {len(cluster_texts)} items: {cluster_texts}")

Cluster 3 has 144 items: ['asian_american attack hit struggle rise racism woman see rampage confirmation fear pan asian_american attack hit struggle rise racism woman see rampage confirmation fear falcon lib csub asian_americans attack hit check onesearch availability calstate csub rft_val_fmt info ofi fmt kev journal genre article sid proq proq atitle asian_american attack hit struggle rise racism woman see rampage confirmation fear issn date volume issue spage pan deanna jtitle btitle info info none full_text read headline danielle knew bone horror come pass series shooting spa atlanta suburb leave people include woman woman descent attack business appear base see lot community woman suspect custody year man name aaron claim motivate racism asian_american rampage felt culmination year racism soar wake coronavirus asian_american insult spat upon shun beaten kill organization stop aapi hate record hate incident group start track attack third victim group note woman law enforcement offi

In [28]:
""" Get keywords of each clusters """
# Dense TF-IDF matrix as a DataFrame: one row per document, one column per term
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Number of highest-scoring terms to keep per cluster
n_top_keywords = 10

# Select rows with a boolean mask built from `labels` instead of a 'cluster'
# column: the vocabulary may itself contain the term "cluster", and adding the
# label column first would overwrite (and then drop) that feature.
label_array = np.asarray(labels)

# Calculate the average TF-IDF score for each word within each cluster.
# The cluster count comes from the fitted model rather than a hard-coded 5.
cluster_keywords = {}
for cluster in range(kmedoids.n_clusters):
    cluster_data = tfidf_df.loc[label_array == cluster]
    cluster_mean_tfidf = cluster_data.mean(axis=0)
    top_keywords = (
        cluster_mean_tfidf.sort_values(ascending=False).head(n_top_keywords).index.tolist()
    )
    cluster_keywords[cluster] = top_keywords

# Attach the labels only after the means are computed; the column is kept so
# that any later cell reading tfidf_df['cluster'] keeps working.
tfidf_df['cluster'] = labels

# Print the top keywords for each cluster (clusters are numbered from 1)
for cluster, keywords in cluster_keywords.items():
    print(f"Cluster {cluster + 1} top keywords: {', '.join(keywords)}")


Cluster 1 top keywords: people, student, china, csub, school, asian_american, time, racism, virus, coronavirus
Cluster 2 top keywords: crime, hate, asian_american, racism, people, report, incident, csub, community, police
Cluster 3 top keywords: crime, attack, hate, police, year, man, asian_american, info, woman, charge
Cluster 4 top keywords: violence, shooting, hate, spa, crime, woman, atlanta, people, community, asian_american
Cluster 5 top keywords: hate, report, asian_american, community, incident, csub, people, info, aapi, racism


In [24]:
""" Get order of distance from nearest to farest in each cluster """
# Rank the members of every cluster from nearest to farthest from its medoid.
#   ordered_clusters:     cluster -> [(distance, document text), ...]
#   ordered_clusters_idx: cluster -> [(distance, file name), ...]
ordered_clusters = {i: [] for i in range(kmedoids.n_clusters)}
ordered_clusters_idx = {i: [] for i in range(kmedoids.n_clusters)}

label_array = np.asarray(labels)

for cluster in range(kmedoids.n_clusters):
    medoid_index = medoid_indices[cluster]
    # Densify the medoid row once per cluster instead of once per document
    medoid_vector = X[medoid_index].toarray()

    # Row indices of the documents assigned to this cluster, in ascending order
    member_indices = np.flatnonzero(label_array == cluster)

    # Euclidean distance of every member to the medoid, ranked once.
    # sorted() is stable, so equal distances keep their original row order.
    ranked = sorted(
        (
            (np.linalg.norm(X[index].toarray() - medoid_vector), index)
            for index in member_indices
        ),
        key=lambda pair: pair[0],
    )

    # Derive both views (text and file name) from the single ranking
    distances = [(distance, preprocessed_texts[index]) for distance, index in ranked]
    distances_txt = [(distance, txt_order[index]) for distance, index in ranked]

    # Store the sorted lists in the dictionaries
    ordered_clusters[cluster] = distances
    ordered_clusters_idx[cluster] = distances_txt

# Print the ordered lists for each cluster (clusters are numbered from 1)
for cluster, items in ordered_clusters.items():
    print(f"Cluster {cluster + 1} ordered by distance to medoid:")
    for distance, text in items:
        print(f"Distance: {distance:.4f}, Text: {text}")

Cluster 1 ordered by distance to medoid:
Distance: 0.0000, Text: ['asian_americans', 'fear', 'resolve', 'trethan', 'asian_americans', 'fear', 'resolve', 'journal', 'falcon', 'lib', 'csub', 'asian_american', 'fear', 'resolve', 'check', 'onesearch', 'availability', 'calstate', 'csub', 'rft_val_fmt', 'info', 'ofi', 'fmt', 'kev', 'journal', 'genre', 'article', 'sid', 'proq', 'proq', 'asian_american', 'express', 'resolve', 'journal', 'issn', 'date', 'volume', 'issue', 'spage', 'trethan', 'phaedra', 'journal', 'btitle', 'info', 'info', 'none', 'allison', 'wang', 'cherry', 'hill', 'resident', 'come', 'country', 'generation', 'immigrant', 'believe', 'ideal', 'make', 'culture', 'diversity', 'accounting', 'director', 'healthcare', 'company', 'mother', 'attract', 'talented', 'people', 'asian_american', 'wonder', 'country', 'sign', 'attack', 'asian_american', 'bring', 'misplace', 'blame', 'covid', 'pandemic', 'create', 'sense', 'fear', 'diverse', 'community', 'make', 'culture', 'nationality', 'lan