In [4]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

nltk.download("stopwords")
nltk.download('punkt')

SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saumyaashah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saumyaashah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
df_raw = pd.read_json('news_articles_with_text_3_sources.json')

In [6]:
df_raw.head()

Unnamed: 0,source,author,title,url,publishedAt,article_text
0,"{'id': 'abc-news', 'name': 'ABC News'}","MARIAM FAM, DEEPTI HAJELA and LUIS ANDRES HENA...","Two decades after 9/11, Muslim Americans still...",https://abcnews.go.com/Lifestyle/wireStory/dec...,2021-09-07T07:27:55Z,Muslim Americans who grew up under the shadow ...
1,"{'id': 'abc-news', 'name': 'ABC News'}",TERESA M. WALKER AP Pro Football Writer,"Titans' outbreak nears end, other NFL teams de...",https://abcnews.go.com/Sports/wireStory/titans...,2021-09-07T03:31:36Z,The Tennessee Titans COVID-19 outbreak is near...
2,"{'id': 'abc-news', 'name': 'ABC News'}",Dr. Priscilla Hanudel,COVID-19 infection after vaccination and what ...,https://abcnews.go.com/Health/covid-19-infecti...,2021-09-07T14:28:39Z,Vaccines work to dramatically reduce the risk ...
3,"{'id': 'abc-news', 'name': 'ABC News'}",Alisa Wiersema,Texas governor signs GOP-backed 'election inte...,https://abcnews.go.com/Politics/texas-governor...,2021-09-07T17:34:12Z,Three months and two special sessions after Te...
4,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Spanish hospital baby switch discovered two de...,https://abcnews.go.com/Lifestyle/wireStory/spa...,2021-09-07T17:07:13Z,Health authorities in Spain are blaming human ...


In [7]:
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [8]:
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top"])
text_columns = ["title", "author", "article_text"]

df = df_raw.copy()
df["article_text"] = df["article_text"].fillna("")

for col in text_columns:
    df[col] = df[col].astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove duplicated after preprocessing
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

# Remove empty values and keep relevant columns
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (1500, 6)
Pre-processed dataframe: (1494, 2)


In [9]:
df.head()

Unnamed: 0,text,tokens
1249,"More than 1,000km from Sydney, one of NSW's mo...","[1000km, sydney, one, nsws, precious, ecosyste..."
731,12th inmate dies as New York City's jail crisi...,"[12th, inmate, dies, york, citys, jail, crisis..."
492,16yo critical after 9m fall from Gold Coast ba...,"[16yo, critical, 9m, fall, gold, coast, balcon..."
325,1st trial in college admissions scandal gets u...,"[1st, trial, college, admissions, scandal, get..."
23,2nd Circuit affirms conviction of former top C...,"[2nd, circuit, affirms, conviction, former, cu..."


In [16]:
tokenized_docs = df['tokens'].values.tolist()

In [26]:
docs = df['text'].values.tolist()

In [18]:
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)

In [22]:
model.wv.most_similar("biden")

[('trump', 0.9893220663070679),
 ('joe', 0.9862522482872009),
 ('former', 0.9841845631599426),
 ('donald', 0.9841769337654114),
 ('administration', 0.9791196584701538),
 ('president', 0.9724687933921814),
 ('press', 0.9509657621383667),
 ('jen', 0.9441584944725037),
 ('vice', 0.942875325679779),
 ('senate', 0.9407942295074463)]

In [23]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(1494, 100)

In [24]:
def mbkmeans_clusters(X, k, mb, print_silhouette_values):
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [27]:
clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

For n_clusters = 50
Silhouette coefficient: 0.20
Inertia:50.06377599928166
Silhouette values:
    Cluster 23: Size:7 | Avg:0.69 | Min:0.55 | Max: 0.76
    Cluster 19: Size:13 | Avg:0.45 | Min:0.13 | Max: 0.63
    Cluster 37: Size:19 | Avg:0.43 | Min:0.03 | Max: 0.63
    Cluster 28: Size:17 | Avg:0.39 | Min:0.11 | Max: 0.57
    Cluster 45: Size:2 | Avg:0.38 | Min:0.30 | Max: 0.46
    Cluster 22: Size:25 | Avg:0.37 | Min:0.10 | Max: 0.56
    Cluster 4: Size:17 | Avg:0.34 | Min:0.02 | Max: 0.58
    Cluster 1: Size:28 | Avg:0.32 | Min:0.05 | Max: 0.55
    Cluster 33: Size:19 | Avg:0.32 | Min:-0.03 | Max: 0.50
    Cluster 9: Size:53 | Avg:0.28 | Min:0.04 | Max: 0.55
    Cluster 29: Size:42 | Avg:0.28 | Min:0.05 | Max: 0.52
    Cluster 5: Size:33 | Avg:0.27 | Min:-0.06 | Max: 0.53
    Cluster 44: Size:7 | Avg:0.26 | Min:0.12 | Max: 0.43
    Cluster 36: Size:6 | Avg:0.26 | Min:0.08 | Max: 0.45
    Cluster 13: Size:24 | Avg:0.26 | Min:0.02 | Max: 0.47
    Cluster 17: Size:31 | Avg:0.25 | Min:-

In [29]:
print("Most representative terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=10)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: result conditions showed remains sent appeared denied june research due 
Cluster 1: moments fun character merchandise music look incredible themed grand musical 
Cluster 2: russian rep warned hes intelligence called congressional hold governor handling 
Cluster 3: within company airport october large began gave full flight entry 
Cluster 4: favorite resorts spa experiences attractions tuned studios entertainment imagineers makers 
Cluster 5: known personal mission recently business looking group train apple father 
Cluster 6: students experts education analysis university protection local women investigators hospital 
Cluster 7: shooting suspect student taken identified taylor chicago charges robinson jail 
Cluster 8: attorney spokesperson york justice staff documents judge public superintendent officers 
Cluster 9: violent saw program building oil bears jobs twitter although highly 
Cluster 10: vaccination risk sev