In [32]:
import numpy as np
import nltk
from sentence_transformers import SentenceTransformer
import pandas as pd

In [33]:
# Load the scraped news articles from a JSON file given as a relative path
# (resolved against the kernel's working directory).
df_raw = pd.read_json('news_articles_with_text_3_sources.json')

In [34]:
# Preview the first rows of the raw frame to check the loaded columns.
df_raw.head()

Unnamed: 0,source,author,title,url,publishedAt,article_text
0,"{'id': 'abc-news', 'name': 'ABC News'}","MARIAM FAM, DEEPTI HAJELA and LUIS ANDRES HENA...","Two decades after 9/11, Muslim Americans still...",https://abcnews.go.com/Lifestyle/wireStory/dec...,2021-09-07T07:27:55Z,Muslim Americans who grew up under the shadow ...
1,"{'id': 'abc-news', 'name': 'ABC News'}",TERESA M. WALKER AP Pro Football Writer,"Titans' outbreak nears end, other NFL teams de...",https://abcnews.go.com/Sports/wireStory/titans...,2021-09-07T03:31:36Z,The Tennessee Titans COVID-19 outbreak is near...
2,"{'id': 'abc-news', 'name': 'ABC News'}",Dr. Priscilla Hanudel,COVID-19 infection after vaccination and what ...,https://abcnews.go.com/Health/covid-19-infecti...,2021-09-07T14:28:39Z,Vaccines work to dramatically reduce the risk ...
3,"{'id': 'abc-news', 'name': 'ABC News'}",Alisa Wiersema,Texas governor signs GOP-backed 'election inte...,https://abcnews.go.com/Politics/texas-governor...,2021-09-07T17:34:12Z,Three months and two special sessions after Te...
4,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Spanish hospital baby switch discovered two de...,https://abcnews.go.com/Lifestyle/wireStory/spa...,2021-09-07T17:07:13Z,Health authorities in Spain are blaming human ...


In [35]:
# Derive the working frame from the raw one (df_raw itself is left untouched),
# replacing missing article bodies with empty strings so they can be joined later.
df = df_raw.assign(article_text=df_raw["article_text"].fillna(""))

In [36]:
# Collect every article body, then build one space-separated corpus string
# from the first 50 articles only.
articles_list = df["article_text"].tolist()
articles_corpus = " ".join(articles_list[:50])

In [38]:
# Split the joined corpus into a list of sentences with NLTK's sentence tokenizer.
# NOTE(review): sent_tokenize presumably needs the NLTK tokenizer data to be
# installed already; no download step is visible in this notebook — confirm.
a_list = nltk.tokenize.sent_tokenize(articles_corpus)

In [39]:
# Load the pretrained sentence-embedding model identified by this checkpoint name.
# NOTE(review): this checkpoint is reportedly marked as deprecated upstream —
# confirm before relying on embedding quality.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [40]:
# Embed every sentence of the corpus.
# Assumes encode() returns one embedding row per input sentence, in input order;
# later cells index a_list with row positions of this array — TODO confirm.
sentence_embeddings = sbert_model.encode(a_list)

In [41]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [42]:
def mbkmeans_clusters(X, k, mb, print_silhouette_values, random_state=None):
    """Fit MiniBatchKMeans on ``X`` and print clustering diagnostics.

    Args:
        X: Feature matrix handed to ``MiniBatchKMeans.fit`` and to the
            silhouette functions.
        k: Number of clusters to fit.
        mb: Mini-batch size used by the estimator.
        print_silhouette_values: When truthy, additionally print one line per
            cluster (largest cluster first) with its size and mean silhouette.
        random_state: Optional seed forwarded to ``MiniBatchKMeans`` so that
            cluster ids and sizes are reproducible across runs. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        A ``(fitted_estimator, labels)`` tuple, where ``labels`` is the
        estimator's ``labels_`` attribute.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_

    if print_silhouette_values:
        # The overall silhouette coefficient is the mean of the per-sample
        # values, so compute the per-sample values once and reuse them instead
        # of running the pairwise-distance computation twice.
        sample_silhouette_values = silhouette_samples(X, labels)
        overall_silhouette = sample_silhouette_values.mean()
    else:
        sample_silhouette_values = None
        overall_silhouette = silhouette_score(X, labels)

    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {overall_silhouette:0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        cluster_summaries = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            size = cluster_silhouette_values.shape[0]
            # A cluster id may end up with no members; avoid averaging an
            # empty slice (which would emit a RuntimeWarning).
            average = cluster_silhouette_values.mean() if size else float("nan")
            cluster_summaries.append((i, size, average))
        # Largest clusters first; the sort is stable, so ties keep id order.
        cluster_summaries.sort(key=lambda tup: tup[1], reverse=True)
        for cluster_id, size, average in cluster_summaries:
            print(f"    Cluster {cluster_id}: Size:{size} | Avg:{average:.2f}")
    return km, labels

In [56]:
# Cluster the sentence embeddings into 50 groups (mini-batches of 500) and
# print the per-cluster report.
clustering, cluster_labels = mbkmeans_clusters(
    X=sentence_embeddings, k=50, mb=500, print_silhouette_values=True
)

# Pair every sentence with the cluster id it was assigned.
df_clusters = pd.DataFrame(dict(text=a_list, cluster=cluster_labels))

For n_clusters = 50
Silhouette coefficient: 0.03
Inertia:129241.0859375
Silhouette values:
    Cluster 14: Size:58
    Cluster 3: Size:49
    Cluster 16: Size:46
    Cluster 31: Size:42
    Cluster 15: Size:41
    Cluster 19: Size:41
    Cluster 5: Size:40
    Cluster 45: Size:39
    Cluster 48: Size:38
    Cluster 41: Size:37
    Cluster 38: Size:36
    Cluster 29: Size:35
    Cluster 42: Size:35
    Cluster 7: Size:34
    Cluster 11: Size:32
    Cluster 17: Size:31
    Cluster 27: Size:31
    Cluster 22: Size:29
    Cluster 33: Size:29
    Cluster 9: Size:28
    Cluster 10: Size:26
    Cluster 28: Size:26
    Cluster 36: Size:26
    Cluster 25: Size:25
    Cluster 35: Size:25
    Cluster 32: Size:24
    Cluster 44: Size:24
    Cluster 23: Size:23
    Cluster 26: Size:20
    Cluster 46: Size:20
    Cluster 0: Size:18
    Cluster 4: Size:18
    Cluster 6: Size:17
    Cluster 18: Size:17
    Cluster 30: Size:17
    Cluster 24: Size:16
    Cluster 37: Size:15
    Cluster 12: Size:14
    

In [66]:
# Rank ALL sentences (not only those labelled with this cluster) by Euclidean
# distance to the chosen cluster centre, then print the ten closest ones.
test_cluster = 2
cluster_center = clustering.cluster_centers_[test_cluster]
distances_to_center = np.linalg.norm(sentence_embeddings - cluster_center, axis=1)
most_representative_docs = np.argsort(distances_to_center)
for d in most_representative_docs[:10]:
    print(a_list[d], "-------------", sep="\n")

When they leave, may they take the memory of Tenaya Stone home with them until their next visit.
-------------
"You judge your deployments by -- you're missing what holidays.
-------------
"Our kids are going to be (here) well after the 9/11 era," she says.
-------------
What do you want guests to come away with after visiting Tenaya Stone Spa?
-------------
Even as some of the benefits that were provided are set to expire next week, states have the option to extend those benefits and the federal resources from the Rescue Plan to do so.
-------------
Earlier this year, Peyton Elizabeth Lee took some time off from shooting her Disney+ Original Series, "Doogie Kamealoha, M.D.
-------------
After this pretrial phase Sept. 7-17, another pretrial continuation is set for Nov. 1-19.
-------------
Northam, who watched the work, called it "hopefully a new day, a new era in Virginia."
-------------
Continuing resolutions typically are approved by Congress to keep the government operating when th

In [67]:
def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors.

    The result is ``dot(u, v) / (||u|| * ||v||)``.

    Args:
        u: First vector (array-like).
        v: Second vector (array-like) of the same length as ``u``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm.
        In that case the ratio is undefined, and the plain formula would
        divide by zero and yield ``nan`` with a RuntimeWarning.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product

In [58]:
# Two example sentences on a related topic, used to sanity-check the embeddings.
sent1 = (
    "Biden has already made significant moves in requiring vaccines "
    "among public sector workers"
)
sent2 = (
    "Fully vaccinated people who test positive may also be eligible for "
    "authorized COVID-19 treatments, if their doctor says it's necessary"
)

In [59]:
# Embed each example sentence on its own, then compare the two vectors.
sent1_embedding = sbert_model.encode([sent1])[0]
sent2_embedding = sbert_model.encode([sent2])[0]
sim = cosine(sent1_embedding, sent2_embedding)

In [60]:
# Show the cosine similarity of the two example sentences as the cell output.
sim

0.558436