In [5]:
import numpy as np
import nltk
from sentence_transformers import SentenceTransformer
import pandas as pd

from gensim.models import Word2Vec

In [6]:
df_raw = pd.read_json('news_articles_with_text_3_sources.json')

In [7]:
df_raw.head()

Unnamed: 0,source,author,title,url,publishedAt,article_text
0,"{'id': 'abc-news', 'name': 'ABC News'}","MARIAM FAM, DEEPTI HAJELA and LUIS ANDRES HENA...","Two decades after 9/11, Muslim Americans still...",https://abcnews.go.com/Lifestyle/wireStory/dec...,2021-09-07T07:27:55Z,Muslim Americans who grew up under the shadow ...
1,"{'id': 'abc-news', 'name': 'ABC News'}",TERESA M. WALKER AP Pro Football Writer,"Titans' outbreak nears end, other NFL teams de...",https://abcnews.go.com/Sports/wireStory/titans...,2021-09-07T03:31:36Z,The Tennessee Titans COVID-19 outbreak is near...
2,"{'id': 'abc-news', 'name': 'ABC News'}",Dr. Priscilla Hanudel,COVID-19 infection after vaccination and what ...,https://abcnews.go.com/Health/covid-19-infecti...,2021-09-07T14:28:39Z,Vaccines work to dramatically reduce the risk ...
3,"{'id': 'abc-news', 'name': 'ABC News'}",Alisa Wiersema,Texas governor signs GOP-backed 'election inte...,https://abcnews.go.com/Politics/texas-governor...,2021-09-07T17:34:12Z,Three months and two special sessions after Te...
4,"{'id': 'abc-news', 'name': 'ABC News'}",The Associated Press,Spanish hospital baby switch discovered two de...,https://abcnews.go.com/Lifestyle/wireStory/spa...,2021-09-07T17:07:13Z,Health authorities in Spain are blaming human ...


In [8]:
df = df_raw.copy()
df["article_text"] = df["article_text"].fillna("")

In [9]:
df["sentence_tokens"] = df["article_text"].map(lambda x: nltk.tokenize.sent_tokenize(x))
# Remove duplicated after preprocessing
_, idx = np.unique(df["sentence_tokens"], return_index=True)
df = df.iloc[idx, :]

# Remove empty values and keep relevant columns
df = df.loc[df.sentence_tokens.map(lambda x: len(x) > 0), ["article_text", "sentence_tokens"]]

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (1500, 6)
Pre-processed dataframe: (570, 2)


In [10]:
df = df.explode('sentence_tokens').reset_index()

In [11]:
df.shape

(16377, 3)

In [12]:
df

Unnamed: 0,index,article_text,sentence_tokens
0,31,"""Disney Enchantment,"" the brand-new nighttime ...","""Disney Enchantment,"" the brand-new nighttime ..."
1,31,"""Disney Enchantment,"" the brand-new nighttime ...","The show takes you on an emotional journey, em..."
2,31,"""Disney Enchantment,"" the brand-new nighttime ...",Philip Lawrence who wrote the original song fo...
3,31,"""Disney Enchantment,"" the brand-new nighttime ...",Philip got his start as a cast member at Walt ...
4,31,"""Disney Enchantment,"" the brand-new nighttime ...","Through hard work and determination, he went o..."
...,...,...,...
16372,838,You may have recently heard that an all-new or...,You might spot a familiar face floating around...
16373,838,You may have recently heard that an all-new or...,Disney Music Group: Disney Music Group will fe...
16374,838,You may have recently heard that an all-new or...,Visit disneymusic.co/DisneyHalloweenPL for mor...
16375,838,You may have recently heard that an all-new or...,"And make sure to watch ""Muppets Haunted Mansio..."


In [13]:
articles_list = df["sentence_tokens"].values.tolist()
# articles_corpus = " "
# articles_corpus = articles_corpus.join(articles_list[0:50])

In [13]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [14]:
sentence_embeddings = sbert_model.encode(articles_list)

In [14]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [16]:
def mbkmeans_clusters(X, k, mb, print_silhouette_values):
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[1], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]}"
            )
    return km, km.labels_

In [17]:
clustering, cluster_labels = mbkmeans_clusters(
	X=sentence_embeddings,
    k=50,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    "sentence_tokens": articles_list,
    "cluster": cluster_labels
})

For n_clusters = 50
Silhouette coefficient: 0.03
Inertia:1806666.25
Silhouette values:
    Cluster 38: Size:635
    Cluster 14: Size:630
    Cluster 23: Size:574
    Cluster 39: Size:564
    Cluster 1: Size:543
    Cluster 5: Size:538
    Cluster 26: Size:508
    Cluster 18: Size:493
    Cluster 29: Size:454
    Cluster 10: Size:453
    Cluster 41: Size:449
    Cluster 15: Size:423
    Cluster 27: Size:415
    Cluster 3: Size:413
    Cluster 17: Size:388
    Cluster 19: Size:383
    Cluster 8: Size:377
    Cluster 2: Size:360
    Cluster 40: Size:360
    Cluster 25: Size:349
    Cluster 47: Size:348
    Cluster 28: Size:323
    Cluster 24: Size:316
    Cluster 43: Size:315
    Cluster 36: Size:311
    Cluster 11: Size:309
    Cluster 4: Size:295
    Cluster 48: Size:293
    Cluster 31: Size:286
    Cluster 42: Size:281
    Cluster 33: Size:269
    Cluster 45: Size:267
    Cluster 22: Size:265
    Cluster 21: Size:255
    Cluster 0: Size:254
    Cluster 16: Size:254
    Cluster 37: Size

In [18]:
df_clustered = pd.merge(df, df_clusters, on="sentence_tokens")

In [19]:
df_clustered

Unnamed: 0,index,article_text,sentence_tokens,cluster
0,31,"""Disney Enchantment,"" the brand-new nighttime ...","""Disney Enchantment,"" the brand-new nighttime ...",18
1,31,"""Disney Enchantment,"" the brand-new nighttime ...","The show takes you on an emotional journey, em...",2
2,31,"""Disney Enchantment,"" the brand-new nighttime ...",Philip Lawrence who wrote the original song fo...,2
3,31,"""Disney Enchantment,"" the brand-new nighttime ...",Philip got his start as a cast member at Walt ...,15
4,31,"""Disney Enchantment,"" the brand-new nighttime ...","Through hard work and determination, he went o...",2
...,...,...,...,...
17160,838,You may have recently heard that an all-new or...,You might spot a familiar face floating around...,33
17161,838,You may have recently heard that an all-new or...,Disney Music Group: Disney Music Group will fe...,15
17162,838,You may have recently heard that an all-new or...,Visit disneymusic.co/DisneyHalloweenPL for mor...,18
17163,838,You may have recently heard that an all-new or...,"And make sure to watch ""Muppets Haunted Mansio...",15


In [20]:
df_labels = df_clustered.groupby(['index','cluster']).size().reset_index(name='counts')

In [21]:
df_labelled = pd.merge(df_clustered, df_labels, on=["index","cluster"])

In [22]:
idx = df_labelled.groupby(['index'])['counts'].transform(max) == df_labelled['counts']

In [23]:
df = df_labelled[idx]

In [24]:
df['cluster'].unique()

array([18, 39, 37, 10, 28, 48,  8, 47, 14, 29,  9,  7,  4, 38,  1, 32, 36,
       26,  3,  5, 42,  0, 19, 27, 23, 21, 31, 41, 17, 24, 45,  2, 46, 15,
       34, 22, 16, 40, 33, 44, 43, 25, 11, 12, 30, 13, 49, 35,  6],
      dtype=int32)

In [25]:
df

Unnamed: 0,index,article_text,sentence_tokens,cluster,counts
0,31,"""Disney Enchantment,"" the brand-new nighttime ...","""Disney Enchantment,"" the brand-new nighttime ...",18,5
1,31,"""Disney Enchantment,"" the brand-new nighttime ...",And they soar to emotional high points during ...,18,5
2,31,"""Disney Enchantment,"" the brand-new nighttime ...","Philip Lawrence not only wrote ""You Are the Ma...",18,5
3,31,"""Disney Enchantment,"" the brand-new nighttime ...",Check back here on Disney Parks Blog and follo...,18,5
4,31,"""Disney Enchantment,"" the brand-new nighttime ...",Check back here on Disney Parks Blog and follo...,18,5
...,...,...,...,...,...
17152,838,You may have recently heard that an all-new or...,"Also beginning Oct. 8, you can visit an exhibi...",15,9
17153,838,You may have recently heard that an all-new or...,"Also at Disneys Hollywood Studios, starting on...",15,9
17154,838,You may have recently heard that an all-new or...,"Over at Magic Kingdom Park, beginning Oct. 8, ...",15,9
17155,838,You may have recently heard that an all-new or...,Disney Music Group: Disney Music Group will fe...,15,9


In [26]:
#df_cl = df.groupby(['cluster'])['article_text'].apply(', '.join).reset_index()

In [31]:
df_clustered = df.drop_duplicates(subset=['index', 'article_text','cluster'])
df_clustered

Unnamed: 0,index,article_text,sentence_tokens,cluster,counts
0,31,"""Disney Enchantment,"" the brand-new nighttime ...","""Disney Enchantment,"" the brand-new nighttime ...",18,5
12,744,Its hard to believe we are only 10 days away f...,Check back here on Disney Parks Blog and follo...,18,10
28,121,"""I knew if he got me on the floor, I would be ...","The officer got out of his car and told her, ""...",39,9
37,1210,Nikkita Brown was walking her dog in a closed ...,"The officer got out of his car and told her, ""...",39,6
79,707,"""I think it will be safe. I think it will be e...",Pfizer just released its first safety data abo...,37,12
...,...,...,...,...,...
17073,533,"With the help of Mickey Mouse, Minnie Mouse, o...",Im so proud of them as they worked their way t...,2,7
17095,410,Women driving hundreds of miles alone for an a...,"Under the law, between 85% and 95% of all abor...",24,6
17112,408,Wyoming's governor says the state will ask the...,Wyoming's governor says the state will ask the...,38,4
17138,35,You may have heard about whats new on a Disney...,You may have heard about whats new on a Disney...,18,6


In [33]:
df_clustered['cluster'].unique()

array([18, 39, 37, 10, 28, 48,  8, 47, 14, 29,  9,  7,  4, 38,  1, 32, 36,
       26,  3,  5, 42,  0, 19, 27, 23, 21, 31, 41, 17, 24, 45,  2, 46, 15,
       34, 22, 16, 40, 33, 44, 43, 25, 11, 12, 30, 13, 49, 35,  6],
      dtype=int32)

In [34]:
df_final = df_clustered.groupby(["cluster"])['article_text'].apply(list).reset_index(name='article_list')

In [39]:
len(df_final['article_list'][0])

15

In [41]:
# for i, row in df_final.iterrows():
#     print(row['cluster'])
#     print(row['article_list'])

In [46]:
for i, row in df_final.iterrows():
    if i > len(df_final):
       break
    else:
       f = open('cluster_' + str(row['cluster'])+'.txt', 'w')
       for i in range(len(row['article_list'])):
         f.write(row['article_list'][i])
         f.write('\n')
         f.write('\n')
       f.close()
       i+=1

In [67]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [58]:
sent1 = "Biden has already made significant moves in requiring vaccines among public sector workers"
sent2 = "Fully vaccinated people who test positive may also be eligible for authorized COVID-19 treatments, if their doctor says it's necessary"

In [59]:
sim = cosine(sbert_model.encode([sent1])[0], sbert_model.encode([sent2])[0])

In [60]:
sim

0.558436