In [1]:
import pandas as pd
import glob
import os
import json
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn import metrics
from tqdm import tqdm
from scipy.spatial.distance import cdist
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def cluster_data(df):
    """Run the complete clustering pipeline on one DataFrame of articles.

    Steps, in order: extract the article texts, build the TF-IDF matrix,
    fit the candidate KMeans models, pick the cluster count at the elbow of
    the distortion curve, attach cluster ids and top words to the frame,
    derive the month/year column and finally write the per-cluster files.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles to cluster. The helpers called here read the columns
        ``filtered_maintext`` and ``date_publish``.

    Returns
    -------
    None
        The result is written to disk by ``create_cluster_files``.
    """
    articles = count_articles(df)
    tfidf_matrix, vocabulary = convert_to_tfidf(articles)
    # Only the fitted models, the K range and the distortions are used below;
    # the remaining return values of calculate_kMeans are discarded.
    models, _, _, _, k_values, distortion_values = calculate_kMeans(tfidf_matrix, vocabulary)
    elbow_k = calculate_knee(k_values, distortion_values)
    top_words_per_cluster, df = gather_top_words(models, elbow_k, vocabulary, df)
    df = assign_top_words(top_words_per_cluster, df)
    df = date_month_publish(df)
    create_cluster_files(df)

# Load JSON Files into a pandas DataFrame

In [3]:
def count_articles(df):
    """Return the article texts that will be vectorised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``filtered_maintext`` column.

    Returns
    -------
    pandas.Series
        The ``filtered_maintext`` column of ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``filtered_maintext`` column.
    """
    # Despite the function name, nothing is counted or printed here; the
    # column is simply handed on to the TF-IDF step.
    return df['filtered_maintext']

In [4]:
def convert_to_tfidf(processed_articles):
    """Vectorise the article texts into a TF-IDF matrix.

    The vectoriser lower-cases the text, removes English stop words and keeps
    only terms that occur in at least 5 % and at most 60 % of the documents.

    Parameters
    ----------
    processed_articles : iterable of str
        One text per article.

    Returns
    -------
    tfidf : sparse matrix
        Result of ``TfidfVectorizer.fit_transform`` (one row per article).
    words : list of str
        Vocabulary term for each column of ``tfidf``.
    """
    tfidfconverter = TfidfVectorizer(lowercase=True, stop_words='english', min_df=0.05, max_df=0.6)
    tfidf = tfidfconverter.fit_transform(processed_articles)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old installations.
    if hasattr(tfidfconverter, 'get_feature_names_out'):
        # .tolist() keeps the historical return type (a plain list of str).
        words = tfidfconverter.get_feature_names_out().tolist()
    else:
        words = tfidfconverter.get_feature_names()
    return tfidf, words

In [5]:
from sklearn.metrics.pairwise import cosine_distances

#cosine_similarity = cosine_distances(tfidf)
#cosine_similarity

In [6]:
from sklearn.cluster import AffinityPropagation

In [7]:
'''
clustering = AffinityPropagation(convergence_iter=5, affinity='precomputed', random_state=10)
clustering.fit(cosine_similarity)
labels = clustering.labels_
print(labels.dtype)
#cluster_centers = clustering.cluster_centers_
cluster_centers_indices = clustering.cluster_centers_indices_
n_clusters_ = len(cluster_centers_indices)
print(n_clusters_)'''

"\nclustering = AffinityPropagation(convergence_iter=5, affinity='precomputed', random_state=10)\nclustering.fit(cosine_similarity)\nlabels = clustering.labels_\nprint(labels.dtype)\n#cluster_centers = clustering.cluster_centers_\ncluster_centers_indices = clustering.cluster_centers_indices_\nn_clusters_ = len(cluster_centers_indices)\nprint(n_clusters_)"

In [8]:
'''
import matplotlib.pyplot as plt
from itertools import cycle

plt.close('all')
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = cosine_similarity[cluster_centers_indices[k]]
    plt.plot(cosine_similarity[class_members, 0], cosine_similarity[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in cosine_similarity[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()'''

"\nimport matplotlib.pyplot as plt\nfrom itertools import cycle\n\nplt.close('all')\nplt.figure(1)\nplt.clf()\n\ncolors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')\nfor k, col in zip(range(n_clusters_), colors):\n    class_members = labels == k\n    cluster_center = cosine_similarity[cluster_centers_indices[k]]\n    plt.plot(cosine_similarity[class_members, 0], cosine_similarity[class_members, 1], col + '.')\n    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,\n             markeredgecolor='k', markersize=14)\n    for x in cosine_similarity[class_members]:\n        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)\n\nplt.title('Estimated number of clusters: %d' % n_clusters_)\nplt.show()"

In [9]:
'''
df['processed_articles'] = processed_articles
df['affpropID'] = clustering.labels_'''


"\ndf['processed_articles'] = processed_articles\ndf['affpropID'] = clustering.labels_"

In [11]:
#df.head()

In [12]:
def calculate_kMeans(tfidf, words, max_clusters=10, random_state=None):
    """Fit one KMeans model per candidate cluster count and record elbow metrics.

    Parameters
    ----------
    tfidf : sparse matrix
        TF-IDF matrix, one row per article (must support ``.toarray()``).
    words : sequence of str
        Vocabulary of ``tfidf``. Not used by the computation; kept so existing
        callers keep working.
    max_clusters : int, optional
        Exclusive upper bound of the cluster counts to try; the candidates are
        ``2 .. max_clusters - 1``. Defaults to 10.
    random_state : int or None, optional
        Forwarded to ``KMeans``; pass an integer for reproducible clusterings.
        Defaults to None.

    Returns
    -------
    all_kmeans_models : list
        Fitted models; the model at position ``j`` was fitted with ``j + 2`` clusters.
    common_words : numpy.ndarray or None
        Column indices of the (up to) ten highest-weighted terms per centroid
        of the *last* fitted model; None if no model was fitted.
    mapping1 : dict
        Cluster count -> distortion (mean distance to the nearest centroid).
    mapping2 : dict
        Cluster count -> inertia.
    K : range
        The cluster counts that were tried.
    distortions : list of float
        Distortion for each entry of ``K``, in the same order.
    """
    distortions = []
    mapping1 = {}
    mapping2 = {}
    all_kmeans_models = []
    common_words = None

    # Dense copy needed by cdist; a plain ndarray instead of the deprecated np.matrix.
    X = np.asarray(tfidf.toarray())
    n_samples = X.shape[0]

    # KMeans cannot fit more clusters than there are samples, so cap the range
    # for small inputs instead of failing part-way through the loop.
    K = range(2, min(max_clusters, n_samples + 1))
    for i in tqdm(K):
        kMeans = KMeans(n_clusters=i, max_iter=400, random_state=random_state).fit(tfidf)
        cluster_centers = kMeans.cluster_centers_

        # Distortion: average Euclidean distance of each sample to its closest
        # centroid. Computed once and reused for the list and the mapping.
        distortion = np.min(cdist(X, cluster_centers, 'euclidean'), axis=1).sum() / n_samples
        distortions.append(distortion)
        mapping1[i] = distortion
        mapping2[i] = kMeans.inertia_

        all_kmeans_models.append(kMeans)

        # Indices of the ten largest centroid weights per cluster, descending.
        common_words = cluster_centers.argsort()[:, -1:-11:-1]
    return all_kmeans_models, common_words, mapping1, mapping2, K, distortions

In [13]:
def calculate_knee(K, distortions):
    """Find the elbow of the distortion curve and plot it.

    Parameters
    ----------
    K : sequence of int
        Cluster counts that were evaluated (x-axis).
    distortions : sequence of float
        Distortion measured for each entry of ``K`` (y-axis).

    Returns
    -------
    The ``knee`` attribute of the ``KneeLocator``, i.e. the x-value chosen as
    the elbow. NOTE(review): presumably None when no knee is detected —
    confirm against the kneed documentation.
    """
    # The curve is treated as convex and decreasing, with sensitivity S=1.0.
    locator = KneeLocator(
        K,
        distortions,
        S=1.0,
        curve="convex",
        direction="decreasing",
    )
    # Side effect: draws the curve with the detected knee marked.
    locator.plot_knee()
    return locator.knee

In [14]:
#plt.plot(K, inertias, 'bx-')
#plt.xlabel('Values of K')
#plt.ylabel('Inertia')
#plt.title('The Elbow Method using Inertia')
#plt.show()

In [15]:
#plt.plot(K, distortions, 'bx-')
#plt.xlabel('Values of K')
#plt.ylabel('Distortion')
#plt.title('The Elbow Method using Distortion')
#plt.show()

In [16]:
def gather_top_words(all_kmeans_models, knee, words, df, top_n=10):
    """Label the articles with the chosen model and collect each cluster's top words.

    Parameters
    ----------
    all_kmeans_models : list
        Fitted models, ordered so that position ``j`` holds the model with
        ``j + 2`` clusters (the search starts at two clusters).
    knee : int
        Selected number of clusters.
    words : sequence of str
        Vocabulary; ``words[i]`` is the term of feature column ``i``.
    df : pandas.DataFrame
        Article frame. It is modified in place: a ``kMeans_ID`` column with
        the cluster label of every row is added.
    top_n : int, optional
        Number of highest-weighted terms to keep per cluster. Defaults to 10.

    Returns
    -------
    cluster_words_list : list of list of str
        For every cluster, its terms ordered by descending centroid weight.
    df : pandas.DataFrame
        The same frame that was passed in, now carrying ``kMeans_ID``.

    Raises
    ------
    ValueError
        If ``knee`` is None (no elbow was found).
    IndexError
        If ``knee`` does not correspond to one of the fitted models.
    """
    if knee is None:
        raise ValueError("No knee was detected, so no cluster count can be selected.")

    # Minus 2 because the first fitted model uses two clusters.
    model_index = knee - 2
    # Reject negative positions explicitly: Python would otherwise wrap around
    # and silently pick a model with a different cluster count.
    if not 0 <= model_index < len(all_kmeans_models):
        raise IndexError(f"No fitted model available for knee={knee}.")
    model = all_kmeans_models[model_index]

    # Save the cluster ids to the dataframe.
    df["kMeans_ID"] = model.labels_

    # Feature indices per centroid, highest weight first, truncated to top_n.
    ranked_features = model.cluster_centers_.argsort()[:, ::-1][:, :top_n]
    cluster_words_list = [
        [words[feature] for feature in centroid]
        for centroid in ranked_features
    ]
    return cluster_words_list, df

In [17]:
def assign_top_words(cluster_words_list, df):
    """Attach the top words of its cluster to every article.

    Parameters
    ----------
    cluster_words_list : list of list of str
        ``cluster_words_list[c]`` holds the top words of cluster ``c``.
    df : pandas.DataFrame
        Article frame with an integer ``kMeans_ID`` column. Modified in place:
        a ``kMeans_words`` column is added.

    Returns
    -------
    pandas.DataFrame
        The same frame, now carrying ``kMeans_words``.
    """
    # Read the id column directly instead of using iterrows(): iterrows()
    # builds one Series per row and may upcast integer ids to float, which
    # cannot be used as a list index (and it is much slower).
    df['kMeans_words'] = [
        cluster_words_list[int(cluster)]
        for cluster in df['kMeans_ID'].tolist()
    ]
    return df


# Subcluster by release date

We determine each article's release date and sort the articles into individual JSON files.

In [18]:
def getMonthYear(s):
    """Return the first two '-'-separated fields of ``s`` joined by '-'.

    For a string such as ``'2020-03-15 00:00:00'`` this yields ``'2020-03'``.

    Raises
    ------
    IndexError
        If ``s`` contains no '-' at all.
    """
    fields = s.split('-')
    return fields[0] + "-" + fields[1]

# Generating folder structure

The following code creates the desired folder hierarchy and names each cluster folder after that cluster's top 3 dominant words. Within each cluster folder, all articles are sub-clustered by their release date.
Each output JSON file is named *year-month.json*.

In [19]:
def date_month_publish(df):
    """Parse the publication dates and derive a ``YYYY-MM`` bucket per article.

    Parameters
    ----------
    df : pandas.DataFrame
        Article frame with a ``date_publish`` column. Modified in place:
        ``date_publish`` is converted with ``pd.to_datetime`` and a
        ``month_year`` column is added.

    Returns
    -------
    pandas.DataFrame
        The same frame. ``month_year`` holds strings such as ``'2020-03'``;
        rows without a publication date get ``'unknown'``.
    """
    df['date_publish'] = pd.to_datetime(df['date_publish'])
    # Format each timestamp directly. Missing dates (NaT) have no year/month,
    # so they are collected under an explicit label instead of raising.
    df['month_year'] = df['date_publish'].apply(
        lambda ts: ts.strftime('%Y-%m') if pd.notna(ts) else 'unknown'
    )
    return df

In [20]:
def create_cluster_files(df, output_dir='./event_clustered_json'):
    """Write one folder per cluster and, inside it, one JSON file per month.

    Folders are named ``cluster_<id>-<w1>_<w2>_<w3>`` after the cluster id and
    the cluster's first three top words; files are named ``<month_year>.json``
    and contain ``{"data": [<records>]}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Article frame with the columns ``kMeans_ID``, ``kMeans_words`` and
        ``month_year``.
    output_dir : str, optional
        Root directory for the cluster folders. Defaults to
        ``'./event_clustered_json'``.

    Returns
    -------
    None
    """
    for cluster_id, data in df.groupby(df.kMeans_ID):
        # Every row of a cluster carries the same word list; take the first.
        top_words = data.kMeans_words.tolist()[0]
        # Slicing (instead of indexing [0], [1], [2]) tolerates clusters that
        # have fewer than three words.
        folder_name = f'cluster_{cluster_id}-' + '_'.join(str(word) for word in top_words[:3])
        cluster_dir = os.path.join(output_dir, folder_name)
        # exist_ok=True so that re-running does not fail on existing folders;
        # files with the same name are overwritten.
        os.makedirs(cluster_dir, exist_ok=True)
        for date, date_data in data.groupby(data.month_year):
            # Round-trip through to_json so dates are serialised as ISO strings.
            json_data = date_data.to_json(orient='records', force_ascii=False, date_format='iso', date_unit='s')
            parsed = json.loads(json_data)
            with open(os.path.join(cluster_dir, f'{date}.json'), 'w', encoding='utf-8') as f:
                f.write(json.dumps({"data": parsed}, indent=4, ensure_ascii=False))

In [21]:
# Directory holding the input JSON files, one per previously built cluster.
path = 'lda_clustered_json/'

# sorted() makes the processing order deterministic (glob order is not).
for filename in tqdm(sorted(glob.glob(os.path.join(path, '*.json')))):
    with open(filename, encoding='utf-8', mode='r') as currentFile:
        df = pd.read_json(currentFile, orient='index')
    # The input file is closed before the long-running clustering starts.
    cluster_data(df)

100%|██████████| 8/8 [01:40<00:00, 12.51s/it]
100%|██████████| 8/8 [02:02<00:00, 15.36s/it]
 18%|█▊        | 2/11 [03:55<17:56, 119.57s/it]