# Document Clustering and Concept Extraction

In [5]:
from pathlib import Path
import sys

# Make the project root (the parent of the working directory) importable so
# that project-local modules can be imported from this notebook.
parent = Path().absolute().parents[0].as_posix()

# Guard the insert so that re-running this cell does not keep prepending
# duplicate entries to sys.path.
if parent not in sys.path:
    sys.path.insert(0, parent)

from tqdm import tqdm
from collections import Counter

import pandas as pd
import numpy as np

import pickle
import gensim
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist



In [6]:
# Load the pre-computed word embeddings. Later cells index each item as
# item[0] (embedding vector) and item[1] (the word itself).
# NOTE: pickle can execute arbitrary code while loading; only load files
# produced by this project.
with Path('../data/processed/words_embedded.pickle').open('rb') as embeddings_file:
    words_emb = pickle.load(embeddings_file)

In [7]:
# Load the cleaned documents (later fed to the TF-IDF vectoriser) and report
# how many there are.
# NOTE: pickle can execute arbitrary code while loading; only load files
# produced by this project.
with Path('../data/processed/docs_cleaned.pickle').open('rb') as docs_file:
    docs_cleaned = pickle.load(docs_file)
print(len(docs_cleaned))

30712


In [8]:
# Load the previously trained Word2Vec model; it is used further down to look
# up the words closest to each concept's mean embedding.
model = gensim.models.Word2Vec.load('../models/trained/word2vec.model')

In [9]:
# Fit a TF-IDF vectoriser on the cleaned documents.
vectorizer = TfidfVectorizer()
# The document-term matrix gets its own name: `X` is used by the clustering
# cells for the word-embedding matrix, which is a different kind of object.
tfidf_matrix = vectorizer.fit_transform(docs_cleaned)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on old installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_array = vectorizer.get_feature_names_out()
else:
    feature_array = vectorizer.get_feature_names()

# Map each vocabulary term to its weight. Despite the variable name these are
# the per-term IDF weights (`vectorizer.idf_`), not per-document TF-IDF scores.
tf_idf_values = dict(zip(feature_array, vectorizer.idf_))



In [4]:
# Preview a handful of (term, IDF weight) pairs instead of dumping the whole
# vocabulary-sized dictionary into the notebook output.
dict(list(tf_idf_values.items())[:10])

NameError: name 'tf_idf_values' is not defined

In [10]:
# Feature matrix for clustering: the embedding part (index 0) of every entry
# in words_emb, in the original order.
X = list(map(lambda entry: entry[0], words_emb))

In [20]:
# Elbow method: fit k-means for a range of k and record the distortion, i.e.
# the mean Euclidean distance from each sample to its nearest cluster centre.
# This cell is slow on the full vocabulary (the recorded run was interrupted),
# so the loop shows a progress bar.

# Convert once up front so KMeans and cdist do not re-convert the Python list
# on every iteration.
X_matrix = np.asarray(X)

distortions = []
cluster_numbers = range(2, 20)
for k in tqdm(cluster_numbers, desc='k-means elbow'):
    k_means = KMeans(n_clusters=k, random_state=0)
    k_means.fit(X_matrix)
    # Distance of every sample to its closest centre, averaged with NumPy
    # rather than Python's built-in sum().
    nearest_centre_dist = cdist(X_matrix, k_means.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre_dist.mean())

# Straight reference line from the first to the last point of the curve; the
# elbow is where the curve departs most from it.
X_line = [cluster_numbers[0], cluster_numbers[-1]]
Y_line = [distortions[0], distortions[-1]]

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(cluster_numbers, distortions, 'b-')
ax.plot(X_line, Y_line, 'r')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


In [11]:
# Final clustering model: 10 clusters ("concepts") over the word embeddings,
# with a fixed seed so the assignment is reproducible.
kmeans = KMeans(random_state=0, n_clusters=10)
kmeans.fit(X)

KMeans(n_clusters=10, random_state=0)

In [12]:
# Sanity check: list the distinct cluster labels that were actually assigned.
np.unique(kmeans.labels_)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)

In [13]:
# Word -> cluster label lookup. If a word occurs more than once in words_emb,
# the label of its last occurrence wins.
concept_mapping = {
    entry[1]: label
    for entry, label in zip(words_emb, kmeans.labels_)
}

In [None]:
# Preview a handful of (word, concept) pairs instead of dumping the whole
# mapping (one entry per embedded word) into the notebook output.
dict(list(concept_mapping.items())[:10])

In [15]:
# One row per embedded word: the word, its embedding vector and the k-means
# cluster ("concept") it was assigned to.
res = pd.DataFrame(
    {
        'Word': [entry[1] for entry in words_emb],
        'Emb': [entry[0] for entry in words_emb],
        'Concept': kmeans.labels_,
    }
)

In [16]:
# Number of words per concept, largest first. In the recorded output the
# clusters are heavily imbalanced: one concept holds the bulk of the words.
res['Concept'].value_counts()

3    73714
2     3969
0     2390
7     2310
9     2069
1     1962
5     1832
8     1753
4     1730
6     1192
Name: Concept, dtype: int64

In [17]:
# For every concept, print the 10 vocabulary words whose Word2Vec vectors are
# most similar to the mean embedding of the concept's member words.
for concept in np.unique(kmeans.labels_):
    # Stack the member embeddings into one 2-D matrix and average along axis 0.
    # This avoids relying on np.mean reducing an object-dtype Series of arrays
    # element by element.
    member_vectors = np.stack(res.loc[res.Concept == concept, 'Emb'].tolist())
    concept_centroid = member_vectors.mean(axis=0)
    print('Concept:', concept)
    print(model.wv.most_similar(positive=[concept_centroid], topn=10))

Concept: 0
[('argumentista', 0.6818162202835083), ('atriz', 0.6619501113891602), ('trilogia', 0.6498917937278748), ('pianista', 0.6482418179512024), ('compositor', 0.637350857257843), ('trompetista', 0.6336904168128967), ('cantores', 0.6227751970291138), ('guitarrista', 0.6222512722015381), ('editou', 0.6219135522842407), ('romancista', 0.6213992834091187)]
Concept: 1
[('formal', 0.659369170665741), ('rejeitou', 0.6560016870498657), ('assumida', 0.6420570015907288), ('judicial.', 0.6219436526298523), ('demissao', 0.6132233142852783), ('questionado', 0.6036558151245117), ('relator', 0.5992388725280762), ('parlamento', 0.5944909453392029), ('governo', 0.5912910103797913), ('parlamentar', 0.5898439884185791)]
Concept: 2
[('informadas', 0.6677165031433105), ('reguladoras', 0.64616858959198), ('crediveis', 0.645114541053772), ('organizarem', 0.6147018074989319), ('agredidas', 0.5967496633529663), ('beneficiam', 0.5966504812240601), ('constituidas', 0.596274733543396), ('existirao', 0.592471

In [21]:
from collections import defaultdict

In [34]:
# One empty bucket per concept. The bucket count is taken from the fitted
# model rather than hard-coded, so it stays in sync with n_clusters.
t = {key: [] for key in range(kmeans.n_clusters)}


In [None]:
# Show the per-concept buckets (all empty at this point).
t

In [39]:
# Distribute the weights of the first document's words over the concept
# buckets in `t`. Words that are missing from either lookup table are skipped.
# Note that re-running this cell appends the same values to `t` again.
for word in docs_cleaned[0].split(' '):
    try:
        concept_id = concept_mapping[word]
        word_weight = tf_idf_values[word]
        t[concept_id].append(word_weight)
    except KeyError:
        # Best effort: ignore words with no concept or no weight.
        continue

In [None]:
# Show the per-concept buckets after adding the first document's word weights.
t