In [1]:
import sys
sys.path.append("..")

from services.segments_database import select_chapter_words_by_score, update_chapter_topic, do_query
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans

import numpy as np
import pandas as pd
from beakerx import *
from time import time

import matplotlib
import matplotlib.pyplot as plt

## Settings

In [2]:
# --- Vectorizer / reporting settings ---
no_features = 1000        # upper bound on the TF-IDF vocabulary size (max_features)
no_topics = 40
no_top_words = 10
no_top_documents = 2

# --- Data selection and weighting ---
min_score = 0             # argument handed to select_chapter_words_by_score
header_weight = 2         # how many times each header text is repeated in a feature

# --- Cluster counts to evaluate: 5, 10, ..., 100 ---
cluster_numbers = np.arange(start=5, stop=105, step=5)

## Get Chapters and Texts

In [3]:
# Load the chapters selected by `min_score` into a DataFrame.
# `chapters` is the result object returned by the project helper; only its
# fetchall() and keys() methods are used here.
chapters = select_chapter_words_by_score(min_score)

# Pass the column names to the constructor instead of assigning them afterwards:
# with an empty result set the frame would otherwise have zero columns and the
# later `.columns = ...` assignment would fail with a length-mismatch ValueError.
# fetchall() is still evaluated before keys(), as in the original cell.
df_chapters = pd.DataFrame(chapters.fetchall(), columns=list(chapters.keys()))
df_chapters.head()

Unnamed: 0,chapter_id,text,chapter_number,header,header_preprocessed,parent_header,parent_preprocessed,grandparent_header,grandparent_preprocessed,document_id,preprocessed
0,1,\nStaatskanzlei\n\nRathaus 8750 Glarus\n\nTele...,,\n,,,,,,7878,055 09 11 12 15 60 646 8750 fax glarus mail ra...
1,2,\nAusschreibungsgegenstand\n\nDer Kanton Glaru...,1.0,Ausschreibung; Arbeiten zur Installation und z...,ausschreibung arbeit installation betrieb elek...,,,,,7878,aktiv anforderung anforderungsbereich dabei do...
2,3,\nDer Regierungsrat will die elektronische Sti...,2.0,Ziele Projektziele:\n,ziel projektziel,,,,,7878,2018 2019 ausschreibung bund e elektronisch gl...
3,4,"\nEs wird vorausgesetzt, dass der Anbieter gen...",3.0,Marktpositionierung und Rolle des Anbieters\n,marktpositionierung rolle anbieters,,,,,7878,anbieter bereitstellung betrieb dass e genügen...
4,5,\nDie nachfolgende Grafik zeigt die Abgrenzung...,4.0,Systemabgrenzung\n,systemabgrenzung,,,,,7878,abgrenzung aktivität bestehen betreiben datens...


In [4]:
# Build one text per chapter for the vectorizer: the preprocessed header,
# parent header and grandparent header are each repeated `header_weight` times
# (so their terms occur more often than body terms), followed by the
# preprocessed chapter body.
header_columns = ['header_preprocessed', 'parent_preprocessed', 'grandparent_preprocessed']

# Missing values are replaced by '' first: string concatenation with NaN/None
# yields NaN, which would turn the whole feature of that chapter into NaN.
# The lstrip() removes the padding left over when a header is empty.
weighted_headers = [
    (df_chapters[column].fillna('') + " ").str.repeat(header_weight).str.lstrip()
    for column in header_columns
]

features = (weighted_headers[0]
            + weighted_headers[1]
            + weighted_headers[2]
            + df_chapters['preprocessed'].fillna(''))
features = features.str.strip()
features.head()

0    055 09 11 12 15 60 646 8750 fax glarus mail ra...
1    ausschreibung arbeit installation betrieb elek...
2    ziel projektziel ziel projektziel 2018 2019 au...
3    marktpositionierung rolle anbieters marktposit...
4    systemabgrenzung systemabgrenzung abgrenzung a...
dtype: object

## Calculate tf-idf

In [5]:
# Turn the chapter texts into a TF-IDF document-term matrix.
# Terms occurring in more than 95% of the chapters or in fewer than 2 chapters
# are dropped, and the vocabulary is limited to `no_features` terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features)
tfidf = tfidf_vectorizer.fit_transform(features)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back for older installations.
# The result is kept as a plain list in both cases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

## Calculate KMeans

In [None]:
# Fit one KMeans model per candidate cluster count and keep each fitted model
# together with its inertia, so the inertia curve can be plotted afterwards.
models = []
tt = time()  # start of the whole sweep

for n_cluster in cluster_numbers:
    print('STARTING TO CALCULATE MODEL WITH {} CLUSTERS'.format(n_cluster))
    # A fixed random_state makes the centroid initialisation, and therefore the
    # resulting inertia values, reproducible across kernel restarts.
    km = KMeans(n_clusters=n_cluster, init='k-means++', max_iter=100, n_init=10,
                verbose=False, random_state=42)
    t0 = time()  # start of this single fit
    km.fit(tfidf)
    print("done in %0.3fs" % (time() - t0))
    print('inertia {}'.format(km.inertia_))

    models.append({
        'model': km,
        'inertia': km.inertia_,
        'n_clusters': n_cluster
    })

# The original format string had no placeholder, so the elapsed time was
# silently dropped from the message.
print('FINISHED AFTER {:0.3f}s'.format(time() - tt))

STARTING TO CALCULATE MODEL WITH 5 CLUSTERS


## Plot Inertia

In [None]:
# Inertia as a function of the number of clusters, one point per fitted model.
inertia_cluster_counts = [entry['n_clusters'] for entry in models]
inertia_values = [entry['inertia'] for entry in models]

fig, ax = plt.subplots()
ax.plot(inertia_cluster_counts, inertia_values)

ax.set_xlabel('number of clusters')
ax.set_ylabel('inertia')
ax.grid()

plt.show()

## Find most important words

In [None]:
# `km` is the variable left over from the KMeans loop, i.e. the model fitted
# for the last entry of `cluster_numbers`.
# For every cluster, sort the feature indices by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back for older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
# Print the highest-weighted terms of each cluster centroid, one line per cluster.
# The limits come from the settings cell (`no_topics` clusters, `no_top_words`
# terms each) instead of hard-coded 40 / 10. The cluster count is also capped
# at the number of available centroids, so a model with fewer clusters than
# `no_topics` no longer raises an IndexError.
for cluster_idx in range(min(no_topics, len(order_centroids))):
    top_terms = ''.join(' %s' % terms[ind]
                        for ind in order_centroids[cluster_idx, :no_top_words])
    print("Cluster %d:%s" % (cluster_idx, top_terms))