# Imports

In [1]:
import sys
sys.path.append("..")

from services.segments_database import select_chapter_words_by_score, update_chapter_topic, do_query
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
import pandas as pd
from beakerx import *

# Settings

In [2]:
# Vocabulary size cap: passed as `max_features` to both vectorizers.
no_features = 1000
# Number of topics: passed as `n_components` to both NMF and LDA.
no_topics = 40
# How many top-weighted words are used to describe a topic.
no_top_words = 10
# Passed to get_topics(), which currently does not use it.
no_top_documents = 2
# Argument handed to select_chapter_words_by_score() when loading chapters.
min_score = 0
# How many times each preprocessed header is repeated in a chapter's feature text.
header_weight = 2

# Code
## Get Chapters and Texts

In [3]:
# Load the chapters selected by `min_score` from the segments database.
chapters = select_chapter_words_by_score(min_score)
# Naming the columns in the constructor (instead of assigning `.columns`
# afterwards) keeps the frame well-formed even when the query returns no
# rows; the late assignment raises a length-mismatch error in that case.
df_chapters = pd.DataFrame(chapters.fetchall(), columns=list(chapters.keys()))
df_chapters.head()

Unnamed: 0,chapter_id,text,chapter_number,header,header_preprocessed,parent_header,parent_preprocessed,grandparent_header,grandparent_preprocessed,document_id,preprocessed
0,1,\nStaatskanzlei\n\nRathaus 8750 Glarus\n\nTele...,,\n,,,,,,7878,055 09 11 12 15 60 646 8750 fax glarus mail ra...
1,2,\nAusschreibungsgegenstand\n\nDer Kanton Glaru...,1.0,Ausschreibung; Arbeiten zur Installation und z...,ausschreibung arbeit installation betrieb elek...,,,,,7878,aktiv anforderung anforderungsbereich dabei do...
2,3,\nDer Regierungsrat will die elektronische Sti...,2.0,Ziele Projektziele:\n,ziel projektziel,,,,,7878,2018 2019 ausschreibung bund e elektronisch gl...
3,4,"\nEs wird vorausgesetzt, dass der Anbieter gen...",3.0,Marktpositionierung und Rolle des Anbieters\n,marktpositionierung rolle anbieters,,,,,7878,anbieter bereitstellung betrieb dass e genügen...
4,5,\nDie nachfolgende Grafik zeigt die Abgrenzung...,4.0,Systemabgrenzung\n,systemabgrenzung,,,,,7878,abgrenzung aktivität bestehen betreiben datens...


Features = chapter content + header_weight × chapter headers (own, parent and grandparent header),
because chapter headers are more meaningful than the content.

In [4]:
# Header columns whose text is repeated `header_weight` times, in the order
# in which they are prepended to the chapter content.
header_columns = ['header_preprocessed', 'parent_preprocessed', 'grandparent_preprocessed']

# Missing values are replaced by empty strings first: a single NaN/None in
# any column would otherwise turn the whole concatenated feature into NaN,
# which the vectorizers cannot process.
features = df_chapters['preprocessed'].fillna("")
for column in reversed(header_columns):
    # "<header> " repeated header_weight times; lstrip() collapses the
    # blanks-only result of an empty header to "".
    weighted_header = (df_chapters[column].fillna("") + " ").str.repeat(header_weight).str.lstrip()
    features = weighted_header + features
features = features.str.strip()
features.head()

0    055 09 11 12 15 60 646 8750 fax glarus mail ra...
1    ausschreibung arbeit installation betrieb elek...
2    ziel projektziel ziel projektziel 2018 2019 au...
3    marktpositionierung rolle anbieters marktposit...
4    systemabgrenzung systemabgrenzung abgrenzung a...
dtype: object

## Calculate tf-idf
for NMF

In [5]:
# tf-idf representation of the chapter features (input for NMF).
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features)
tfidf = tfidf_vectorizer.fit_transform(features)
# get_feature_names() no longer exists in recent scikit-learn releases;
# use its replacement when available and fall back for older versions.
# list() keeps the result a plain Python list in both cases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

## Calculate tf
for LDA

In [6]:
# Raw term counts of the chapter features (input for LDA).
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features)
tf = tf_vectorizer.fit_transform(features)
# get_feature_names() no longer exists in recent scikit-learn releases;
# use its replacement when available and fall back for older versions.
# list() keeps the result a plain Python list in both cases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

## Calculate NMF

In [7]:
# Factorize the tf-idf matrix into `no_topics` topics.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` — confirm against the pinned version.
nmf_model = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
)
nmf_model.fit(tfidf)

# W: the tf-idf rows transformed into topic space; H: the fitted topic components.
nmf_H = nmf_model.components_
nmf_W = nmf_model.transform(tfidf)

## Calculate LDA

In [8]:
# Fit an online LDA model with `no_topics` topics on the raw term counts.
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda_model.fit(tf)

# W: the count rows transformed into topic space; H: the fitted topic components.
lda_H = lda_model.components_
lda_W = lda_model.transform(tf)

# Analyze
## Function for displaying

In [9]:
def get_topics(H, W, feature_names, chapters, no_top_words, no_top_documents):
    """Describe every topic by its highest-weighted words.

    Parameters
    ----------
    H : iterable of arrays
        One weight vector per topic; each entry belongs to the feature with
        the same position in `feature_names`.
    W, chapters, no_top_documents
        Accepted for call compatibility; not used by this function.
    feature_names : sequence of str
        Feature (word) names indexed like the entries of a topic vector.
    no_top_words : int
        Number of words used to describe a topic.

    Returns
    -------
    pandas.DataFrame
        Columns `id` (position of the topic in `H`) and `word` (the top
        words, highest weight first, joined by single spaces).
    """
    descriptions = [
        # argsort() is ascending, so reverse it and keep the leading entries.
        " ".join(feature_names[position]
                 for position in weights.argsort()[::-1][:no_top_words])
        for weights in H
    ]
    return pd.DataFrame({
        'id': list(range(len(descriptions))),
        'word': descriptions,
    })

In [10]:
def get_most_important_topics(H, W, feature_names, no_top_words):
    """Return, for every row of `W`, the top words of its strongest topic.

    Parameters
    ----------
    H : array-like
        Topic weight vectors, indexable by topic number.
    W : iterable of arrays
        One vector of topic weights per row.
    feature_names : sequence of str
        Feature (word) names indexed like the entries of a topic vector.
    no_top_words : int
        Number of words used to describe a topic.

    Returns
    -------
    list of str
        One space-joined word list (highest weight first) per row of `W`.
    """
    def describe(topic_id):
        # argsort() is ascending, so reverse it and keep the leading entries.
        ranked = H[topic_id].argsort()[::-1][:no_top_words]
        return " ".join(feature_names[position] for position in ranked)

    return [describe(weights.argmax()) for weights in W]

## Display NMF

In [11]:
# Top words of every NMF topic.
nmf_topics = get_topics(
    H=nmf_H,
    W=nmf_W,
    feature_names=tfidf_feature_names,
    chapters=df_chapters['text'],
    no_top_words=no_top_words,
    no_top_documents=no_top_documents,
)
nmf_topics

Unnamed: 0,id,word
0,0,angebot gültigkeit einreichung verbindlichkeit...
1,1,art abs bauherr abnahme auftraggebers folgen p...
2,2,angabe weit vergabeverfahren allgemeine untern...
3,3,adresse name ort mail telefon plz fax auftragg...
4,4,zuschlagskriterium zk gewichtung eignungsund b...
5,5,mm st inkl typ le stk dn fabrikat m2 50
6,6,bestimmung besonderer vergabeverfahren werklei...
7,7,information verhandlung administratives vertra...
8,8,variante teilangebot zulassen nein ja zulässig...
9,9,ausschreibung organisation eignungsund gegenst...


## Display LDA

In [12]:
# Top words of every LDA topic.
topics_lda = get_topics(
    H=lda_H,
    W=lda_W,
    feature_names=tf_feature_names,
    chapters=df_chapters['text'],
    no_top_words=no_top_words,
    no_top_documents=no_top_documents,
)
topics_lda

Unnamed: 0,id,word
0,0,ausschreibungsunterlage offerte los variante s...
1,1,bkp kosten bestätigung phase 300 projektorgani...
2,2,nr table plan bauvorhaben bemerkung sanierung ...
3,3,einzurechnen unternehmer material einheitsprei...
4,4,etc anlage werkleistung richtlinie beim spezie...
5,5,allgemeine bestimmung subunternehmer lieferung...
6,6,unternehmung firma anbieterin objekt name funk...
7,7,ausführung leistungsverzeichnis enthalten gesa...
8,8,zürich stadt zusätzlich preis ag neu vertragsp...
9,9,ausschreibung dokument öffentlich inhaltsverze...


## concat scores and chapters

## Save

Save the 5 most important topics of each chapter.

In [None]:
def find_highest_five(nmf_W, count=5):
    """Return the largest entries of a weight vector together with their positions.

    Parameters
    ----------
    nmf_W : iterable of numbers
        The weights to rank. Despite the name, the notebook passes a single
        row of the document-topic matrix here, not the whole matrix.
    count : int, optional
        Maximum number of entries to return (default 5).

    Returns
    -------
    list of dict
        At most `count` dicts ``{'index': position, 'value': weight}``,
        ordered by descending weight. Equal weights keep their original
        order, so the earlier position wins a tie.
    """
    entries = [{'index': position, 'value': weight}
               for position, weight in enumerate(nmf_W)]
    # sorted() is stable, also with reverse=True, so one sort gives the same
    # ranking as inserting element by element and re-sorting every time.
    ranked = sorted(entries, key=lambda entry: entry['value'], reverse=True)
    return ranked[:max(count, 0)]

In [None]:
# Top-five topic entries for every row of the NMF document-topic matrix.
l = [find_highest_five(weights) for weights in nmf_W]

nmf_df = pd.DataFrame(l)
nmf_df.columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5']
nmf_df.insert(0, 'features', features)

# Put the first four chapter columns in front of the feature/topic columns.
chapter_columns = df_chapters.iloc[:, 0:4]
nmf_df = pd.concat([chapter_columns, nmf_df], axis=1)
nmf_df.head()

In [None]:
# Top-five topic entries per row of nmf_W (same computation as in the previous cell).
l = [find_highest_five(weights) for weights in nmf_W]

In [None]:
# Sanity check: `l` should hold one entry per row of nmf_W.
print(nmf_W.shape, len(l), sep='\n')

In [None]:
# Work on a copy: adding columns to a bare iloc slice writes to a view of
# df_chapters and triggers pandas' SettingWithCopyWarning.
nmf_df2 = df_chapters.iloc[:, 0:4].copy()

# One column per rank (topic_1 .. topic_5), each holding that rank's entry of
# every chapter's top-topic list. Rows with fewer entries are padded with None.
for rank in range(5):
    nmf_df2['topic_{0}'.format(rank + 1)] = pd.Series(
        [top_topics[rank] if rank < len(top_topics) else None for top_topics in l],
        index=nmf_df2.index,
    )

nmf_df = nmf_df2
nmf_df.head()

In [None]:
# Write the five topic entries of every chapter through update_chapter_topic().
topic_columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5']
chapter_total = len(nmf_df)
for row_label, chapter in nmf_df.iterrows():
    update_chapter_topic(chapter['chapter_id'], *[chapter[name] for name in topic_columns])
    # Report progress every 100th row label.
    if row_label % 100 == 0:
        print('updating chapter {0} of {1}'.format(row_label, chapter_total))

Inserting topics

In [None]:
# Empty the topic table, then insert one row per NMF topic.
do_query('truncate table topic')

# NOTE(review): the statement is assembled by string formatting; a double
# quote inside a topic's words would break it. Use parameter binding if
# do_query() supports it — confirm.
insert_statement = 'insert into topic (id, words) values ({0}, "{1}")'
for _, topic_row in nmf_topics.iterrows():
    do_query(insert_statement.format(topic_row['id'], topic_row['word']))

# Evaluate

## Export Tables

In [None]:
# `lda_df` is not defined in any earlier cell, so exporting it raised a
# NameError. Build it here from the LDA document-topic matrix in the same
# way the NMF frame is built: first four chapter columns plus the five
# highest-weighted topic entries per chapter.
lda_top_topics = [find_highest_five(weights) for weights in lda_W]
lda_df = df_chapters.iloc[:, 0:4].copy()
for rank in range(5):
    lda_df['topic_{0}'.format(rank + 1)] = pd.Series(
        [top_topics[rank] if rank < len(top_topics) else None for top_topics in lda_top_topics],
        index=lda_df.index,
    )

nmf_df.to_csv('nmf_topics.csv')
lda_df.to_csv('lda_topics.csv')