# Imports

In [2]:
import sys
sys.path.append("..")

from services.segments_database import select_chapter_words_by_score, update_chapter_topic, do_query
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
import pandas as pd
from beakerx import *
from time import time

# Settings

In [3]:
no_features = 1000
no_topics = 40
no_top_words = 10
no_top_documents = 2
min_score = 0
header_weight = 2

# Functions

returns top words for every topic in topic model

In [65]:
def get_topics(H, W, feature_names, chapters, no_top_words, no_top_documents):
    topics = {
        'id': [],
        'word': []
    }
    for topic_idx, topic in enumerate(H):
        topics['id'].append(topic_idx)
        topics['word'].append(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    return pd.DataFrame(topics)

returns most important topic for every row in dataset

In [66]:
def get_most_important_topics(H, W, feature_names, no_top_words):
    most_important = []
    for w in W:
        max_id = w.argmax()
        topics = " ".join([feature_names[i]
                            for i in H[max_id].argsort()[:-no_top_words - 1:-1]])
        most_important.append(topics)
        
    return most_important

given a column_id, this function checks in how many rows this column has the highest value

In [55]:
def columnIsHighest(lda_W, column_id):
    m = []
    df = pd.DataFrame(lda_W)
    for index, row in df.iterrows():
        # check if column is highest value in row. also check if algorithm didn't guess same value for all cols. round is used because of floating point mistakes.
        if row.max() == row[column_id] and round(row.max(), 4) != round(row.mean(), 4):
            m.append(index)
    return m

# Script
## Get Chapters and Texts

In [4]:
chapters = select_chapter_words_by_score(min_score)
df_chapters = pd.DataFrame(chapters.fetchall())
df_chapters.columns = chapters.keys()
df_chapters.head()

Unnamed: 0,chapter_id,text,chapter_number,header,header_preprocessed,parent_header,parent_preprocessed,grandparent_header,grandparent_preprocessed,document_id,preprocessed
0,1,\nStaatskanzlei\n\nRathaus 8750 Glarus\n\nTele...,,\n,,,,,,7878,055 09 11 12 15 60 646 8750 fax glarus mail ra...
1,2,\nAusschreibungsgegenstand\n\nDer Kanton Glaru...,1.0,Ausschreibung; Arbeiten zur Installation und z...,ausschreibung arbeit installation betrieb elek...,,,,,7878,aktiv anforderung anforderungsbereich dabei do...
2,3,\nDer Regierungsrat will die elektronische Sti...,2.0,Ziele Projektziele:\n,ziel projektziel,,,,,7878,2018 2019 ausschreibung bund e elektronisch gl...
3,4,"\nEs wird vorausgesetzt, dass der Anbieter gen...",3.0,Marktpositionierung und Rolle des Anbieters\n,marktpositionierung rolle anbieters,,,,,7878,anbieter bereitstellung betrieb dass e genügen...
4,5,\nDie nachfolgende Grafik zeigt die Abgrenzung...,4.0,Systemabgrenzung\n,systemabgrenzung,,,,,7878,abgrenzung aktivität bestehen betreiben datens...


features = chapter content + header_weight * chapter headers
because chapter headers is more meaningful than content

In [5]:
features = (df_chapters['header_preprocessed'] + " ").str.repeat(header_weight).str.lstrip() \
            + (df_chapters['parent_preprocessed'] + " ").str.repeat(header_weight).str.lstrip() \
            + (df_chapters['grandparent_preprocessed'] + " ").str.repeat(header_weight).str.lstrip() \
            + df_chapters['preprocessed']
features = features.str.strip()
features.head()

0    055 09 11 12 15 60 646 8750 fax glarus mail ra...
1    ausschreibung arbeit installation betrieb elek...
2    ziel projektziel ziel projektziel 2018 2019 au...
3    marktpositionierung rolle anbieters marktposit...
4    systemabgrenzung systemabgrenzung abgrenzung a...
dtype: object

## Calculate tf-idf
for NMF

In [67]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features)
tfidf = tfidf_vectorizer.fit_transform(features)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

## Calculate tf
for LDA

In [6]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features)
tf = tf_vectorizer.fit_transform(features)
tf_feature_names = tf_vectorizer.get_feature_names()

## Calculate NMF

In [73]:
topics_nmf = []
no_topics = np.arange(5, 105, 5)

for nt in no_topics:
    print('calculating model with {} topics'.format(nt))
    
    nmf_model = NMF(n_components=nt, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
    nmf_W = nmf_model.transform(tfidf)
    nmf_H = nmf_model.components_
    
    topics_nmf.append({
        'no_topics': nt,
        'topics': get_topics(nmf_H, nmf_W, tfidf_feature_names, df_chapters['text'], no_top_words, no_top_documents),
        'model': nmf_model
    })

calculating model with 5 topics
calculating model with 10 topics
calculating model with 15 topics
calculating model with 20 topics
calculating model with 25 topics
calculating model with 30 topics
calculating model with 35 topics
calculating model with 40 topics
calculating model with 45 topics
calculating model with 50 topics
calculating model with 55 topics
calculating model with 60 topics
calculating model with 65 topics
calculating model with 70 topics
calculating model with 75 topics
calculating model with 80 topics
calculating model with 85 topics
calculating model with 90 topics
calculating model with 95 topics
calculating model with 100 topics


## Calculate LDA

In [9]:
topics = []
no_topics = np.arange(5, 105, 5)

for nt in no_topics:
    print('calculating model with {} topics'.format(nt))
    
    lda_model = LatentDirichletAllocation(n_components=nt, max_iter=15, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
    lda_W = lda_model.transform(tf)
    lda_H = lda_model.components_
    
    topics.append({
        'no_topics': nt,
        'topics': get_topics(lda_H, lda_W, tf_feature_names, df_chapters['text'], no_top_words, no_top_documents),
        'model': lda_model
    })

calculating model with 5 topics
calculating model with 10 topics
calculating model with 15 topics
calculating model with 20 topics
calculating model with 25 topics
calculating model with 30 topics
calculating model with 35 topics
calculating model with 40 topics
calculating model with 45 topics
calculating model with 50 topics
calculating model with 55 topics
calculating model with 60 topics
calculating model with 65 topics
calculating model with 70 topics
calculating model with 75 topics
calculating model with 80 topics
calculating model with 85 topics
calculating model with 90 topics
calculating model with 95 topics
calculating model with 100 topics


Save topics as csv

In [74]:
for t in topics_nmf:
    t['topics'].to_csv('topic_sizes/nmf_{}_15i.csv'.format(t['no_topics']))

In [75]:
to_check = [
    [0,4],
    [1,4],
    [2,4],
    [3,4],
    [4,4],
    [5,4],
    [6,4],
    [7,4],
    [8,4],
    [9,4],
    [10,4],
    [11,4],
    [12,4],
    [12,4],
    [13,4],
    [13,4],
    [14,4],
    [15,4],
    [16,4],
    [17,4],
]
for model in to_check:
    nmf_W = topics_nmf[model[0]]['model'].transform(tfidf)
    m = columnIsHighest(nmf_W, model[1])
    print(len(m))

31724
14015
12844
11738
10918
9665
7639
7281
6621
6530
6099
5671
5375
5375
5297
5297
5933
5067
5083
5106


In [None]:
t0 = time()
lda_model = LatentDirichletAllocation(n_components=40, max_iter=50, learning_method='online', learning_offset=50.,random_state=0, n_jobs=-1).fit(tf)
print(time()-t0)

In [None]:
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_
get_topics(lda_H, lda_W, tf_feature_names, df_chapters['text'], no_top_words, no_top_documents).to_csv('topic_sizes/40_50r.csv')
get_topics()

# Analyze

## Display NMF

In [72]:
nmf_topics = get_topics(nmf_H, nmf_W, tfidf_feature_names, df_chapters['text'], no_top_words, no_top_documents)
nmf_topics

Unnamed: 0,id,word
0,0,übrig departement dritter drei dokumentation d...
1,1,allgemeine unternehmen atb teilnahmebedingung ...
2,2,angabe unternehmen sonstig administrative weit...
3,3,übrig departement dritter drei dokumentation d...
4,4,zuschlagskriterium zk gewichtung eignungsund b...
5,5,le m2 ausmass pos m3 beton baustelleneinrichtu...
6,6,bestimmung besonderer 102 objektspezifisch npk...
7,7,ausschreibung eignungsund organisation gegenst...
8,8,wto gatt abkommen ja resp nein gemäss vorausse...
9,9,nr positionstext pos artikel referenzobjekt ab...


## Display LDA

In [None]:
topics_lda = get_topics(lda_H, lda_W, tf_feature_names, df_chapters['text'], no_top_words, no_top_documents)
topics_lda

## concat scores and chapters

## Save

save the 5 most important topics

In [None]:
def find_highest_five(nmf_W):
    highest_five = []
    for i, w in enumerate(nmf_W):
        found = False
        for h in highest_five:
            if w >= h['value']:
                highest_five.append({
                    'index': i,
                    'value': w
                })
                found = True
                break
        if len(highest_five) < 5 and not found:
            highest_five.append({
                'index': i,
                'value': w
            })
        highest_five = sorted(highest_five, key=lambda x: x['value'], reverse=True)[0:5]
    return highest_five

In [None]:
l = []
for w in nmf_W:
    l.append(find_highest_five(w))
nmf_df = pd.DataFrame(l)
nmf_df.columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5']
nmf_df.insert(0, 'features', features)
nmf_df = pd.concat([df_chapters.iloc[:,0:4], nmf_df], axis=1)
nmf_df.head()

In [None]:
l = []
for w in nmf_W:
    l.append(find_highest_five(w))

In [None]:
print(nmf_W.shape)
print(len(l))

In [None]:
nmf_df2 = df_chapters.iloc[:,0:4]
nmf_df2['topic_1'] = Series(map(lambda x: x[0], l), index=nmf_df2.index)
nmf_df2['topic_2'] = Series(map(lambda x: x[1], l), index=nmf_df2.index)
nmf_df2['topic_3'] = Series(map(lambda x: x[2], l), index=nmf_df2.index)
nmf_df2['topic_4'] = Series(map(lambda x: x[3], l), index=nmf_df2.index)
nmf_df2['topic_5'] = Series(map(lambda x: x[4], l), index=nmf_df2.index)

nmf_df2.head()
nmf_df = nmf_df2

In [None]:
for i, d in nmf_df.iterrows():
    update_chapter_topic(d['chapter_id'], d['topic_1'], d['topic_2'], d['topic_3'], d['topic_4'], d['topic_5'])
    if i % 100 == 0:
        print('updating chapter {0} of {1}'.format(i, len(nmf_df)))

Inserting topics

In [None]:
do_query('truncate table topic')
for i, topic in nmf_topics.iterrows():
    do_query('insert into topic (id, words) values ({0}, "{1}")'.format(topic['id'], topic['word']))

# Evaluate

## Export Tables

In [None]:
nmf_df.to_csv('nmf_topics.csv')
lda_df.to_csv('lda_topics.csv')