In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Make the graphs a bit prettier. The pandas option 'display.mpl_style' used
# here originally was deprecated in pandas 0.18 and later removed (setting it
# then raises an OptionError); matplotlib's own style sheets replace it.
plt.style.use('ggplot')
# Set the default figure size after applying the style, so the style sheet
# cannot override it.
plt.rcParams['figure.figsize'] = (15, 15)

In [2]:
from cptm.utils.experiment import load_topics

# Settings shared by the cells below. The '{}' placeholder in outDir is filled
# in with a file name whenever a result file is read or written.
config = dict(
    inputData="/path/to/input/data/*",
    outDir="/home/jvdzwaan/data/dilipad/results/all_years-adj-selected_parties_100/{}",
    nTopics=100,
)

# Topics are obtained from cptm's load_topics helper using these settings.
topics = load_topics(config)

In [3]:
from cptm.utils.topics import get_top_topic_words, topic_str
import codecs
import os

def topics2txt(n, config):
    """Write the top n words of every topic to a text file.

    The file is created at config['outDir'] formatted with the name
    'top_<n>_topics_100.txt' and holds one topic per line, for topic numbers
    0 up to (but not including) config['nTopics']. The resulting file can be
    used as input for calculating topic coherence using Palmetto.

    The topics are read from the notebook-level variable `topics`.
    """
    out_file = config.get('outDir').format('top_{}_topics_100.txt'.format(n))
    with codecs.open(out_file, 'w', encoding='utf-8') as f:
        for topic_id in range(config.get('nTopics')):
            words = get_top_topic_words(topics, {}, topic_id, n)
            line = topic_str(words, single_line=True, weights=False, opinions=False)
            # Strip the label prefix and lowercase everything: the word
            # justitie is capitalized (for some reason).
            line = line.replace('topic:\t', '').lower()
            f.write(line + '\n')

In [4]:
# Write the top 10 words of each topic to top_10_topics_100.txt in the output
# directory; the Palmetto cells below read that file.
topics2txt(10, config)

  topic.sort(ascending=False)


In [None]:
# Run the Palmetto jar on the top-10 words file with the NPMI measure; IPython
# captures the command's output lines in tc.
tc = !java -jar ~/code/Palmetto/target/Palmetto-jar-with-dependencies.jar ~/data/nlwiki-palmetto-stopped/nlwiki-palmetto-stopped NPMI {config.get('outDir').format('top_10_topics_100.txt')}

In [8]:
def add_data(data, measure, tc):
    """Store the scores found in the output lines tc under data[measure].

    Every line is split on whitespace. Lines with fewer than two fields
    (e.g. blank lines) are skipped; for all other lines the second field is
    parsed as a float. Scores previously stored under `measure` are replaced.
    The number of scores found is printed.

    Args:
        data: dict mapping a measure name to a list of scores; modified in
            place.
        measure: key under which the scores are stored.
        tc: iterable of output lines (strings).

    Returns:
        The same `data` object that was passed in.

    Raises:
        ValueError: if the second field of a line is not a number. In that
            case `data` is left unchanged.
    """
    scores = []
    for line in tc:
        parts = line.split()
        if len(parts) > 1:
            scores.append(float(parts[1]))
    # Assign only after all lines parsed, so a failure never leaves a
    # half-filled list behind.
    data[measure] = scores
    # Single-argument print(...) gives the same output as a Python 2 print
    # statement and as the Python 3 print function.
    print(len(scores))
    return data

# Scores per measure: measure name -> list of floats filled in by add_data
data = {}
# Names of the coherence measures passed to Palmetto on the command line
measures = ['C_A', 'C_P', 'C_V', 'NPMI', 'UCI', 'UMass']

for measure in measures:
    # Trailing comma: stay on the same line, so the count printed by add_data
    # ends up next to the measure name.
    print measure, 
    # Run the Palmetto jar for this measure on the top-10 words file; IPython
    # captures the command's output lines in tc.
    tc = !java -jar ~/code/Palmetto/target/Palmetto-jar-with-dependencies.jar ~/data/nlwiki-palmetto-stopped/nlwiki-palmetto-stopped {measure} {config.get('outDir').format('top_10_topics_100.txt')}
    # Drop the first output line (presumably a header/log line rather than a
    # score -- TODO confirm against Palmetto's output).
    tc = tc[1:]
    add_data(data, measure, tc)

C_A 100
C_P 100
C_V 100
NPMI 100
UCI 100
UMass 100


In [9]:
# One column per coherence measure, one row per score collected by add_data
result = pd.DataFrame(data)

# Print the minimum, maximum and mean score of every measure
# (dict iteration order, so not necessarily the order of `measures`).
for k in data.keys():
    print k
    print 'min', min(result[k])
    print 'max', max(result[k])
    print 'mean', np.mean(result[k])
    print

# Save all scores as topic_coherence_100.csv in the output directory
result.to_csv(config.get('outDir').format('topic_coherence_100.csv'))

C_A
min 0.05724
max 0.35435
mean 0.1754591

NPMI
min -0.3412
max 0.11236
mean -0.1633153

C_P
min -0.84292
max 0.61506
mean -0.0716679

UMass
min -12.15041
max -2.03488
mean -5.4488292

C_V
min 0.32653
max 0.77135
mean 0.5549458

UCI
min -9.83813
max 0.43101
mean -5.3473584



In [10]:
# Display the table of coherence scores (one column per measure)
result

Unnamed: 0,C_A,C_P,C_V,NPMI,UCI,UMass
0,0.17891,-0.38273,0.61482,-0.28606,-8.18575,-5.46086
1,0.15833,0.26531,0.59502,-0.01964,-2.50232,-5.99031
2,0.13729,-0.54796,0.67243,-0.29438,-8.34550,-8.21323
3,0.20078,0.41547,0.58556,0.00711,-2.17348,-3.27842
4,0.17393,-0.34264,0.63761,-0.34120,-9.83813,-4.94111
5,0.25105,0.29664,0.54227,-0.09178,-4.13464,-3.29363
6,0.14852,0.02555,0.55748,-0.09579,-3.76102,-7.15361
7,0.20199,0.41692,0.39129,-0.00991,-1.22039,-2.33781
8,0.12013,-0.28862,0.65829,-0.29821,-8.57306,-4.97552
9,0.07971,-0.65674,0.70215,-0.30055,-8.39661,-10.84544
