In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Make the graphs a bit prettier using matplotlib's own style sheet.
# The pandas option 'display.mpl_style' that was set here before was deprecated
# in pandas 0.18 and later removed, so setting it fails on newer pandas.
plt.style.use('ggplot')
# Default to large square figures; set after the style so it is not overridden.
plt.rcParams['figure.figsize'] = (15, 15)

In [3]:
from cptm.utils.topics import get_top_topic_words, topic_str
import codecs
import os

def topics2txt(n, config, topics=None):
    """Write the top ``n`` words of every topic to a text file, one topic per line.

    The resulting file can be used as input for calculating topic coherence
    using Palmetto.

    Parameters
    ----------
    n : int
        Number of top words requested per topic; also used in the file name.
    config : dict-like
        Must provide ``outDir`` (a format string with one ``{}`` placeholder
        that receives the file name) and ``nTopics`` (number of topics to
        write).
    topics : optional
        Topic data handed to ``get_top_topic_words``. When omitted, the
        notebook-level ``topics`` variable is used, so existing two-argument
        calls keep working.

    Raises
    ------
    NameError
        If ``topics`` is not given and no notebook-level ``topics`` exists.
    """
    if topics is None:
        # Backward-compatible fallback: earlier versions read the global directly.
        try:
            topics = globals()['topics']
        except KeyError:
            raise NameError("global name 'topics' is not defined")

    # NOTE(review): the '100' in the file name is fixed and does not follow
    # config['nTopics']; kept as-is because other cells refer to this name.
    out_path = config.get('outDir').format('top_{}_topics_100.txt'.format(n))

    with codecs.open(out_path, 'w', encoding='utf-8') as f:
        for i in range(config.get('nTopics')):
            top = get_top_topic_words(topics, {}, i, n)
            t = topic_str(top, single_line=True, weights=False, opinions=False)
            # Strip the label prefix so the line contains only the words.
            t = t.replace('topic:\t', '')
            # Lowercase: word justitie is capitalized (for some reason)
            f.write(t.lower() + '\n')

In [4]:
from cptm.utils.experiment import load_topics

# Results directory, derived from the current user's home directory so that it
# points at the same location as the `~/data/dilipad/results/...` paths used by
# the Palmetto shell commands in later cells.
results_dir = os.path.join(
    os.path.expanduser('~'),
    'data', 'dilipad', 'results', 'all_years', 'all_years-adj-parties'
)

# Experiment configuration passed to load_topics() and topics2txt().
config = {
    "inputData": "/path/to/input/data/*",
    # Format string: the '{}' placeholder receives the output file name.
    "outDir": os.path.join(results_dir, '{}'),
    "nTopics": 100
}

topics = load_topics(config)

In [5]:
# Write the top-10 words of each topic to 'top_10_topics_100.txt' under config['outDir'].
topics2txt(10, config)

In [6]:
# One-off Palmetto run for the NPMI measure (IPython shell capture into `tc`).
# The per-measure loop in a later cell runs NPMI again and reassigns `tc`.
tc = !java -jar ~/code/Palmetto/target/Palmetto-jar-with-dependencies.jar ~/data/nlwiki-palmetto/nlwiki-palmetto NPMI ~/data/dilipad/results/all_years/all_years-adj-parties/top_10_topics_100.txt

In [10]:
def add_data(data, measure, tc):
    """Parse per-line scores from captured output and store them under ``measure``.

    Each line is split on whitespace; for lines with at least two fields the
    second field is converted to ``float`` and collected. Lines with fewer
    than two fields (e.g. blank lines) are skipped. The number of collected
    scores is printed.

    Parameters
    ----------
    data : dict
        Mapping that is updated in place; ``data[measure]`` is replaced by the
        new list of scores.
    measure : str
        Key under which the scores are stored.
    tc : iterable of str
        Output lines to parse (in this notebook: the captured Palmetto output).

    Returns
    -------
    dict
        The same ``data`` object that was passed in.

    Raises
    ------
    ValueError
        If the second field of a line cannot be converted to ``float``.
    """
    scores = data[measure] = []
    for line in tc:
        parts = line.split()
        if len(parts) > 1:
            scores.append(float(parts[1]))
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(len(scores))
    return data

# Collects one list of parsed scores per coherence measure (filled in place by add_data).
data = {}
# Coherence measure names passed to Palmetto on the command line.
measures = ['C_A', 'C_P', 'C_V', 'NPMI', 'UCI', 'UMass']

for measure in measures:
    # Python 2 print statement; the trailing comma keeps the count printed by
    # add_data on the same output line.
    print measure, 
    # IPython shell capture: run Palmetto for this measure on the top-10 topic words file.
    tc = !java -jar ~/code/Palmetto/target/Palmetto-jar-with-dependencies.jar ~/data/nlwiki-palmetto/nlwiki-palmetto {measure} ~/data/dilipad/results/all_years/all_years-adj-parties/top_10_topics_100.txt
    # Drop the first output line -- presumably a header/log line rather than a score; TODO confirm.
    tc = tc[1:]
    add_data(data, measure, tc)

 C_A 100
C_P 100
C_V 100
NPMI 100
UCI 100
UMass 100


In [11]:
# One column per coherence measure, one row per parsed score.
result = pd.DataFrame(data)

# Print min / max / mean per measure. Iterating over the frame's columns gives
# the same order as the table itself instead of arbitrary dict-key order, and
# the single-argument print calls behave the same on Python 2 and 3.
for k in result.columns:
    print(k)
    print('min {}'.format(result[k].min()))
    print('max {}'.format(result[k].max()))
    print('mean {}'.format(result[k].mean()))
    print('')

# Persist the scores next to the other results in config['outDir'].
result.to_csv(config.get('outDir').format('topic_coherence_100.csv'))

C_A
min 0.03898
max 0.2453
mean 0.1277559

NPMI
min -0.35166
max 0.05113
mean -0.1941005

C_P
min -0.95045
max 0.66392
mean -0.0518794

UMass
min -9.5712
max -1.85845
mean -3.541935

C_V
min 0.364
max 0.72889
mean 0.529328

UCI
min -9.798
max -0.3934
mean -5.9468756



In [12]:
# Display the table of scores (one column per coherence measure).
result

Unnamed: 0,C_A,C_P,C_V,NPMI,UCI,UMass
0,0.03898,-0.60041,0.65028,-0.22307,-6.34998,-9.57120
1,0.11500,-0.16257,0.58531,-0.15078,-4.97266,-6.32393
2,0.18909,0.18736,0.44293,-0.21050,-6.72275,-2.45187
3,0.16067,0.15556,0.48575,-0.17730,-5.84751,-2.48383
4,0.16875,0.31599,0.49672,-0.12774,-4.41198,-2.82026
5,0.16515,0.04861,0.48438,-0.19237,-6.04762,-2.97680
6,0.21276,0.40015,0.51787,-0.03771,-2.81850,-2.20372
7,0.12105,-0.07131,0.49716,-0.32625,-9.21279,-2.30136
8,0.06942,-0.02266,0.62766,-0.24036,-7.09844,-2.35078
9,0.07406,-0.38392,0.61208,-0.25675,-7.32136,-5.80428
