In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Notebook-wide display configuration.
# NOTE(review): the 'display.mpl_style' option appears to have been removed
# from newer pandas releases -- confirm the pandas version this notebook targets.
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
# Default size for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (15, 15)

In [2]:
# get topic and opinion vocabularies with frog lemmas
from cptm.utils.experiment import load_topics, load_opinions

# Experiment settings handed to the cptm loaders below.
config = {
    "inputData": "/path/to/input/data/*",
    # The '{}' placeholder is filled in with a file name via str.format()
    # when a specific results file is opened further down in the notebook.
    "outDir": "/home/jvdzwaan/data/dilipad/results/all_years-adj-cabinets_selected-parties/{}",
    "nTopics": 100
}

# NOTE(review): judging by how they are used later, `topics` exposes an
# `.index` of words and `opinions` is a dict-like of such objects -- confirm
# against cptm.utils.experiment.
topics = load_topics(config)
opinions = load_opinions(config)

In [3]:
# Topic vocabulary: the index labels of `topics`.
topic_words = topics.index
print topic_words
# Opinion vocabulary: the index labels of the first entry in `opinions`.
# `keys()[0]` relies on Python 2, where keys() returns a list.
# NOTE(review): presumably every entry shares the same index -- confirm.
opinion_words = opinions[opinions.keys()[0]].index
print opinion_words

Index([u'baangarantie', u'ontwerprichtlijn', u'scheuring', u'cliëntniveau',
       u'leveringsakte', u'verzekeringsplichtig', u'verdubbeling',
       u'takenpakket', u'flutverhaal', u'ad-hocadvisering',
       ...
       u'besturingsmodel', u'kraamperiode', u'verkeersleidingscentrum', u'ík',
       u'celgebruik', u'woningprijs', u'goederenverkeer', u'heenzending',
       u'gezinsbijstand', u'projectleiding'],
      dtype='object', length=38145)
Index([u'verzekeringsplichtig', u'statisch', u'digitaal', u'kindvriendelijk',
       u'fout', u'onverholen', u'vakgericht', u'kwestieus', u'kwistig',
       u'deels',
       ...
       u'pasklaar', u'tonnen', u'ík', u'gezagvol', u'jammerlijk',
       u'toelaatbaar', u'toevallig', u'volkomen', u'rustiek', u'identiek'],
      dtype='object', length=6245)


In [4]:
# get lemmas extracted using pattern from NLwikipedia

import glob
import codecs

input_dir = '/home/jvdzwaan/data/nlwiki-lemmatized/'
input_files = glob.glob('{}/**/wiki*'.format(input_dir))

vocabulary = {}

for input_file in input_files:
    with codecs.open(input_file, 'rb', encoding='utf-8') as f:
        text = f.read()
        words = text.split()
        for word in words:
            vocabulary[word] = None
print len(vocabulary)

2933969


In [7]:
# how many topic and opinion words are not found in the nlwikipedia lemmas?
no_match_topic = [x for x in topic_words if x not in vocabulary]

print len(no_match_topic)
#print no_match_topic

8668


In [8]:
no_match_opinion = [x for x in opinion_words if x not in vocabulary]

print len(no_match_opinion)
#print no_match_opinion

476


In [10]:
# how many topic words from the top 10 of all topics are not found in the nlwikipedia lemmas?
with codecs.open(config.get('outDir').format('top_10_topics_100.txt'), 'rb', encoding='utf-8') as f:
    text = f.read()
    words = text.split()
    print len(words)

no_match_top_10_topic_words = [x for x in words if x not in vocabulary]

print len(no_match_top_10_topic_words)
print no_match_top_10_topic_words

1000
1
[u'aowleeftijd']


In [20]:
print 'd' in topic_words
print 'd66' in topic_words
print 'd' in vocabulary
print 'd66' in vocabulary

print

print 'f' in topic_words
print 'f16' in topic_words
print 'f' in vocabulary
print 'f16' in vocabulary

print

print 'a' in topic_words
print 'a2' in topic_words
print 'a' in vocabulary
print 'a1' in vocabulary
print 'a2' in vocabulary
print 'a3' in vocabulary

True
False
True
True

True
False
True
True

True
False
True
True
True
True


In [25]:
# perhaps it is not frog that ate the numbers, but gensim
import gensim

# Tiny corpus of letter+digit tokens in both cases, to see whether gensim's
# Dictionary keeps such tokens intact.
texts = [['d66', 'D66'], ['F16', 'f16']]
dictionary = gensim.corpora.Dictionary(texts)
print dictionary

Dictionary(4 unique tokens: [u'f16', u'D66', u'F16', u'd66'])


Okay, the problem is that in the FoLiA files, numbers are separated from the letter part, and the number part is tagged as 'TW' (telwoord, i.e. numeral) rather than as a noun.

Example:

```xml
<folia:w xml:id="nl.proc.ob.d.h-ek-20112012-9-3.1.11.2.1.0.27">
   <folia:t>D</folia:t>
   <folia:lemma class="d"/>
   <folia:pos class="N"/>
</folia:w>
<folia:w xml:id="nl.proc.ob.d.h-ek-20112012-9-3.1.11.2.1.0.28">
   <folia:t>66</folia:t>
   <folia:lemma class="66"/>
   <folia:pos class="TW"/>
</folia:w>
```

Currently, frog does parse words like d66 correctly...

In [26]:
# Check whether the bare number part '66' is in the topic vocabulary.
print '66' in topic_words

False


Okee, wat doen we? Bij het parsen van de nieuwe data (manifestos en andere handmatig gecodeerde data), gebruiken we pattern. En als we een woord tegenkomen dat bestaat uit [letters][cijfers] vervangen we die door [letters].

Voor de topic coherence laten we het zo (ik zou d door d66 kunnen vervangen en f door f16, maar voor de a wordt het lastig). Ik kan ook d en f vervangen en a zo laten. Zou het veel uitmaken?

In [75]:
import re

# Compiled once instead of on every call. The captured prefix must end in a
# non-digit (\D), so a token that consists only of digits does not match and
# is left untouched instead of being cut down to its first digit.
_TRAILING_DIGITS = re.compile(r'^(.*?\D)\d+$', flags=re.UNICODE)


def remove_trailing_digits(word):
    """Strip a trailing run of digits from a word.

    Words of the form [letters][digits] are reduced to [letters], e.g.
    u'd66' -> u'd'. Only the final run of digits is removed, so
    u'a1b2' -> u'a1b'.

    Args:
        word: the token to normalise.

    Returns:
        The word without its trailing digits. Words that do not end in a
        digit, purely numeric words (e.g. u'66') and the empty string are
        returned unchanged.
    """
    m = _TRAILING_DIGITS.match(word)
    if m:
        return m.group(1)
    return word

tests = {u'd66': u'd',
         u'f16': u'f',
         u'é33': u'é'}

for i, o in tests.iteritems():
    r = remove_trailing_digits(i)
    if r != o:
        print 'error', i, o, r