#Do LDA in Gensim using nouns from Dutch FoLiA data

In [1]:
from lxml import etree
import gzip
import os
import logging

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

class DilipadFolia(object):
    """Iterable over the nouns of the gzipped FoLiA documents in a directory.

    Each pass over an instance walks every ``*.xml.gz`` file in ``dir_name``
    (in ``os.listdir`` order) and yields, per file, the list of lower-cased
    tokens whose part-of-speech class is ``'N'``. The object can be iterated
    more than once; every pass re-reads the files from disk.
    """

    # Fully qualified FoLiA tags: word, part-of-speech annotation, token text.
    WORD_TAG = '{http://ilk.uvt.nl/FoLiA}w'
    POS_TAG = '{http://ilk.uvt.nl/FoLiA}pos'
    T_TAG = '{http://ilk.uvt.nl/FoLiA}t'

    def __init__(self, dir_name):
        """Remember the directory that holds the ``*.xml.gz`` documents."""
        self.dir_name = dir_name

        # Progress is logged for every NUMBER-th document.
        self.NUMBER = 100

    @classmethod
    def _collect_nouns(cls, events):
        """Return the lower-cased nouns found in a stream of word elements.

        ``events`` is an iterable of ``(event, element)`` pairs such as the
        one produced by ``etree.iterparse``. Words that lack a part-of-speech
        annotation or token text are skipped instead of raising
        ``AttributeError``.
        """
        nouns = []
        for _event, elem in events:
            pos = elem.find(cls.POS_TAG)
            if pos is not None and pos.attrib.get('class') == 'N':
                token = elem.find(cls.T_TAG)
                if token is not None and token.text is not None:
                    nouns.append(token.text.lower())
            # The word has been handled; drop its content to limit the
            # memory held by the partially built tree of a large document.
            elem.clear()
        return nouns

    def __iter__(self):
        """Yield one list of nouns per ``*.xml.gz`` file in ``dir_name``."""
        data_files = [d for d in os.listdir(self.dir_name) if d.endswith('.xml.gz')]
        logging.info('Extracting nouns from {} documents.'.format(len(data_files)))
        for i, data_file in enumerate(data_files):
            xml_file = os.path.join(self.dir_name, data_file)
            log_progress = i % self.NUMBER == 0
            if log_progress:
                logging.info('Extracting nouns from document #{} (\'{}\')'.format(i, xml_file))
            f = gzip.open(xml_file)
            try:
                context = etree.iterparse(f, events=('end',), tag=self.WORD_TAG, huge_tree=True)
                nouns = self._collect_nouns(context)
                del context
            finally:
                # Close the archive even when parsing fails half-way.
                f.close()
            if log_progress:
                logging.info('Extracted {} nouns'.format(len(nouns)))
            yield nouns

In [3]:
# Directory that holds the gzipped FoLiA documents (*.xml.gz).
data_dir = '/home/jvdzwaan/data/dilipad'

# Iterable that yields one list of nouns per document in data_dir.
data = DilipadFolia(data_dir)

In [4]:
# save topic words (nouns) and opinion words (adjectives, verbs, adverbs) to text files
# to do: distinguish perspectives
import codecs

def save_topic_and_opinion_words(data_dir='/home/jvdzwaan/data/dilipad', text_dir=None):
    """Write the topic and opinion words of every FoLiA document to text files.

    For each ``*.xml.gz`` file in ``data_dir`` two UTF-8 files are written to
    ``text_dir``: ``<file>-topic_words.txt`` with the lower-cased nouns
    (part-of-speech class ``N``) and ``<file>-opinion_words.txt`` with the
    lower-cased words of class ``ADJ``, ``WW`` or ``BW``, one word per line.

    Parameters:
        data_dir: directory containing the gzipped FoLiA documents.
        text_dir: output directory; defaults to ``<data_dir>/txt-topic-opinion``
            and is created when it does not exist yet.
    """
    NUMBER = 100  # log progress for every NUMBER-th document

    if text_dir is None:
        text_dir = '{}/txt-topic-opinion'.format(data_dir)
    if not os.path.isdir(text_dir):
        os.makedirs(text_dir)

    word_tag = '{http://ilk.uvt.nl/FoLiA}w'
    pos_tag = '{http://ilk.uvt.nl/FoLiA}pos'
    t_tag = '{http://ilk.uvt.nl/FoLiA}t'
    opinion_classes = ('ADJ', 'WW', 'BW')

    data_files = [d for d in os.listdir(data_dir) if d.endswith('.xml.gz')]
    logging.info('Extracting words from {} documents.'.format(len(data_files)))
    for i, data_file in enumerate(data_files):
        topic_words = []
        opinion_words = []

        xml_file = os.path.join(data_dir, data_file)
        log_progress = i % NUMBER == 0
        if log_progress:
            logging.info('Extracting words from document #{} (\'{}\')'.format(i, xml_file))
        f = gzip.open(xml_file)
        try:
            context = etree.iterparse(f, events=('end',), tag=word_tag, huge_tree=True)
            for _event, elem in context:
                pos_elem = elem.find(pos_tag)
                token = elem.find(t_tag)
                # Skip words without annotation or text instead of crashing.
                if pos_elem is not None and token is not None and token.text is not None:
                    pos = pos_elem.attrib.get('class')
                    if pos == 'N':
                        topic_words.append(token.text.lower())
                    elif pos in opinion_classes:
                        opinion_words.append(token.text.lower())
                # Drop the handled word to limit memory use on large files.
                elem.clear()
            del context
        finally:
            # Close the archive even when parsing fails half-way.
            f.close()
        if log_progress:
            logging.info('Extracted {} words'.format(len(topic_words)+len(opinion_words)))

        topic_path = os.path.join(text_dir, '{}-topic_words.txt'.format(data_file))
        with codecs.open(topic_path, 'wb', 'utf8') as out:
            out.write('\n'.join(topic_words))
        opinion_path = os.path.join(text_dir, '{}-opinion_words.txt'.format(data_file))
        with codecs.open(opinion_path, 'wb', 'utf8') as out:
            out.write('\n'.join(opinion_words))
                    
%time save_topic_and_opinion_words()

INFO:root:Extracting nouns from 1353 documents.
INFO:root:Extracting words from document #0 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-877-907.xml.gz')
INFO:root:Extracted 16273 words
INFO:root:Extracting words from document #100 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-6553-6569.xml.gz')
INFO:root:Extracted 8921 words
INFO:root:Extracting words from document #200 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-1634-1644.xml.gz')
INFO:root:Extracted 4778 words
INFO:root:Extracting words from document #300 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-773-773-1.xml.gz')
INFO:root:Extracted 17 words
INFO:root:Extracting words from document #400 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-1467-1467-1.xml.gz')
INFO:root:Extracted 52 words
INFO:root:Extracting words from document #500 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-4973-5002.xml.gz')
INFO:root:Extracted 14780 words
INFO:root:Extracting words from docume

CPU times: user 27min 17s, sys: 9.29 s, total: 27min 26s
Wall time: 27min 30s


In [6]:
import sys

def save_topic_and_opinion_words_perspectives(data_dir='/home/jvdzwaan/data/dilipad'):
    """Print the speaker and party of every speech in the first document.

    Exploratory probe for the perspective metadata: each ``speech`` element
    of a ``*.xml.gz`` file in ``data_dir`` is printed as ``<speaker> <party>``.
    The function deliberately stops the interpreter with ``sys.exit()`` once
    the first document has been printed, so it raises ``SystemExit`` whenever
    at least one document is present.

    Parameters:
        data_dir: directory containing the gzipped documents.
    """
    NUMBER = 5  # log (and stop) on every NUMBER-th document, i.e. document 0

    speech_tag = '{http://www.politicalmashup.nl}speech'
    speaker_tag = '{http://www.politicalmashup.nl}speaker'
    party_tag = '{http://www.politicalmashup.nl}party'

    data_files = [d for d in os.listdir(data_dir) if d.endswith('.xml.gz')]
    logging.info('Extracting words from {} documents.'.format(len(data_files)))
    for i, data_file in enumerate(data_files):
        xml_file = os.path.join(data_dir, data_file)
        if i % NUMBER == 0:
            logging.info('Extracting words from document #{} (\'{}\')'.format(i, xml_file))
        f = gzip.open(xml_file)
        try:
            context = etree.iterparse(f, events=('end',), tag=speech_tag, huge_tree=True)
            for _event, elem in context:
                # Single-argument print keeps the output identical on
                # Python 2 and Python 3.
                print(u'{} {}'.format(elem.attrib.get(speaker_tag), elem.attrib.get(party_tag)))
            del context
        finally:
            # Release the archive before the interpreter is asked to exit.
            f.close()
        if i % NUMBER == 0:
            sys.exit()
                    
%time save_topic_and_opinion_words_perspectives()

INFO:root:Extracting words from 1353 documents.
INFO:root:Extracting words from document #0 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-877-907.xml.gz')


Pastoor CDA
Van Gennip CDA
Ketting VVD
De Beer VVD
Pitstra GroenLinks
Van der Lans GroenLinks
Meindertsma PvdA
Rabbinge PvdA
Van Gennip CDA
Rabbinge PvdA
Van der Lans GroenLinks
Rabbinge PvdA
Van der Lans GroenLinks
Rabbinge PvdA
Van Bruchem RPF/GPV
De Beer VVD
Van Bruchem RPF/GPV
Hessing D66
Bierman OSF
De Beer VVD
Bierman OSF


SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.


If iterparse chokes on the XML (error message: `XMLSyntaxError: ID XXXXXXXXXXX already defined`), restart the notebook.

In [9]:
import gensim

%time id2word_folia = gensim.corpora.Dictionary(data)

INFO:root:Extracting nouns from 1353 documents.
INFO:root:Extracting nouns from document #0 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-877-907.xml.gz')
INFO:root:Extracted 6268 nouns
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:root:Extracting nouns from document #100 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-6553-6569.xml.gz')
INFO:root:Extracted 3181 nouns
INFO:root:Extracting nouns from document #200 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-1634-1644.xml.gz')
INFO:root:Extracted 1570 nouns
INFO:root:Extracting nouns from document #300 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-773-773-1.xml.gz')
INFO:root:Extracted 13 nouns
INFO:root:Extracting nouns from document #400 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-1467-1467-1.xml.gz')
INFO:root:Extracted 41 nouns
INFO:root:Extracting nouns from document #500 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-4973-

CPU times: user 26min 22s, sys: 1.1 s, total: 26min 23s
Wall time: 26min 22s


In [11]:
# save dictionary to disk
id2word_folia.save('/home/jvdzwaan/data/dilipad/gensim/folia.dictionary')

INFO:gensim.utils:saving Dictionary object under /home/jvdzwaan/data/dilipad/gensim/folia.dictionary, separately None


In [4]:
import gensim
id2word_folia = gensim.corpora.Dictionary.load('/home/jvdzwaan/data/dilipad/gensim/folia.dictionary')

In [24]:
import itertools

class FoliaCorpus:
    """Bag-of-words view of the FoLiA documents in a directory.

    Wraps a ``DilipadFolia`` noun stream and converts every document with
    the ``doc2bow`` method of the supplied dictionary.
    """

    def __init__(self, dir_name, dictionary):
        """Store the dictionary and set up the noun stream for ``dir_name``."""
        self.dir_name = dir_name
        self.dictionary = dictionary
        self.df = DilipadFolia(dir_name)

    def __iter__(self):
        """Yield ``dictionary.doc2bow(nouns)`` for each document in turn."""
        for noun_list in self.df:
            bow = self.dictionary.doc2bow(noun_list)
            yield bow

In [25]:
folia_corpus = FoliaCorpus(data_dir, id2word_folia)

In [26]:
%time gensim.corpora.MmCorpus.serialize('/home/jvdzwaan/data/dilipad/gensim/folia_bow.mm', folia_corpus)

INFO:gensim.corpora.mmcorpus:storing corpus in Matrix Market format to /home/jvdzwaan/data/dilipad/gensim/folia_bow.mm
INFO:gensim.matutils:saving sparse matrix to /home/jvdzwaan/data/dilipad/gensim/folia_bow.mm
INFO:root:Extracting nouns from 1353 documents.
INFO:root:Extracting nouns from document #0 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-877-907.xml.gz')
INFO:root:Extracted 6268 nouns
INFO:gensim.matutils:PROGRESS: saving document #0
INFO:root:Extracting nouns from document #100 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-6553-6569.xml.gz')
INFO:root:Extracted 3181 nouns
INFO:root:Extracting nouns from document #200 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-1634-1644.xml.gz')
INFO:root:Extracted 1570 nouns
INFO:root:Extracting nouns from document #300 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-773-773-1.xml.gz')
INFO:root:Extracted 13 nouns
INFO:root:Extracting nouns from document #400 ('/home/jvdzwaan/data/dilipad/nl.proc.

CPU times: user 27min 48s, sys: 1.89 s, total: 27min 50s
Wall time: 27min 49s


In [2]:
import gensim

folia_corpus = gensim.corpora.mmcorpus.MmCorpus('/home/jvdzwaan/data/dilipad/gensim/folia_bow.mm')

In [7]:
import logging

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [11]:
# Train a 100-topic LDA model on the bag-of-words corpus loaded from disk.
%time lda = gensim.models.LdaModel(corpus=folia_corpus, num_topics=100, id2word=id2word_folia, iterations=10000)

INFO:gensim.models.ldamodel:using symmetric alpha at 0.01
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online LDA training, 100 topics, 1 passes over the supplied corpus of 1353 documents, updating model once every 1353 documents, evaluating perplexity every 1353 documents, iterating 10000x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:-23.652 per-word bound, 13178734.0 perplexity estimate based on a held-out corpus of 1353 documents with 1492514 words
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #1353/1353
INFO:gensim.models.ldamodel:topic #23 (0.010): 0.021*wetsvoorstel + 0.013*heer + 0.010*orde + 0.010*ophouding + 0.009*vraag + 0.009*minister + 0.009*artikel + 0.008*wet + 0.007*voorzitter + 0.007*uur
INFO:gensim.models.ldamodel:topic #36 (0.010): 0.000*versmallingen + 0.000*grensgebieden + 0.000*gehandicaptenvervoer + 0.000*gebiedje + 0.000*functiebeperking + 0.000*exploitatiesubsidies + 0.0

CPU times: user 7min 46s, sys: 10min 26s, total: 18min 12s
Wall time: 4min 48s


In [12]:
# Do LDA based on text data
# This is much faster, so the time is spent walking through the gzipped XML files
import gensim

text_data_dir = '/home/jvdzwaan/data/dilipad/txt-sample'

class Nouns:
    """Iterable over the nouns stored in the ``*.txt`` files of a directory.

    Each pass yields, per ``.txt`` file in ``dir_name`` (in ``os.listdir``
    order), the whitespace-separated tokens of that file decoded as UTF-8.
    """

    def __init__(self, dir_name):
        """Remember the directory that holds the text files."""
        self.dir_name = dir_name

        # Progress is logged for every NUMBER-th document.
        self.NUMBER = 100

    def __iter__(self):
        """Yield one list of tokens per ``.txt`` file in ``dir_name``."""
        data_files = [d for d in os.listdir(self.dir_name) if d.endswith('.txt')]
        logging.info('Found {} documents.'.format(len(data_files)))
        for i, data_file in enumerate(data_files):
            txt_file = os.path.join(self.dir_name, data_file)
            if i % self.NUMBER == 0:
                logging.info('Reading document #{} (\'{}\')'.format(i, txt_file))
            with codecs.open(txt_file, 'rb', 'utf8') as f:
                nouns = f.read().split()
            # Yield only after the file is closed, so no handle stays open
            # while the consumer works or if it abandons the iteration.
            yield nouns

class TextCorpus:
    """Bag-of-words view of the plain-text noun files in a directory.

    Wraps a ``Nouns`` stream and converts every document with the
    ``doc2bow`` method of the supplied dictionary.
    """

    def __init__(self, dir_name, dictionary):
        """Store the dictionary and set up the noun stream for ``dir_name``."""
        self.dir_name = dir_name
        self.dictionary = dictionary
        self.nouns = Nouns(dir_name)

        self.NUMBER = 100

    def __iter__(self):
        """Yield ``dictionary.doc2bow(tokens)`` for each document in turn."""
        for token_list in self.nouns:
            bow = self.dictionary.doc2bow(token_list)
            yield bow
                
# Stream of noun lists read from the plain-text sample directory.
nouns = Nouns(text_data_dir)

print 'Create dictionary'
# Build the gensim dictionary from the noun stream.
%time id2word = gensim.corpora.Dictionary(nouns)
# Bag-of-words view of the same documents, using that dictionary.
corpus = TextCorpus(text_data_dir, id2word)
print 'Do LDA'
# Batch LDA (update_every=0): 10 topics, 5 passes over the corpus.
% time lda = gensim.models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, update_every=0, passes=5) 


INFO:root:Found 200 documents.
INFO:root:Reading document #0 ('/home/jvdzwaan/data/dilipad/txt-sample/nl.proc.ob.d.h-ek-19992000-951-951.xml.gz.txt')
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:root:Reading document #100 ('/home/jvdzwaan/data/dilipad/txt-sample/nl.proc.ob.d.h-ek-19992000-179-198.xml.gz.txt')
INFO:gensim.corpora.dictionary:built Dictionary(24850 unique tokens: [u'gastvrouw', u'baangarantie', u'ontwerprichtlijn', u'verdubbeling', u'wisselvalligheden']...) from 200 documents (total 296237 corpus positions)


Create dictionary
CPU times: user 480 ms, sys: 11.4 ms, total: 491 ms

INFO:gensim.models.ldamodel:using symmetric alpha at 0.1
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:root:Found 200 documents.
INFO:root:Reading document #0 ('/home/jvdzwaan/data/dilipad/txt-sample/nl.proc.ob.d.h-ek-19992000-951-951.xml.gz.txt')
INFO:root:Reading document #100 ('/home/jvdzwaan/data/dilipad/txt-sample/nl.proc.ob.d.h-ek-19992000-179-198.xml.gz.txt')
INFO:gensim.models.ldamodel:running batch LDA training, 10 topics, 5 passes over the supplied corpus of 200 documents, updating model once every 200 documents, evaluating perplexity every 200 documents, iterating 50x with a convergence threshold of 0.001000
INFO:root:Found 200 documents.
INFO:root:Reading document #0 ('/home/jvdzwaan/data/dilipad/txt-sample/nl.proc.ob.d.h-ek-19992000-951-951.xml.gz.txt')
INFO:root:Reading document #100 ('/home/jvdzwaan/data/dilipad/txt-sample/nl.proc.ob.d.h-ek-19992000-179-198.xml.gz.txt')
INFO:gensim.models.ldamodel:-11.588 per-word bound, 3078.2 perplexity estimat


Wall time: 502 ms
Do LDA
CPU times: user 47.1 s, sys: 1min 40s, total: 2min 27s
Wall time: 42.4 s
