#Do LDA in Gensim using nouns from Dutch FoLiA data

In [2]:
from lxml import etree
import gzip
import os
import logging

# Emit INFO-level messages in a compact "LEVEL : message" layout.
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')
# IPython sometimes messes up the logging setup, so force the root logger
# back to INFO explicitly.
logging.getLogger().setLevel(logging.INFO)

class DilipadFolia(object):
    """Stream the nouns of every gzipped FoLiA document in a directory.

    Iterating over an instance yields one list per ``*.xml.gz`` file found
    in ``dir_name``. Each list holds the lower-cased text of every FoLiA
    word element whose part-of-speech annotation has class ``'N'``. The
    files are re-read on every iteration, so an instance can be iterated
    more than once.
    """

    # Fully qualified (namespaced) FoLiA element names.
    _WORD_TAG = '{http://ilk.uvt.nl/FoLiA}w'
    _POS_TAG = '{http://ilk.uvt.nl/FoLiA}pos'
    _T_TAG = '{http://ilk.uvt.nl/FoLiA}t'

    def __init__(self, dir_name):
        """Store the location of the data.

        Args:
            dir_name: directory that contains the ``*.xml.gz`` files.
        """
        self.dir_name = dir_name

        # Progress is logged once every NUMBER documents.
        self.NUMBER = 100

    def _extract_nouns(self, xml_file):
        """Return the lower-cased nouns found in one gzipped FoLiA file.

        Args:
            xml_file: path of the ``.xml.gz`` file to parse.

        Returns:
            List of noun strings in document order.
        """
        nouns = []
        # The context manager closes the file even when parsing raises.
        with gzip.open(xml_file) as f:
            context = etree.iterparse(f, events=('end',), tag=self._WORD_TAG,
                                      huge_tree=True)
            for _event, elem in context:
                pos = elem.find(self._POS_TAG)
                # Words without a pos annotation are skipped, not fatal.
                if pos is not None and pos.attrib.get('class') == 'N':
                    text = elem.find(self._T_TAG)
                    # Likewise skip words that carry no text content.
                    if text is not None and text.text is not None:
                        nouns.append(text.text.lower())
                # Drop the children of the finished word element so the
                # partially built tree does not keep growing in memory.
                elem.clear()
            del context
        return nouns

    def __iter__(self):
        """Yield one list of nouns per ``*.xml.gz`` file in ``dir_name``.

        Files are visited in the order returned by ``os.listdir``.
        """
        data_files = [d for d in os.listdir(self.dir_name) if d.endswith('.xml.gz')]
        logging.info('Extracting nouns from {} documents.'.format(len(data_files)))
        for i, data_file in enumerate(data_files):
            xml_file = os.path.join(self.dir_name, data_file)
            log_progress = i % self.NUMBER == 0
            if log_progress:
                logging.info('Extracting nouns from document #{} (\'{}\')'.format(i, xml_file))
            nouns = self._extract_nouns(xml_file)
            if log_progress:
                logging.info('Extracted {} nouns'.format(len(nouns)))
            # The file is already closed here, so a consumer that stops
            # iterating early does not leave a handle open.
            yield nouns

In [9]:
# Directory that is scanned for the '*.xml.gz' input files.
data_dir = '/home/jvdzwaan/data/dilipad'

# Creating the object only stores the path; files are read when `data` is
# iterated.
data = DilipadFolia(data_dir)

If `iterparse` fails on the XML with the error `XMLSyntaxError: ID XXXXXXXXXXX already defined`, restart the notebook and run the cells again.

In [9]:
import gensim

# Build the gensim Dictionary from the noun lists of all documents;
# %time reports how long this pass over the data takes.
%time id2word_folia = gensim.corpora.Dictionary(data)

INFO:root:Extracting nouns from 1353 documents.
INFO:root:Extracting nouns from document #0 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-877-907.xml.gz')
INFO:root:Extracted 6268 nouns
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:root:Extracting nouns from document #100 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-6553-6569.xml.gz')
INFO:root:Extracted 3181 nouns
INFO:root:Extracting nouns from document #200 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-1634-1644.xml.gz')
INFO:root:Extracted 1570 nouns
INFO:root:Extracting nouns from document #300 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-773-773-1.xml.gz')
INFO:root:Extracted 13 nouns
INFO:root:Extracting nouns from document #400 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-1467-1467-1.xml.gz')
INFO:root:Extracted 41 nouns
INFO:root:Extracting nouns from document #500 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-4973-

CPU times: user 26min 22s, sys: 1.1 s, total: 26min 23s
Wall time: 26min 22s


In [11]:
# Save the dictionary to disk so it can be loaded later instead of being
# rebuilt from the XML files.
id2word_folia.save('/home/jvdzwaan/data/dilipad/gensim/folia.dictionary')

INFO:gensim.utils:saving Dictionary object under /home/jvdzwaan/data/dilipad/gensim/folia.dictionary, separately None


In [5]:
import gensim
# Load a previously saved dictionary from disk.
id2word_folia = gensim.corpora.Dictionary.load('/home/jvdzwaan/data/dilipad/gensim/folia.dictionary')

INFO:gensim.utils:loading Dictionary object from /home/jvdzwaan/data/dilipad/gensim/folia.dictionary


In [24]:
import itertools

class FoliaCorpus(object):
    """Bag-of-words view of the FoLiA documents in a directory.

    Iterating over an instance yields, for every noun list produced by
    ``DilipadFolia(dir_name)``, the result of ``dictionary.doc2bow`` for
    that list. Documents are converted one at a time, and the object can
    be iterated repeatedly.
    """

    def __init__(self, dir_name, dictionary):
        """Set up the corpus.

        Args:
            dir_name: directory passed on to ``DilipadFolia``.
            dictionary: object providing ``doc2bow(list_of_tokens)``.
        """
        self.dir_name = dir_name
        self.dictionary = dictionary
        self.df = DilipadFolia(dir_name)

    def __iter__(self):
        """Yield ``dictionary.doc2bow(doc)`` for every document in ``df``."""
        for doc in self.df:
            yield self.dictionary.doc2bow(doc)

In [25]:
# Corpus that converts each document's nouns to bag-of-words when iterated.
folia_corpus = FoliaCorpus(data_dir, id2word_folia)

In [26]:
# Write the bag-of-words corpus to disk in Matrix Market format; this
# iterates over the complete corpus once. %time reports the duration.
%time gensim.corpora.MmCorpus.serialize('/home/jvdzwaan/data/dilipad/gensim/folia_bow.mm', folia_corpus)

INFO:gensim.corpora.mmcorpus:storing corpus in Matrix Market format to /home/jvdzwaan/data/dilipad/gensim/folia_bow.mm
INFO:gensim.matutils:saving sparse matrix to /home/jvdzwaan/data/dilipad/gensim/folia_bow.mm
INFO:root:Extracting nouns from 1353 documents.
INFO:root:Extracting nouns from document #0 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-877-907.xml.gz')
INFO:root:Extracted 6268 nouns
INFO:gensim.matutils:PROGRESS: saving document #0
INFO:root:Extracting nouns from document #100 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-6553-6569.xml.gz')
INFO:root:Extracted 3181 nouns
INFO:root:Extracting nouns from document #200 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-1634-1644.xml.gz')
INFO:root:Extracted 1570 nouns
INFO:root:Extracting nouns from document #300 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-773-773-1.xml.gz')
INFO:root:Extracted 13 nouns
INFO:root:Extracting nouns from document #400 ('/home/jvdzwaan/data/dilipad/nl.proc.

CPU times: user 27min 48s, sys: 1.89 s, total: 27min 50s
Wall time: 27min 49s


In [28]:
# Print the bag-of-words vector of every document. Iterating re-parses the
# XML files, so this walks the whole corpus unless it is interrupted.
for doc in folia_corpus:
    # The call form works on both Python 2 and Python 3.
    print(doc)

INFO:root:Extracting nouns from 1353 documents.
INFO:root:Extracting nouns from document #0 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-ek-19992000-877-907.xml.gz')
INFO:root:Extracted 6268 nouns


[(0, 1), (1, 1), (2, 1), (3, 29), (4, 2), (5, 1), (6, 1), (7, 5), (8, 3), (9, 2), (10, 1), (11, 8), (12, 4), (13, 2), (14, 1), (15, 1), (16, 26), (17, 5), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 13), (27, 3), (28, 1), (29, 2), (30, 1), (31, 1), (32, 1), (33, 1), (34, 4), (35, 1), (36, 2), (37, 2), (38, 1), (39, 5), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 1), (51, 1), (52, 11), (53, 2), (54, 4), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 10), (61, 1), (62, 1), (63, 1), (64, 8), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 2), (75, 1), (76, 5), (77, 1), (78, 1), (79, 1), (80, 5), (81, 1), (82, 6), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 1), (91, 1), (92, 2), (93, 1), (94, 2), (95, 3), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 8), (102, 1), (103, 1), (104, 4), (105, 5), (106, 1), (107, 4), (108, 2), (109, 1), (110

INFO:root:Extracting nouns from document #100 ('/home/jvdzwaan/data/dilipad/nl.proc.ob.d.h-tk-19992000-6553-6569.xml.gz')
INFO:root:Extracted 3181 nouns



[(1, 1), (3, 10), (9, 7), (12, 2), (16, 29), (27, 2), (37, 1), (38, 1), (42, 4), (54, 2), (74, 4), (77, 1), (82, 1), (101, 3), (104, 3), (105, 2), (129, 3), (147, 6), (148, 4), (150, 1), (152, 2), (154, 3), (158, 14), (160, 2), (162, 1), (164, 4), (168, 17), (170, 2), (173, 1), (181, 9), (195, 2), (201, 1), (218, 3), (219, 1), (222, 1), (224, 5), (227, 2), (232, 1), (233, 1), (244, 3), (246, 1), (250, 1), (252, 1), (253, 1), (254, 6), (258, 2), (259, 3), (260, 4), (261, 2), (267, 2), (270, 2), (278, 1), (281, 2), (282, 2), (303, 1), (327, 8), (332, 1), (343, 14), (344, 16), (351, 6), (357, 2), (359, 1), (368, 4), (371, 3), (372, 5), (383, 1), (385, 3), (386, 1), (392, 4), (394, 1), (395, 1), (400, 1), (401, 2), (407, 1), (426, 2), (430, 10), (431, 1), (436, 5), (440, 1), (446, 5), (448, 5), (453, 2), (454, 1), (462, 6), (465, 8), (470, 3), (478, 1), (495, 2), (497, 5), (500, 2), (519, 1), (523, 7), (524, 3), (526, 12), (553, 3), (557, 1), (561, 1), (562, 1), (569, 4), (570, 3), (579, 

KeyboardInterrupt: 