In [39]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Try to Load a Corpus with NLTK

NLTK has a CorpusReader for XML documents, but it seems fairly limited as it only stores the words.
It is possible to retrieve the tags of interest like 'sentences' via a CorpusView. Getting the sentences with CorpusView returns an ElementTree list, which is documented in the Python docs.

In [138]:
from nltk.corpus.reader.xmldocs import XMLCorpusView
from nltk.corpus.reader.xmldocs import XMLCorpusReader

class CorpusReader(XMLCorpusReader):
    def __init__(self, root, fileid):
        self.path = root+fileid
        XMLCorpusReader.__init__(self, root, fileid)
        
    def sentences(self, raw=False):
        '''Returns a list of ElementTree objects (see Python docs)
        If raw is true then return a list of sentences where each sentence is a list of words
        '''
        if raw:
            sents = XMLCorpusView(self.path, '.*/sentence')
            sent_list = list()
            for sentence in sents:
                word_list = [word.text for word in sentence]
                sent_list.append(word_list)
            return sent_list
        else:
            return XMLCorpusView(self.path, '.*/sentence')

    def tagged_words(self, lemmatize=False):
        words = XMLCorpusView(self.path, '.*/w')
        if lemmatize:
            word_tags = [ (word.text, 
                           word.attrib['pos'], 
                           word.attrib['lemma'].replace("|", ""))
                         for word in words ]            
        else:
            word_tags = [ (word.text, word.attrib['pos']) for word in words ]
        return word_tags        

In [99]:
fam_corpus = CorpusReader('../data/', r'familjeliv-sex25.xml')
fam_tagged_words = fam_corpus.tagged_words()[:10]

In [100]:
fam_tagged_words

[('Ofta', 'AB'),
 ('hoppar', 'VB'),
 ('jag', 'PN'),
 ('bara', 'AB'),
 ('iväg', 'PL'),
 ('till', 'PP'),
 ('toan', 'NN'),
 ('och', 'KN'),
 ('torkar', 'VB'),
 ('mig', 'PN')]

In [135]:
fam_corpus = CorpusReader('../data/', r'familjeliv-sex25.xml')
fam_tagged_lemmas = fam_corpus.tagged_words(lemmatize=True)[:50]

lemmatizing


In [136]:
fam_tagged_lemmas

[('Ofta', 'AB', 'ofta'),
 ('hoppar', 'VB', 'hoppa'),
 ('jag', 'PN', 'jag'),
 ('bara', 'AB', 'bara'),
 ('iväg', 'PL', 'iväg'),
 ('till', 'PP', 'till'),
 ('toan', 'NN', 'toa'),
 ('och', 'KN', 'och'),
 ('torkar', 'VB', 'torka'),
 ('mig', 'PN', 'jag'),
 ('och', 'KN', 'och'),
 ('kissar', 'NN', 'kisse'),
 ('.', 'MAD', ''),
 ('mår', 'VB', 'må'),
 ('verkligen', 'AB', 'verkligen'),
 ('kroppen', 'NN', 'kropp'),
 ('bra', 'JJ', 'bra'),
 ('av', 'PP', 'av'),
 ('att', 'IE', 'att'),
 ('få', 'VB', 'få'),
 ('kiss', 'NN', 'kiss'),
 ('i', 'PP', 'i'),
 ('fel', 'JJ', 'fel'),
 ('hål', 'NN', 'hål'),
 ('så', 'AB', 'så'),
 ('att', 'IE', 'att'),
 ('säjja', 'VB', ''),
 ('...', 'MAD', ''),
 ('?', 'MAD', ''),
 ('?', 'MAD', ''),
 ('Så', 'AB', 'såså länge'),
 ('länge', 'AB', 'längeså länge:01'),
 ('man', 'PN', 'man'),
 ('inte', 'AB', 'inte'),
 ('gör', 'VB', 'göra'),
 ('nåt', 'PN', 'någon'),
 ('som', 'HP', ''),
 ('skadar', 'VB', 'skada'),
 ('nån', 'DT', 'någon'),
 ('annan', 'JJ', 'annan'),
 (',', 'MID', ''),
 ('är', '

In [92]:
fam_corpus = CorpusReader('../data/', r'familjeliv-sex25.xml')
fam_sents = fam_corpus.sentences(raw=True)[:2]

In [93]:
fam_sents

[['Ofta',
  'hoppar',
  'jag',
  'bara',
  'iväg',
  'till',
  'toan',
  'och',
  'torkar',
  'mig',
  'och',
  'kissar',
  '.'],
 ['mår',
  'verkligen',
  'kroppen',
  'bra',
  'av',
  'att',
  'få',
  'kiss',
  'i',
  'fel',
  'hål',
  'så',
  'att',
  'säjja',
  '...',
  '?',
  '?']]

In [69]:
for sent in fam_sents:
    sent_list = list()
    for word in sent:
        sent_list.append(word.text)
    print(sent_list)

['Ofta', 'hoppar', 'jag', 'bara', 'iväg', 'till', 'toan', 'och', 'torkar', 'mig', 'och', 'kissar', '.']
['mår', 'verkligen', 'kroppen', 'bra', 'av', 'att', 'få', 'kiss', 'i', 'fel', 'hål', 'så', 'att', 'säjja', '...', '?', '?']
['Så', 'länge', 'man', 'inte', 'gör', 'nåt', 'som', 'skadar', 'nån', 'annan', ',', 'är', 'väl', 'allt', 'okej', '?']
['Dit', 'hade', 'vi', 'aldrig', 'nått', 'om', 'jag', 'hade', 'varit', 'tyst', 'och', 'gillat', 'läget', 'när', 'jag', 'inte', 'gillade', 'läget', '.']
['Första', 'och', 'tredje', 'gången', 'hade', 'jag', 'linne', 'och', 'mjukisbyxor', 'på', 'mig', '..']


## Create Trigrams with NLTK.util.ngrams
Something to note here is that periods are included in the ngrams, it would be nice to remove them. The ngrams function returns an iterator, so we put it in a list() to get at the actual ngrams.

In [7]:
from nltk.util import ngrams
ng = list(ngrams(fam_words[:25], 3))
ng

[('Ofta', 'hoppar', 'jag'),
 ('hoppar', 'jag', 'bara'),
 ('jag', 'bara', 'iväg'),
 ('bara', 'iväg', 'till'),
 ('iväg', 'till', 'toan'),
 ('till', 'toan', 'och'),
 ('toan', 'och', 'torkar'),
 ('och', 'torkar', 'mig'),
 ('torkar', 'mig', 'och'),
 ('mig', 'och', 'kissar'),
 ('och', 'kissar', '.'),
 ('kissar', '.', 'mår'),
 ('.', 'mår', 'verkligen'),
 ('mår', 'verkligen', 'kroppen'),
 ('verkligen', 'kroppen', 'bra'),
 ('kroppen', 'bra', 'av'),
 ('bra', 'av', 'att'),
 ('av', 'att', 'få'),
 ('att', 'få', 'kiss'),
 ('få', 'kiss', 'i'),
 ('kiss', 'i', 'fel'),
 ('i', 'fel', 'hål'),
 ('fel', 'hål', 'så')]