In [14]:
import warnings
# Suppress DeprecationWarning and FutureWarning messages for the rest of the
# session so they do not clutter the cell outputs.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [15]:
import os
import os.path as op
import re
from glob import glob

import funcy as fp
import nltk
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
from nltk.corpus import stopwords
from spacy.lang.es import Spanish

# Root folder that the data and model paths below are built from.
MAIN_FOLDER = "."
# Glob pattern that selects the input text files.
DATA_PATHS = op.join(MAIN_FOLDER, "data/*.txt")
# Folder where the corpus, dictionary and fitted models are written.
MODELS_FOLDER = op.join(MAIN_FOLDER, "models")

# Used as `num_topics` for the LDA model and as `T` for the HDP model.
TOPICS_NUMBER = 10

# Spanish spaCy pipeline; `tokenize_line` calls it to split text into tokens.
nlp = Spanish()
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to the Spanish stop-word collection it returns, so `stopwords.words` is no
# longer reachable through this name after the assignment.
stopwords = stopwords.words('spanish')

In [16]:
def tokenize_line(line):
    """Turn one line of text into a set of lowercase, filtered tokens.

    Text enclosed in parentheses or square brackets is removed first, then the
    remainder is tokenized with the module-level `nlp` pipeline. A token is
    kept only if it does not contain 'm-ddhh-', is longer than two characters
    and is not one of the module-level `stopwords`.

    NOTE(review): because a set is returned, a word repeated within the line
    is kept only once and the token order is unspecified -- confirm that
    per-line deduplication is intended, since the tokens are later counted.

    Args:
        line: String holding one line of a document.

    Returns:
        A set of token strings.
    """
    without_brackets = re.sub(r"[\(\[].*?[\)\]]", "", line)
    excluded = set(stopwords)
    kept = set()
    for token in nlp(without_brackets):
        text = token.text.lower().removesuffix('\n')
        # Drop tokens containing the 'm-ddhh-' marker, short tokens and stop words.
        if 'm-ddhh-' in text or len(text) <= 2 or text in excluded:
            continue
        kept.add(text)
    return kept
    
def tokenize(lines, token_size_filter=2):
    """Tokenize every line and collect the sufficiently long tokens in one list.

    Args:
        lines: Iterable of line strings; each one is passed to `tokenize_line`.
        token_size_filter: Tokens whose length is not greater than this value
            are discarded.

    Returns:
        A flat list with the remaining tokens of all lines, in line order.
    """
    return [
        token
        for line in lines
        for token in tokenize_line(line)
        if len(token) > token_size_filter
    ]

def load_doc(filename):
    """Read one text file and return it as a record together with its tokens.

    Args:
        filename: Path of the text file to load.

    Returns:
        A dict with the keys:
            'group':  directory part of `filename`.
            'doc':    list of the file's lines as read (line endings kept).
            'tokens': result of `tokenize` applied to those lines.
            'id':     file name up to the first '.docx.txt'.
    """
    group, doc_id = op.split(filename)
    doc_id = doc_id.split('.docx.txt')[0]
    # Decode explicitly as UTF-8 instead of the platform default: with
    # errors='ignore', a wrong default encoding would silently drop or mangle
    # accented characters rather than raise.
    # NOTE(review): assumes the input files are UTF-8 encoded -- confirm.
    with open(filename, encoding='utf-8', errors='ignore') as f:
        doc = f.readlines()
    return {'group': group,
            'doc': doc,
            'tokens': tokenize(doc),
            'id': doc_id}

def nltk_stopwords():
    """Return NLTK's Spanish stop words as a set."""
    # Reached through `nltk.corpus` rather than the module-level `stopwords`
    # name, which is rebound to the word list elsewhere in this file.
    spanish_words = nltk.corpus.stopwords.words('spanish')
    return set(spanish_words)

def prep_corpus(docs, additional_stopwords=frozenset(), no_below=5, no_above=0.5):
    """Build a filtered gensim dictionary and the matching bag-of-words corpus.

    Args:
        docs: Iterable of token lists, one per document. It is iterated twice
            (once for the dictionary, once for the corpus), so it must not be
            a one-shot generator.
        additional_stopwords: Extra tokens to remove in addition to the
            Spanish stop words returned by `nltk_stopwords`.
        no_below: Forwarded to `Dictionary.filter_extremes`.
        no_above: Forwarded to `Dictionary.filter_extremes`.

    Returns:
        A `(dictionary, corpus)` tuple, where `corpus` is a list holding the
        `doc2bow` result for each document.
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    all_stopwords = nltk_stopwords().union(additional_stopwords)
    # Only stop words that occur in the documents have an id; skip the others
    # instead of handing `None` ids to filter_tokens.
    stopword_ids = [
        dictionary.token2id[word]
        for word in all_stopwords
        if word in dictionary.token2id
    ]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus

#### Show file paths

In [17]:
# glob() returns paths in arbitrary order; sort them so the listing is stable.
sorted(glob(DATA_PATHS))

['./data/17-pm-ppoo-07.wav_ok.docx.txt',
 './data/17-pm-ppoo-10_ok.docx.txt',
 './data/17-pm-ppoo-11_ok.docx.txt',
 './data/17-pm-ppoo-12.docx.txt',
 './data/17-pm-ppoo-13-José-Patricio-Olave-Martínez _ Transcrito por Agustin Hermosilla.docx.txt',
 './data/17-pm-ppoo-14.docx.txt',
 './data/17-pm-ppoo-15.docx.txt',
 './data/17-pm-ppoo-16. Audiencia BERNARDO COLIPAN.docx.txt',
 './data/17pm-ppoo-aymara_ok.docx.txt',
 './data/18-08-2021. Psicologos y psicologas de Salud de la Araucanía AG-Red de Peritajes Independientes Araucanía.docx.txt',
 './data/18-08-2021.- Instituto Nacional de la Lengua Mapuche .docx.txt',
 './data/18-08-2021.-_Sindicato_de_Trabajadores_Independientes_Nueva_Esperanza_del_fundo_Mundo.doc.docx.txt',
 './data/18-am-DDHH-11_ok.docx.txt',
 './data/18-am-DDHH-3-Kallfulikan-Lanco_ok.docx.txt',
 './data/18-am-DDHH-6.docx.txt',
 './data/18-am-DDHH-7.docx.txt',
 './data/18-am-DDHH-8.docx.txt',
 './data/19-08-2021.- Asoc. Indígena Consejo Territorial Quechua Pica_ok.docx.txt'

#### Load files and tokenize them

In [18]:
# Sort the paths first: glob() returns them in arbitrary order, and the file
# order determines the row order of `docs` and of everything built from it.
data = [load_doc(path) for path in sorted(glob(DATA_PATHS))]
# One row per document, indexed by (group, id).
docs = pd.DataFrame(data).set_index(['group','id'])

#### Create dictionary and bag of words corpus

In [19]:
# Build the dictionary and bag-of-words corpus from the per-document token
# lists, using prep_corpus's default stop-word and frequency filters.
dictionary, corpus = prep_corpus(docs['tokens'])

Building dictionary...
Building corpus...


#### Fit Latent Dirichlet Allocation (LDA) model

In [20]:
# Fix the random seed so that re-running the notebook on the same corpus
# reproduces the same topics.
lda = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=TOPICS_NUMBER,
    passes=10,
    random_state=42)

#### Save dictionary and lda model

In [21]:
# Make sure the output folder exists so the writes below do not fail on a
# fresh checkout.
os.makedirs(MODELS_FOLDER, exist_ok=True)
MmCorpus.serialize(op.join(MODELS_FOLDER, 'ddhh.mm'), corpus)
dictionary.save(op.join(MODELS_FOLDER, 'ddhh.dict'))
lda.save(op.join(MODELS_FOLDER, 'lda_ddhh.model'))

#### Visualize Latent Dirichlet Allocation (LDA) model

In [22]:
# Prepare the pyLDAvis data for the fitted LDA model and render it inline.
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)

#### Fit and save Hierarchical Dirichlet Process (HDP) model

In [23]:
# Fix the random seed so that re-running the notebook on the same corpus
# reproduces the same model.
hdp = models.hdpmodel.HdpModel(corpus, dictionary, T=TOPICS_NUMBER, random_state=42)
hdp.save(op.join(MODELS_FOLDER, 'hdp_ddhh.model'))

#### Visualize HDP model

In [24]:
# Prepare the pyLDAvis data for the fitted HDP model and render it inline.
# NOTE: this rebinds `vis_data`, replacing the LDA visualization data.
vis_data = gensimvis.prepare(hdp, corpus, dictionary)
pyLDAvis.display(vis_data)