<a href="https://colab.research.google.com/github/jbrown544/interlingual-topic-modeling/blob/main/Interlingual_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised Thematic Discovery in Non-English Natural Language Corpora

In [17]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [18]:
# make package management current then install required versions of packages
!pip install -q -U pip setuptools wheel python-Levenshtein \
                   gensim==4.1.0 spacy==3.1.2 pyLDAvis==3.3.1



In [19]:
# gensim package information
!pip show gensim

Name: gensim
Version: 4.1.0
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: /usr/local/lib/python3.7/dist-packages
Requires: smart-open, scipy, numpy
Required-by: pyLDAvis


In [20]:
# spaCy package information
!pip show spacy

Name: spacy
Version: 3.1.2
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: cymem, spacy-legacy, numpy, packaging, wasabi, preshed, setuptools, srsly, requests, tqdm, thinc, typer, jinja2, blis, catalogue, typing-extensions, murmurhash, pathy, pydantic
Required-by: fastai, en-core-web-sm


In [21]:
# pyLDAvis package information
!pip show pyLDAvis

Name: pyLDAvis
Version: 3.3.1
Summary: Interactive topic model visualization. Port of the R package.
Home-page: https://github.com/bmabey/pyLDAvis
Author: Ben Mabey
Author-email: ben@benmabey.com
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: scipy, pandas, numexpr, future, numpy, scikit-learn, funcy, jinja2, sklearn, joblib, gensim, setuptools
Required-by: 


In [22]:
# import dependencies

from urllib.parse import urljoin

from smart_open import open

from gensim import corpora
from gensim import models
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [23]:
# samples of french language text: five Jules Verne books, addressed by raw GitHub URL

french_book_url = 'https://raw.githubusercontent.com/jbrown544/interlingual-topic-modeling/main/text/books/fr/'

# resolve every file name against the shared base URL
jules_verne_french_books = [
  urljoin(french_book_url, book_file)
  for book_file in (
    'Around_the_World_in_Eighty_Days.txt',
    'Five_Weeks_in_a_Balloon.txt',
    'From_the_Earth_to_the_Moon.txt',
    'Robur_the_Conqueror.txt',
    'The_Mutineers_of_the_Bounty.txt',
  )
]

In [24]:
# prototype of preprocessing pipeline
def preprocess_pipeline(text, 
                        deaccent=False, 
                        min_token_len=2, 
                        max_token_length=15):
  """Tokenize raw text with gensim's ``simple_preprocess``.

  Args:
    text: document text handed to ``simple_preprocess``.
    deaccent: forwarded as ``deacc``; off by default.
    min_token_len: forwarded as ``min_len``.
    max_token_length: forwarded as ``max_len``.

  Returns:
    Whatever ``simple_preprocess`` returns for ``text``; the notebook feeds
    the result to ``corpora.Dictionary`` as a list of tokens.
  """
  # map this function's parameter names onto simple_preprocess's keywords
  tokenizer_options = dict(deacc=deaccent,
                           min_len=min_token_len,
                           max_len=max_token_length)
  tokens = simple_preprocess(text, **tokenizer_options)
  return tokens

In [25]:
# read and process texts into token lists
# (open is smart_open.open, imported above; each book is addressed by URL)
jules_verne_texts = []
for book in jules_verne_french_books:
  # the context manager closes each stream as soon as its text has been read,
  # instead of leaving the handles open until garbage collection
  with open(book, encoding='latin-1') as book_stream:
    jules_verne_texts.append(preprocess_pipeline(book_stream.read()))

In [26]:
# construct a corpora.Dictionary from the per-book token lists
jules_verne_dictionary = corpora.Dictionary(jules_verne_texts)

# print the dictionary's summary line (unique-token count and a sample of tokens)
print(jules_verne_dictionary)

Dictionary(20937 unique tokens: ['abaissait', 'abaissant', 'abaissement', 'abaissé', 'abandonna']...)


In [27]:
# create a corpus by converting each token list into a bag-of-words row
# of a document-term matrix
jules_verne_corpus = [
  jules_verne_dictionary.doc2bow(tokens)
  for tokens in jules_verne_texts
]

In [28]:
# construct an HDP model (topics determined during processing)
# a fixed random_state keeps the inferred topics reproducible across
# Restart & Run All; without it every run yields different topics
jules_verne_hdp_model = models.HdpModel(jules_verne_corpus,
                                        id2word=jules_verne_dictionary,
                                        random_state=42)

In [29]:
# switch pyLDAvis into notebook mode so that prepared visualizations
# display inline as cell output
pyLDAvis.enable_notebook()

In [30]:
# visualize the LDA model that the HDP model suggests; the prepared
# visualization is the cell's last expression, so it is displayed as output
gensimvis.prepare(
  topic_model=jules_verne_hdp_model.suggested_lda_model(),
  corpus=jules_verne_corpus,
  dictionary=jules_verne_dictionary,
)