<a href="https://colab.research.google.com/github/jbrown544/interlingual-topic-modeling/blob/main/Interlingual_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised Thematic Discovery in Non-English Natural Language Corpora

*Note: Use the "Open in Colab" link above to render inline visualizations.*

In [29]:
import warnings

# supress library maintenance warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [30]:
# update package management then install required versions of dependencies
!pip install -q -U pip setuptools wheel python-Levenshtein \
                   gensim==4.1.0 spacy==3.1.2 pyLDAvis==3.3.1



In [31]:
# inspect Gensim package information to confirm the installed version
# (the install cell pins gensim==4.1.0)
!pip show gensim

Name: gensim
Version: 4.1.0
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: /usr/local/lib/python3.7/dist-packages
Requires: numpy, smart-open, scipy
Required-by: pyLDAvis


In [32]:
# inspect spaCy package information to confirm the installed version
# (the install cell pins spacy==3.1.2)
!pip show spacy

Name: spacy
Version: 3.1.2
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: spacy-legacy, numpy, catalogue, requests, blis, typer, thinc, pathy, wasabi, jinja2, tqdm, pydantic, typing-extensions, setuptools, packaging, preshed, srsly, cymem, murmurhash
Required-by: fastai, en-core-web-sm


In [33]:
# inspect pyLDAvis package information to confirm the installed version
# (the install cell pins pyLDAvis==3.3.1)
!pip show pyLDAvis

Name: pyLDAvis
Version: 3.3.1
Summary: Interactive topic model visualization. Port of the R package.
Home-page: https://github.com/bmabey/pyLDAvis
Author: Ben Mabey
Author-email: ben@benmabey.com
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: funcy, scikit-learn, scipy, sklearn, future, gensim, numpy, jinja2, setuptools, joblib, pandas, numexpr
Required-by: 




---



In [34]:
# import dependencies

from urllib.parse import urljoin

from smart_open import open

from gensim import corpora
from gensim import models
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [35]:
# sample texts in the French language

# base location of the samples, which are hosted within this repository
french_book_url = 'https://raw.githubusercontent.com/jbrown544/interlingual-topic-modeling/main/text/books/fr/'

# resolve each Jules Verne title against the base location
jules_verne_french_books = [
  urljoin(french_book_url, book_file_name)
  for book_file_name in (
    'Around_the_World_in_Eighty_Days.txt',
    'Five_Weeks_in_a_Balloon.txt',
    'From_the_Earth_to_the_Moon.txt',
    'Robur_the_Conqueror.txt',
    'The_Mutineers_of_the_Bounty.txt',
  )
]

In [36]:
# prototype of the text preprocessing pipeline
def normalize_text(text,
                   deaccent=False,
                   min_token_length=2,
                   max_token_length=15):
  """Convert raw text into a list of normalized tokens.

  Delegates to Gensim's `simple_preprocess`, which handles tokenization,
  lowercasing, filtering tokens to a length range, and (optionally)
  removing accent marks.

  Args:
    text: the raw document text to process.
    deaccent: when True, accent marks are removed from tokens.
    min_token_length: minimum token length, forwarded as `min_len`.
    max_token_length: maximum token length, forwarded as `max_len`.

  Returns:
    Whatever `simple_preprocess` returns for the given arguments
    (the processed tokens of `text`).
  """
  return simple_preprocess(text,
                           deacc=deaccent,
                           min_len=min_token_length,
                           max_len=max_token_length)

In [37]:
# read and process documents into processed lists
def _read_book(url, encoding='latin-1'):
  """Return the entire contents of the text document located at `url`.

  The stream is opened in a `with` block so that it is closed as soon as
  the document has been read instead of being left open until garbage
  collection. `open` is the one this notebook imports from `smart_open`,
  which is what allows the book URLs to be read directly.

  Args:
    url: location of the document to read.
    encoding: text encoding used to decode the document; the sample
      books are read as Latin-1.

  Returns:
    The decoded text of the whole document as a single string.
  """
  with open(url, encoding=encoding) as book_stream:
    return book_stream.read()

jules_verne_texts = [normalize_text(_read_book(book))
                     for book in jules_verne_french_books]

In [38]:
# construct dictionary (the vocabulary of unique tokens) from processed lists
jules_verne_dictionary = corpora.Dictionary(jules_verne_texts)

# printing the dictionary reports its unique token count and a few sample tokens
print(jules_verne_dictionary)

Dictionary(20937 unique tokens: ['abaissait', 'abaissant', 'abaissement', 'abaissé', 'abandonna']...)


In [39]:
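# optional vocabulary pruning (intentionally disabled; uncomment to enable):
# removes tokens that appear in fewer than `no_below` documents or in more
# than the `no_above` fraction of documents
# jules_verne_dictionary.filter_extremes(no_below=1,
#                                        no_above=0.5)

# print(jules_verne_dictionary)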

In [40]:
# build the corpus (document-term matrix): each processed token list is
# converted into its bag-of-words representation
jules_verne_corpus = [
  jules_verne_dictionary.doc2bow(tokens) for tokens in jules_verne_texts
]

In [41]:
# construct HDP model (topics discovered during processing); a fixed
# random_state seeds the model's random number generator so that re-running
# the notebook from a fresh kernel starts from the same initialization
jules_verne_hdp_model = models.HdpModel(jules_verne_corpus,
                                        id2word=jules_verne_dictionary,
                                        random_state=42)

In [42]:
# enable inline display of pyLDAvis visualizations in the notebook
pyLDAvis.enable_notebook()

In [43]:
# derive the LDA model suggested by the computed HDP model
jules_verne_suggested_lda = jules_verne_hdp_model.suggested_lda_model()

# prepare the visualization; as the last expression of the cell it is
# displayed as the cell output
gensimvis.prepare(jules_verne_suggested_lda,
                  jules_verne_corpus,
                  jules_verne_dictionary)