<a href="https://colab.research.google.com/github/jbrown544/interlingual-topic-modeling/blob/main/Interlingual_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Built with spaCy](https://img.shields.io/badge/built%20with-spaCy-09a3d5.svg)](https://spacy.io)

# Unsupervised Thematic Discovery in Non-English Natural Language Corpora

*Use the "Open in Colab" link above to render visualizations.*

## Processing Environment Initialization

In [1]:
# reduce warnings for publishing

import warnings
warnings.filterwarnings("ignore")

### Environment Variables

In [2]:
# configuration folders

# place your non-ephemeral configuration files in here
%mkdir --parents caches keys

In [3]:
# set environment variable to point to your own service account key

%set_env GOOGLE_APPLICATION_CREDENTIALS=/content/keys/interlingual-topic-modeling.json

env: GOOGLE_APPLICATION_CREDENTIALS=/content/keys/interlingual-topic-modeling.json


### Package Installations

In [4]:
# update required dependencies

# If you see a message such as this after running pip (bottom of output):
#
# "WARNING: The following packages were previously imported in this runtime:"
# " [ xxxx, xxxxx ]"
# "You must restart the runtime in order to use newly installed versions."
#
# Please do restart the Python runtime. Neglecting this restart may
# result in errors during processing due to previously loaded packages.

%pip install -U python-Levenshtein \
                spacy==3.1.2 \
                pyLDAvis==3.3.1 \
                gensim==4.1.2

# pip setuptools wheel



In [5]:
# inspect spaCy package information

%pip show spacy

Name: spacy
Version: 3.1.2
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: typer, tqdm, numpy, typing-extensions, requests, jinja2, preshed, wasabi, pathy, pydantic, blis, cymem, packaging, murmurhash, spacy-legacy, setuptools, thinc, srsly, catalogue
Required-by: fr-core-news-md, fastai, en-core-web-sm


In [6]:
# inspect pyLDAvis package information

%pip show pyLDAvis

Name: pyLDAvis
Version: 3.3.1
Summary: Interactive topic model visualization. Port of the R package.
Home-page: https://github.com/bmabey/pyLDAvis
Author: Ben Mabey
Author-email: ben@benmabey.com
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: numexpr, sklearn, numpy, setuptools, jinja2, pandas, funcy, scikit-learn, future, gensim, scipy, joblib
Required-by: 


In [7]:
# inspect Gensim package information

%pip show gensim

Name: gensim
Version: 4.1.2
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: /usr/local/lib/python3.7/dist-packages
Requires: numpy, smart-open, scipy
Required-by: pyLDAvis


## Data Environment Initialization

### Dependencies

In [8]:
# import dependencies

import json
from urllib.parse import urljoin
from os.path import exists

import spacy

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

from gensim import corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim.test.utils import get_tmpfile

from smart_open import open

from google.cloud import translate_v2 as translate

  from collections import Iterable
  from collections import Mapping
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecat

In [9]:
# enable visualizations inline

pyLDAvis.enable_notebook()

### Translation Environment

#### Translation Caching

In [10]:
xlat_cache_file_name = '/content/caches/xlat.json'

In [11]:
# translation cache: loading

def load_translation_cache():
  """
  Read the persisted translation cache from disk and report its contents.

  Returns the parsed JSON content of the cache file, or an empty dictionary
  when no cache file exists yet. One summary line is printed for each
  top-level key found in a loaded cache.
  """

  # nothing persisted yet: start from an empty cache, silently
  if not exists(xlat_cache_file_name):
    return {}

  with open(xlat_cache_file_name) as cache_file:
    persisted = json.load(cache_file)

  # summarize what was loaded, one line per language mapping
  print(f'"{xlat_cache_file_name}" xlat cache loaded:')
  for mapping in persisted:
    print(f'\t[{mapping}] contains {len(persisted[mapping])} entries')

  return persisted

In [12]:
# translation cache: saving

def save_translation_cache(dirty_xlat_cache):
  """
  Persist a translation cache as JSON, replacing any previously saved file.

  Keyword arguments:
      dirty_xlat_cache -- the modified translation cache to write out
  """

  with open(xlat_cache_file_name, 'w') as cache_file:
    json.dump(dirty_xlat_cache, cache_file)

  # remind the user to copy the cache somewhere that outlives the runtime
  confirmation = f'"{xlat_cache_file_name}" xlat cache persisted.'
  reminder = 'Please make a non-ephemeral copy!!!'
  print(confirmation, reminder)

In [13]:
# translation cache: extending

def extend_translation_cache(terms,
                             source_language,
                             target_language='en',
                             batch_size=128):
  """
  Extend a translation cache resolving any unknown terms using a cloud
  translator. Return the number of cache misses and the translation dictionary
  for the specified source language containing at least the requested terms.

  Each distinct unknown term is translated, and counted as a miss, only once
  even when it occurs repeatedly in terms. The cache file is rewritten only
  when at least one term had to be translated.

  Keyword arguments:
      terms -- sequence of terms to warm in cache
      source_language -- the language of terms
      target_language -- the target language for term translation
      batch_size -- maximum number of terms sent per translation call

  Raises:
      ValueError -- if batch_size is less than 1
  """

  if batch_size < 1:
    raise ValueError('batch_size must be at least 1')

  xlat_cache = load_translation_cache()

  # get or create a dictionary for the specified language mapping
  xlat_dict = xlat_cache.setdefault(f'{source_language}->{target_language}', {})

  # unknown terms, de-duplicated in first-seen order so that a repeated term
  # is neither sent to the translator twice nor counted as two misses
  cache_terms = list(dict.fromkeys(
      term for term in terms if term not in xlat_dict))

  # if there are unknown terms, resolve them with cloud translation
  if cache_terms:

    print(f'Translating {len(cache_terms)} terms.')

    translate_client = translate.Client(target_language=target_language)

    # the cloud translation API processes lists of a finite length so
    # terms requiring resolution are sent batch_size at a time
    # NOTE(review): the v2 API appears to return HTML-escaped text unless
    # format_='text' is requested -- confirm before relying on raw strings
    for start in range(0, len(cache_terms), batch_size):
      batch = cache_terms[start:start + batch_size]
      for result in translate_client.translate(batch,
                                               source_language=source_language):
        xlat_dict[result['input']] = result['translatedText']

    save_translation_cache(xlat_cache)

  return len(cache_terms), xlat_dict

#### Gensim Dictionary Translation

In [14]:
# gensim dictionary: loading

def load_gensim_dict_file(dict_file):
  """
  Load the entries of a gensim dictionary text file then return a tuple
  containing the document count and the list of entries.

  The first line of the file is returned, stripped, as the document count
  (a string). Every following non-blank line is stripped and split on tabs
  into a list of string fields. Blank lines are skipped rather than ending
  the read, so entries that follow a blank line are not silently dropped.

  Keyword arguments:
      dict_file -- gensim dictionary file
  """

  entries = []

  with open(dict_file) as f:
    num_docs = f.readline().strip()
    for line in f:
      line = line.strip()
      if line:
        entries.append(line.split('\t'))

  return num_docs, entries

In [15]:
# gensim dictionary: saving

def save_gensim_dict_file(dict_file, gensim_dict_entries):
  """
  Write a gensim dictionary entries structure out as a text file.

  The document count goes on the first line, followed by one line per entry
  with the entry's fields joined by tabs.

  Keyword arguments:
      dict_file -- gensim dictionary file to write
      gensim_dict_entries -- tuple of (document count string, list of entries)
  """

  num_docs, entries = gensim_dict_entries

  with open(dict_file, 'w') as out:
    # header line: the document count
    out.write(num_docs)
    out.write('\n')
    # body: one tab-separated line per entry
    out.writelines('\t'.join(str(field) for field in entry) + '\n'
                   for entry in entries)

In [16]:
# gensim dictionary: individual entry translation

def translate_gensim_dictionary_entries(gensim_dict_entries,
                                        source_language,
                                        target_language='en'):
  """
  Returns the number of translation cache misses and a new gensim dictionary
  entries structure having terms augmented with translations.

  Each term is rewritten as "<translation> (<original term>)". The entries
  passed in are left unmodified: fresh entry lists are built for the result.

  Keyword arguments:
      gensim_dict_entries -- tuple of (document count, list of
                             [id, term, frequency] entries)
      source_language -- the language of entry terms
      target_language -- the target language for entry term translation
  """

  num_docs, entries = gensim_dict_entries

  # extend the cache with the requested terms
  misses, xlat_dict = \
    extend_translation_cache([term for _, term, _ in entries],
                             source_language, target_language)

  # create augmented entries as new lists; a shallow copy of the outer list
  # would still share, and therefore mutate, the caller's inner entry lists
  xlat_entries = [[term_id, f"{xlat_dict[term]} ({term})", freq]
                  for term_id, term, freq in entries]

  return misses, (num_docs, xlat_entries)

In [17]:
# gensim dictionary: dictionary translation

def translate_gensim_dictionary(gensim_dictionary,
                                source_language,
                                target_language='en'):

  """
  Returns the number of translation cache misses and a new gensim dictionary
  with entries augmented with their target language translations.

  The dictionary is round-tripped through two temporary text files: it is
  saved as text, its entries are augmented, and the augmented entries are
  loaded back as a new dictionary.

  Keyword arguments:
      gensim_dictionary -- the gensim dictionary to translate
      source_language -- the language of entry terms
      target_language -- the target language for entry term translation
  """

  # convert dictionary to entries structure by way of temp file
  dict_file = get_tmpfile('dict_file')
  gensim_dictionary.save_as_text(dict_file)
  gensim_dict_entries = load_gensim_dict_file(dict_file)

  # augment the entries structure with translations
  misses, xlat_entries = translate_gensim_dictionary_entries(
      gensim_dict_entries, source_language, target_language)

  # convert entries structure to dictionary by way of temp file
  xlat_dict_file = get_tmpfile('xlat_dict_file')
  save_gensim_dict_file(xlat_dict_file, xlat_entries)

  # load_from_text is a static factory, so call it on the class rather than
  # constructing a throwaway empty Dictionary first
  return misses, corpora.Dictionary.load_from_text(xlat_dict_file)

#### Translation Test

In [18]:
# test translate a small dictionary

misses, translated_test_dict = \
  translate_gensim_dictionary(corpora.Dictionary([['crayon', 'stylo']]), 'fr')

# Check we have a previously populated dictionary.
# This check helps ensure we are not forgetting
# to install our non-ephemeral translation dictionary.
assert misses == 0, (
    f'{misses} cache miss(es): install your non-ephemeral translation cache '
    f'at "{xlat_cache_file_name}" before running this notebook')

print(translated_test_dict)

"/content/caches/xlat.json" xlat cache loaded:
	[fr->en] contains 14782 entries
Dictionary(2 unique tokens: ['pencil (crayon)', 'pen (stylo)'])


### Language-Neutral Processing Pipelines

#### Install spaCy Models

In [19]:
# install data sets required for spaCy statistical models

%run -m spacy download fr_core_news_md

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_md')


#### Create spaCy Processing Pipelines

In [20]:
# spaCy model parameters

load_params = dict(
    # components to exclude in pipelines
    exclude=['parser', 'ner']
)

In [21]:
# load language object and inspect pipeline: French

fr_nlp = spacy.load('fr_core_news_md', **load_params)

print(fr_nlp.component_names)

['tok2vec', 'morphologizer', 'senter', 'attribute_ruler', 'lemmatizer']


#### Basic Processing

In [22]:
# basic text preprocessing

def basic_preprocess(texts,
                     deaccent=False,
                     min_token_length=2,
                     max_token_length=15):
  """
  Tokenize a list of texts with the basic (faster) preprocessing and return
  one token list per text.

  Processing consists of tokenization, lowercasing, filtering tokens to a
  length range and optionally removing accent marks.

  Keyword arguments:
      texts -- strings containing source texts
      deaccent -- remove accents from characters
      min_token_length -- the minimum token length to retain
      max_token_length -- the maximum token length to retain
  """

  tokenized_texts = []

  for source_text in texts:
    tokens = simple_preprocess(source_text,
                               deacc=deaccent,
                               min_len=min_token_length,
                               max_len=max_token_length)
    tokenized_texts.append(tokens)

  return tokenized_texts

#### Advanced Processing

In [23]:
# advanced text preprocessing

def advanced_preprocess(nlp,
                        texts,
                        min_token_length=2,
                        max_token_length=15,
                        processors=1):
  """
  Tokenize a list of texts with the more advanced (slower) preprocessing and
  return one list of lowercased lemmas per text.

  Processing consists of tokenization, lowercasing, filtering lemmas to a
  length range, filtering by high-value parts of speech (nouns, verbs, etc.),
  filtering non-alphabetic terms, and filtering language specific stopwords.

  Keyword arguments:
      nlp -- spaCy language model appropriate for texts
      texts -- list of strings containing language texts
      min_token_length -- the minimum lemma length to retain
      max_token_length -- the maximum lemma length to retain
      processors -- count of CPUs to employ; (-1=all) use with caution!
  """

  # The pipeline annotates each token with features useful for selecting
  # the more valuable terms. Only these parts of speech are preserved.
  keep_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}

  def is_valuable(token):
    """Decide whether a token's lemma belongs in the output."""
    if token.pos_ not in keep_pos:
      return False
    if not token.is_alpha or token.is_stop:
      return False
    return min_token_length <= len(token.lemma_) <= max_token_length

  tokenized_texts = []

  for doc in nlp.pipe(texts, n_process=processors):
    lemmas = [token.lemma_.lower() for token in doc if is_valuable(token)]
    tokenized_texts.append(lemmas)

  return tokenized_texts

## French Language Samples

In [24]:
# samples of the French language

# samples are available within this repository
french_book_url = ('https://raw.githubusercontent.com/jbrown544/'
                   'interlingual-topic-modeling/main/text/books/fr/')

# characteristics: few, lengthy documents with informational headers intact
jules_verne_french_books = [
  'Five_Weeks_in_a_Balloon.txt',
  'Around_the_World_in_Eighty_Days.txt',
  'Robur_the_Conqueror.txt',
  'From_the_Earth_to_the_Moon.txt',  
  'The_Begums_Fortune.txt',
  'The_Blockade_Runners.txt',
  'The_Mutineers_of_the_Bounty.txt'
]

#### Process French Language Samples

In [25]:
# generator for loading encoded french books

%time jules_verne_texts = \
  [open(urljoin(french_book_url, book), encoding='latin-1').read() \
    for book in jules_verne_french_books]

CPU times: user 120 ms, sys: 10.3 ms, total: 131 ms
Wall time: 2.79 s


In [26]:
# preprocessing parameters

preprocess_params = dict(
    texts=jules_verne_texts, 
    min_token_length=3    
)

In [27]:
# basic preprocessing

%time jules_verne_docs_basic = basic_preprocess(**preprocess_params)

CPU times: user 428 ms, sys: 9.91 ms, total: 438 ms
Wall time: 439 ms


In [28]:
# advanced preprocessing

%time jules_verne_docs_adv = advanced_preprocess(fr_nlp, **preprocess_params)

CPU times: user 48.7 s, sys: 3.97 s, total: 52.7 s
Wall time: 52.6 s


In [29]:
# check the term lengths on docs for grades of preprocessing

# NOTE: refactor into a general EDA function?

print(f'terms (basic vs. advanced):')
for title, basic, advanced \
 in zip(jules_verne_french_books,
        jules_verne_docs_basic,
        jules_verne_docs_adv):
  print(f'\t{len(basic)}\tvs.\t{len(advanced)}\t({title})')

terms (basic vs. advanced):
	59713	vs.	32377	(Five_Weeks_in_a_Balloon.txt)
	51505	vs.	27415	(Around_the_World_in_Eighty_Days.txt)
	43217	vs.	22509	(Robur_the_Conqueror.txt)
	40445	vs.	22174	(From_the_Earth_to_the_Moon.txt)
	38249	vs.	21409	(The_Begums_Fortune.txt)
	13719	vs.	7314	(The_Blockade_Runners.txt)
	5348	vs.	2763	(The_Mutineers_of_the_Bounty.txt)


In [30]:
# use advanced output for subsequent processing

# NOTE: remove the smaller two books?

jules_verne_docs = jules_verne_docs_adv

print(len(jules_verne_docs))

7


In [31]:
# construct dictionary from processed docs

jules_verne_dictionary = corpora.Dictionary(jules_verne_docs)

print(jules_verne_dictionary)

Dictionary(11668 unique tokens: ['abaissement', 'abaisser', 'abandonne', 'abandonner', 'abat']...)


In [32]:
# filter extremes from the dictionary 
# by removing words that appear too 
# frequently or too rarely

# NOTE: consider only terms in 2 or more docs?

jules_verne_dictionary.filter_extremes(no_below=1,
                                       no_above=0.75,
                                       keep_n=10_000)

print(jules_verne_dictionary)

Dictionary(10000 unique tokens: ['abaissement', 'abandonne', 'abat', 'abattre', 'abattu']...)


In [33]:
# construct corpus (term-document matrix) by converting 
# processed docs into bag-of-words representations

jules_verne_corpus = [jules_verne_dictionary.doc2bow(doc) \
                      for doc in jules_verne_docs]

### HDP Topic Modelling

In [34]:
# construct HDP model

# Online HDP is efficient non-parametric topic modelling that 
# does not require a specified topic count. The "T" parameter 
# restricts the number of topics revealed.
%time jules_verne_hdp_model = models.HdpModel(jules_verne_corpus, \
                                              jules_verne_dictionary, \
                                              T=20)

CPU times: user 439 ms, sys: 212 ms, total: 651 ms
Wall time: 428 ms


In [35]:
# create a translation of the dictionary for presentation

misses, jules_verne_xlat_dictionary = \
  translate_gensim_dictionary(jules_verne_dictionary, 'fr')
  
print(f'{misses} cache misses.')
print(jules_verne_xlat_dictionary)

"/content/caches/xlat.json" xlat cache loaded:
	[fr->en] contains 14782 entries
0 cache misses.
Dictionary(10000 unique tokens: ['lowering (abaissement)', 'abandoned (abandonne)', 'stun (abasourdir)', 'abatement (abat)', 'abatement (abattement)']...)


#### Model Visualization

In [36]:
# prepare model visualization

# Build the pyLDAvis data from the HDP model and its corpus. The translated
# dictionary is passed so terms are labelled with their translations, and
# mds='tsne' requests t-SNE for laying out the topic map.
vis_data = gensimvis.prepare(jules_verne_hdp_model,
                             jules_verne_corpus,
                             jules_verne_xlat_dictionary,
                             mds='tsne')

# optionally export the prepared visualization as a standalone HTML page:
# pyLDAvis.save_html(vis_data, 'jules_verne_topics.html')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = np.float
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  error = np.finfo(np.float).max
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  best_error = np.finfo(np.float).max
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  error = np.finfo(np.float).max
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  best_error = np.finfo(np.float).max


In [37]:
# display the visualization inline

pyLDAvis.display(vis_data)

#### Model Evaluation

In [38]:
# evaluate model coherence

coherence_model = models.CoherenceModel(jules_verne_hdp_model,
                                        corpus=jules_verne_corpus,
                                        coherence='u_mass')

print(f'{coherence_model.get_coherence()} coherence')

-8.671587170236354 coherence


In [39]:
# coherence_model.top_topics_as_word_lists(jules_verne_hdp_model,
#                                          jules_verne_xlat_dictionary,
#                                          topn=10)