<a href="https://colab.research.google.com/github/jbrown544/interlingual-topic-modeling/blob/main/Interlingual_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised Thematic Discovery in Non-English Natural Language Corpora

**Use the "Open in Colab" link above to render visualizations.**

## Processing Environment Initialization

[![Built with spaCy](https://img.shields.io/badge/built%20with-spaCy-09a3d5.svg)](https://spacy.io)

In [1]:
# supress library maintenance warnings

import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
# warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [2]:
# update package management then install required versions of dependencies
!pip install -q -U pip setuptools wheel python-Levenshtein numpy \
                   gensim==4.1.0 spacy==3.1.2 pyLDAvis==3.3.1



In [3]:
# inspect Gensim package information
!pip show gensim

Name: gensim
Version: 4.1.0
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPL-2.1-only
Location: /usr/local/lib/python3.7/dist-packages
Requires: numpy, smart-open, scipy
Required-by: pyLDAvis


In [4]:
# inspect spaCy package information
!pip show spacy

Name: spacy
Version: 3.1.2
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: srsly, setuptools, pydantic, pathy, catalogue, requests, typer, wasabi, numpy, thinc, blis, preshed, jinja2, murmurhash, typing-extensions, cymem, spacy-legacy, tqdm, packaging
Required-by: fastai, en-core-web-sm


In [5]:
# instpect pyLDAvis package information
!pip show pyLDAvis

Name: pyLDAvis
Version: 3.3.1
Summary: Interactive topic model visualization. Port of the R package.
Home-page: https://github.com/bmabey/pyLDAvis
Author: Ben Mabey
Author-email: ben@benmabey.com
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: funcy, joblib, scikit-learn, future, gensim, setuptools, numexpr, sklearn, jinja2, scipy, numpy, pandas
Required-by: 


## Data Environment Initialization

In [6]:
# import dependencies

from urllib.parse import urljoin

from gensim import models
from gensim import corpora
from gensim.utils import simple_preprocess

from smart_open import open

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

  from collections import Iterable
  from collections import Mapping
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for mor

In [7]:
# enable inline rendering of pyLDAvis visualizations within the notebook
pyLDAvis.enable_notebook()

### Language-Neutral Processing Pipelines

In [8]:
# basic text preprocessing
def basic_preprocess(texts,
                     deaccent=False,
                     min_token_length=2,
                     max_token_length=15):
  """Run Gensim's ``simple_preprocess`` over every text.

  Per the original notes, this performs tokenization, lowercasing,
  filtering of tokens to a length range, and optional removal of
  accent marks.

  Args:
    texts: iterable of raw document strings.
    deaccent: forwarded to ``simple_preprocess`` as ``deacc``.
    min_token_length: forwarded to ``simple_preprocess`` as ``min_len``.
    max_token_length: forwarded to ``simple_preprocess`` as ``max_len``.

  Returns:
    A list holding one ``simple_preprocess`` result per input text,
    in input order.
  """
  processed_docs = []
  for raw_text in texts:
    tokens = simple_preprocess(raw_text,
                               deacc=deaccent,
                               min_len=min_token_length,
                               max_len=max_token_length)
    processed_docs.append(tokens)
  return processed_docs

In [9]:
# advanced text preprocessing (placeholder, not yet implemented)
def advanced_preprocess(texts, language):
  """Placeholder for language-aware preprocessing; not implemented yet.

  Both arguments are currently ignored and the function returns None.
  """
  return None

### French Language Samples

In [10]:
# samples from the French language

# base URL of the sample books, which are hosted within this repository
french_book_url = (
  'https://raw.githubusercontent.com/jbrown544/'
  'interlingual-topic-modeling/main/text/books/fr/'
)

# one absolute URL per Jules Verne book, resolved against the base URL
jules_verne_french_books = [
  urljoin(french_book_url, book_file)
  for book_file in (
    'Around_the_World_in_Eighty_Days.txt',
    'Five_Weeks_in_a_Balloon.txt',
    'From_the_Earth_to_the_Moon.txt',
    'Robur_the_Conqueror.txt',
    'The_Mutineers_of_the_Bounty.txt',
    'The_Begums_Fortune.txt',
    'The_Blockade_Runners.txt',
  )
]

#### Process French Language Samples

In [11]:
# read each book and process its text into a processed doc

encoding = 'latin-1'

# open every source as a context manager so that its handle is closed
# as soon as the text has been read, instead of being left open
jules_verne_french_texts = []
for book in jules_verne_french_books:
  with open(book, encoding=encoding) as book_file:
    jules_verne_french_texts.append(book_file.read())

jules_verne_docs = basic_preprocess(jules_verne_french_texts)

In [12]:
# construct dictionary (vocabulary of unique tokens) from processed docs
jules_verne_dictionary = corpora.Dictionary(jules_verne_docs)

# print the dictionary summary as a quick sanity check of the vocabulary
print(jules_verne_dictionary)

Dictionary(23777 unique tokens: ['abaissait', 'abaissant', 'abaissement', 'abaissé', 'abandonna']...)


In [13]:
# build the corpus (term-document matrix) by mapping every processed doc
# to its bag-of-words representation through the dictionary
jules_verne_corpus = list(map(jules_verne_dictionary.doc2bow,
                              jules_verne_docs))

## HDP Topic Modelling Inspection (Automatic Case)

In [14]:
# construct HDP model (topics discovered during processing)
# a fixed random_state seeds the model's random number generator so that
# the discovered topics are repeatable across notebook runs
jules_verne_hdp_model = models.HdpModel(jules_verne_corpus,
                                        id2word=jules_verne_dictionary,
                                        random_state=42)

In [15]:
# obtain the LDA model suggested by the computed HDP model
jules_verne_suggested_lda = jules_verne_hdp_model.suggested_lda_model()

# prepare the visualization; as the cell's last expression it is
# rendered as the cell output
gensimvis.prepare(jules_verne_suggested_lda,
                  jules_verne_corpus,
                  jules_verne_dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


## Translation Environment Initialization

In [16]:
# set environment variable to point to your own service account key
# NOTE(review): presumably read by the Google Cloud client that
# translate_text creates without explicit credentials -- confirm
%env GOOGLE_APPLICATION_CREDENTIALS /content/keys/interlingual-topic-modeling.json

env: GOOGLE_APPLICATION_CREDENTIALS=/content/keys/interlingual-topic-modeling.json


In [17]:
# sample code from google documentation
def translate_text(target, text):
    """Translates text into the target language and prints each result.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Args:
        target: ISO 639-1 code of the language to translate into.
        text: a single string (``str`` or UTF-8 encoded ``bytes``) or a
            sequence of strings.

    Returns:
        None. For every input text, the input, its translation and the
        detected source language are printed.
    """
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    results = translate_client.translate(text, target_language=target)

    # A single input string yields one result dict rather than a list;
    # wrap it so the loop below does not iterate over the dict's keys.
    if isinstance(results, dict):
        results = [results]

    for result in results:
        print(u"Text: {}".format(result["input"]))
        print(u"Translation: {}".format(result["translatedText"]))
        print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))

In [18]:
# test translations: translate a few sample words into French
translate_text('fr', ['pencil', 'pen', 'paper'])

Text: pencil
Translation: crayon
Detected source language: en
Text: pen
Translation: stylo
Detected source language: en
Text: paper
Translation: papier
Detected source language: en
