# Multilanguage topic modeling with BERT
- badges: true
- comments: true
- categories: [BERT,topics,nlp]



In [None]:
#collapse-output
# Install the topic-model library, then replace whatever transformers release is
# present with 3.0.2 (uninstall first, then install the pinned version).
# NOTE(review): contextualized_topic_models itself is installed unpinned, while the
# cells below rely on TextHandler / bert_embeddings_from_file / CTMDataset from it --
# confirm which release this notebook was written against and pin it as well.
!pip install contextualized_topic_models
!pip uninstall transformers -y
!pip install transformers==3.0.2

Uninstalling transformers-3.1.0:
  Successfully uninstalled transformers-3.1.0
Collecting transformers==3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 3.4MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 17.9MB/s 
Installing collected packages: tokenizers, transformers
  Found existing installation: tokenizers 0.8.1rc2
    Uninstalling tokenizers-0.8.1rc2:
      Successfully uninstalled tokenizers-0.8.1rc2
Successfully installed tokenizers-0.8.1rc1 transformers-3.0.2


In [None]:
import os
import pickle

import numpy as np
import torch
from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file, bert_embeddings_from_list
from contextualized_topic_models.datasets.dataset import CTMDataset
from contextualized_topic_models.utils.data_preparation import TextHandler

In [None]:
!curl -s https://raw.githubusercontent.com/MilaNLProc/contextualized-topic-models/master/contextualized_topic_models/data/gnews/GoogleNews.txt | head -n1000 > googlenews.txt
!head googlenews.txt
!cat googlenews.txt | wc -l

centrepoint winter white gala london
mourinho seek killer instinct
roundup golden globe won seduced johansson voice
travel disruption mount storm cold air sweep south florida
wes welker blame costly turnover
psalm book fetch record ny auction ktvn channel reno
surface review comparison window powered tablet pitted
scientist unreported fish trap space
nokia lumia launch
edward snowden latest leak nsa monitored online porn habit radicalizers
1000


### Load the Data

In [None]:
# Path of the corpus file written by the download cell (one document per line).
file_name = "googlenews.txt"
# The handler reads the file; prepare() builds the vocabulary and the training data
# that later cells access as handler.vocab, handler.bow and handler.idx2token.
handler = TextHandler(file_name)
handler.prepare() # create vocabulary and training data 

In [None]:
# generate BERT data: embed the documents in file_name with the
# "distiluse-base-multilingual-cased" model
train_bert = bert_embeddings_from_file(file_name, "distiluse-base-multilingual-cased")
# Bundle the bag-of-words data, the embeddings and the index-to-token mapping into
# the dataset object that is passed to ctm.fit() below.
training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

### Train the Fully Contextualized Topic Model

In [None]:
# Seed the global NumPy and PyTorch generators before the model is created so that
# a "Restart & Run All" draws the same random numbers every time. Topic ids are an
# artefact of training, so without this the ids shown in later outputs change on
# every run.
# NOTE(review): some GPU kernels can still be non-deterministic -- see the PyTorch
# reproducibility notes if bit-exact results are required.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

num_topics = 50  # number of topics to learn; also sizes the theta matrix at inference time
ctm = CTM(
    input_size=len(handler.vocab),  # one input unit per vocabulary entry
    bert_input_size=512,            # presumably the width of the distiluse embeddings in train_bert -- confirm
    num_epochs=100,
    hidden_sizes=(100, ),
    inference_type="contextual",
    n_components=num_topics,
    num_data_loader_workers=0,
)

ctm.fit(training_dataset) # run the model

In [None]:
# Display, for every topic, the list of its top-5 words.
ctm.get_topic_lists(5) # get the top-5 words lists

[['kim', 'west', 'kanye', 'kardashian', 'bound'],
 ['day', 'thanksgiving', 'parade', 'macy', 'packer'],
 ['patriot', 'bronco', 'pat', 'packer', 'loss'],
 ['xbox', 'microsoft', 'p', 'game', 'console'],
 ['government', 'political', 'thai', 'party', 'protest'],
 ['oldboy', 'brolin', 'josh', 'lee', 'spike'],
 ['google', 'chrome', 'search', 'extension', 'voice'],
 ['johansson', 'globe', 'golden', 'scarlett', 'ineligible'],
 ['star', 'dancing', 'amber', 'riley', 'win'],
 ['police', 'guilty', 'watkins', 'case', 'lostprophets'],
 ['san', 'andreas', 'gta', 'mobile', 'android'],
 ['flat', 'future', 'record', 'level', 'p'],
 ['thanksgiving', 'day', 'parade', 'thanksgivukkah', 'holiday'],
 ['jos', 'wearhouse', 'men', 'bank', 'baldwin'],
 ['prince', 'william', 'swift', 'jovi', 'bon'],
 ['porn', 'nsa', 'habit', 'radicalizers', 'spying'],
 ['pope', 'church', 'putin', 'issue', 'coalition'],
 ['report', 'benghazi', 'security', 'baldwin', 'alec'],
 ['china', 'zone', 'flight', 'airspace', 'disputed'],
 [

In [None]:
!tail -n 5 googlenews.txt > test.txt
!cat test.txt

ray whitney return will dallas star huge boost offensively
s relied intermediary probe spacex sept upper stage
nokia lumia tablet kill surface
lakers net preview
neighbor helped save girl imprisoned year speaks


In [None]:
# Prepare the test documents the same way as the training corpus.
test_file = "test.txt"

# Vocabulary and bag-of-words data for the test file.
# NOTE(review): this vocabulary is built from test.txt alone, so it is not aligned
# with the training vocabulary -- presumably harmless because the model was created
# with inference_type="contextual"; confirm.
test_handler = TextHandler(test_file)
test_handler.prepare()

# Embeddings for the test documents, produced by the same model used for training.
testing_bert = bert_embeddings_from_file(test_file, "distiluse-base-multilingual-cased")

testing_dataset = CTMDataset(test_handler.bow, testing_bert, test_handler.idx2token)

In [None]:

# A single call to get_thetas() is one sample of the document-topic distribution;
# accumulate many samples and renormalise to get a steadier estimate.
n_draws = 100
thetas = np.zeros((len(testing_dataset), num_topics))
for _ in range(n_draws):
    thetas += np.array(ctm.get_thetas(testing_dataset))

# Rescale every row (one per test document) so that it sums to 1, then take the
# highest-weighted topic of each document as its prediction.
thetas /= thetas.sum(axis=1, keepdims=True)
predicted_topics = list(thetas.argmax(axis=1))

# Predicted topic id per test document (display `thetas` instead to inspect the
# full document-topic distribution).
predicted_topics

[22, 41, 44, 23, 47]

In [None]:
# Show the raw text of the second test document (index 1), the one whose
# predicted topic is looked up in the next cell.
test_handler.load_text_file()[1]

's relied intermediary probe spacex sept upper stage\n'

In [None]:
# Top-20 words of the topic predicted for the test document at index 1.
# The topic id is read from predicted_topics instead of being typed in by hand:
# ids are assigned during training, so a literal copied from an earlier run's
# output would point at an unrelated topic after any retrain.
ctm.get_topic_lists(20)[predicted_topics[1]]

['ison',
 'comet',
 'raptor',
 'sun',
 'bonobo',
 'dna',
 'flying',
 'trouble',
 'stereo',
 'seahorse',
 'researcher',
 'preview',
 'spacecraft',
 'century',
 'jellyfish',
 'testing',
 'minute',
 'net',
 'spectacular',
 'congo']