#1 Preparations

Install third party libraries:

In [None]:
!pip install aiohttp==3.6.2
!pip install nest_asyncio==1.4.0
!pip install aiosqlite==0.15.0
!pip install gensim==3.8.3
!pip install pyLDAvis==2.1.2
!pip install colorspacious==1.1.2
!pip install spacy==2.2.4
!pip install nltk==3.4
!pip install tqdm==4.41.1
!pip install bokeh==2.1.1
!pip install beautifulsoup4==4.7.1
!pip install pandas==1.0.5
!pip install numpy==1.18.5
!pip install wordcloud==1.5.0
!pip install sklearn==0.0

In [None]:
import os
import sys
import time
import re
import logging
import math
from pathlib import Path
from datetime import date, timedelta
from pprint import pprint
from collections import defaultdict

import aiohttp
import asyncio
import aiosqlite
import sqlite3
import tqdm
import traceback
import nest_asyncio
import gensim
import spacy
import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
from wordcloud import WordCloud
from bs4 import BeautifulSoup

Prepare folder structure:

In [None]:
# Working directories for scraped data, trained models and generated plots.
# mkdir(exist_ok=True) makes this cell safe to re-run.
data_dir = Path('01-data')
data_dir.mkdir(exist_ok=True)
models_dir = Path('02-models')
models_dir.mkdir(exist_ok=True)
plots_dir = Path('03-plots')
plots_dir.mkdir(exist_ok=True)
# The directory for downloaded third-party tools is also exported as an
# environment variable so that the shell commands in later cells
# ("$EXTRA_LIBRARIES") can refer to it.
extra_libraries_dir = Path('04_extra_libraries')
os.environ["EXTRA_LIBRARIES"] = str(extra_libraries_dir)
extra_libraries_dir.mkdir(exist_ok=True)

# SQLite database file that holds the scraped articles.
article_db = data_dir / 'articles.db'

#2 Building the text corpus

To build a large text corpus, we will scrape some newspaper articles from zeit.de, which are publicly available and easy to parse. Newspaper articles in general are a good start to get familiar with topic modelling since we can more or less anticipate distinct topics.

To scrape, we will first navigate through zeit.de's sitemap and download all available urls to online articles for a given period. Then, we will download them, parse their content and save the resulting text in a text file, one article per line. 

We will be using Python's asyncio library, which allows us to write asynchronous code. This way, Python can request multiple articles from the server at once without having to wait for the first request to be completed.

We will need this so that asyncio works within an IPython notebook:

In [None]:
nest_asyncio.apply()

In [None]:
# Create the table that stores one row per article URL.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE raises
# sqlite3.OperationalError as soon as the table already exists.
db = sqlite3.connect(str(article_db))
try:
    db.execute("""
      CREATE TABLE IF NOT EXISTS articles
        (
          article_id INTEGER PRIMARY KEY,
          url TEXT UNIQUE,
          title TEXT,
          text TEXT,
          authors TEXT,
          publishing_date TEXT,
          topic_ref TEXT,
          tags TEXT,
          downloaded TEXT
        )
    """)
    db.commit()
finally:
    # Close the connection even when the statement fails.
    db.close()

In [None]:
async def fetch_html(url, session):
    """Request ``url`` with GET through ``session`` and return the body text.

    ``raise_for_status()`` is called on the response before the body is read,
    so whatever it raises propagates to the caller.
    """
    reply = await session.request(method="GET", url=url)
    reply.raise_for_status()
    return await reply.text()


async def parse_html(url, session):
    """Download one sitemap page and return the article URLs it lists.

    Returns a list with the text of the ``<loc>`` element of every ``<url>``
    entry in the page, or ``None`` when the page could not be downloaded.
    """
    try:
        html = await fetch_html(url=url, session=session)
    except (
        aiohttp.ClientError,
        aiohttp.http_exceptions.HttpProcessingError,
    ) as ex:
        print(ex)
        return None

    # A later cell redefines fetch_html so that it returns None on failure
    # instead of raising; treat that the same way as a failed download.
    if html is None:
        return None

    soup = BeautifulSoup(html, features="lxml")
    # Entries without a <loc> child are skipped instead of raising
    # AttributeError on `None.text`.
    return [
        entry.loc.text
        for entry in soup.find_all("url")
        if entry.loc is not None
    ]


async def write_urls_to_db(db_session, url, session):
    """Store every article URL listed on the sitemap page ``url``.

    Each URL is inserted with ``downloaded`` set to "false"; URLs already in
    the table are ignored. Nothing is written when the sitemap page yields no
    URLs. The caller is responsible for committing.
    """
    article_urls = await parse_html(url=url, session=session)
    if not article_urls:
        return

    insert_sql = "INSERT OR IGNORE INTO articles (url, downloaded) VALUES (?,?)"
    for article_url in article_urls:
        row = (str(article_url), "false")
        try:
            await db_session.execute(insert_sql, row)
        except sqlite3.InterfaceError as ex:
            # Report the failing insert and continue with the next URL.
            print(ex)
 
async def bulk_crawl_and_write(article_db, start_date, end_date):
    """Collect article URLs from the zeit.de sitemap for a range of days.

    One sitemap page is requested per day from ``start_date`` (inclusive) to
    ``end_date`` (exclusive). The URLs found are written to the ``articles``
    table of the database at ``article_db`` and committed once all pages
    have been processed.
    """
    def daterange(start_date, end_date):
        '''Helper function to easily iterate over date range'''
        for n in range((end_date - start_date).days):
            yield start_date + timedelta(n)

    base_url = "https://www.zeit.de/gsitemaps/index.xml?date="
    # We have to trick zeit.de into thinking we are running the requests
    # using the library requests:
    headers = {"User-Agent": "python-requests/2.21.0"}

    # Start client session:
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=5),
        cookie_jar=aiohttp.CookieJar(),
        headers=headers
    ) as session:
        # Run a request to set a cookie for the session. Only the cookie is
        # needed, so the response is released right away instead of keeping
        # one of the five pooled connections busy.
        async with session.request(
            method="GET", url="https://www.zeit.de/gsitemaps/index.xml"
        ):
            pass

        async with aiosqlite.connect(article_db) as db:
            tasks = [
                write_urls_to_db(
                    db_session=db,
                    url=base_url + single_date.strftime("%Y-%m-%d"),
                    session=session,
                )
                for single_date in daterange(start_date, end_date)
            ]

            # The tasks return nothing; awaiting them only drives the
            # progress bar and surfaces exceptions.
            for finished in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks)):
                await finished

            await db.commit()

Retrieve all URLs listed in their sitemap for a defined period:

In [None]:
# Crawl window: start_date is part of the range, end_date is not.
start_date = date(year=2020, month=1, day=1) #including
end_date = date(year=2020, month=2, day=1) #excluding

# Run the crawl on the notebook's event loop and report the wall-clock time.
start_time = time.time()
loop = asyncio.get_event_loop()
result = loop.run_until_complete(bulk_crawl_and_write(
    article_db=article_db,
    start_date=start_date,
    end_date=end_date
))
duration = time.time() - start_time
print(f"\nDownloaded sites in {duration:.2f} seconds")

Let's see how many URLs we have collected:

In [None]:
# Count the URLs that still have to be downloaded. COUNT(*) lets SQLite do
# the counting instead of loading every row into a Python list.
db = sqlite3.connect(str(article_db))
try:
    cursor = db.cursor()
    cursor.execute("SELECT COUNT(*) FROM articles WHERE downloaded = 'false'")
    print(cursor.fetchone()[0])
finally:
    db.close()

If we have a look at https://www.zeit.de/robots.txt, it tells us we should not touch URLs that contain:
* /zeit/
* /templates/
* /hp_channels/
* /send/
* /suche/
* /comment-thread
* /liveblog-backend

Most of these URLs won't be listed in the sitemap but we never know, so let's explicitly remove them from our list of URLs:

In [None]:
db = sqlite3.connect(str(article_db))
cursor = db.cursor()

# Path fragments that robots.txt asks crawlers not to touch. The last two
# are listed without a trailing slash, so they are matched without one;
# requiring "/comment-thread/" would miss URLs such as ".../comment-thread?...".
disallowed_fragments = [
    "/zeit/",
    "/templates/",
    "/hp_channels/",
    "/send/",
    "/suche/",
    "/comment-thread",
    "/liveblog-backend",
]

# Delete every collected URL that contains one of the fragments:
for fragment in disallowed_fragments:
    cursor.execute(
        "DELETE FROM articles WHERE url LIKE ?",
        (f"%{fragment}%",)
    )

db.commit()
db.close()

In [None]:
def extract_content(tag):
    """Return the text found below ``tag``, each string prefixed by a space.

    ``tag.contents`` is walked recursively: string children contribute
    ``" " + string``, every other child contributes its own extracted content.
    """
    return "".join(
        " " + child if isinstance(child, str) else extract_content(child)
        for child in tag.contents
    )


def clean_text(text):
    """Put ``text`` on one line, collapse whitespace runs and trim the ends."""
    # Line feeds and carriage returns each become a single space.
    single_line = text.translate({ord("\n"): " ", ord("\r"): " "})
    # Every run of two or more whitespace characters shrinks to one space.
    return re.sub(r"\s{2,}", " ", single_line).strip()


def parse_article(raw_html):
    """Extract metadata and body text from the HTML of one article page.

    Returns a dict with the keys "text", "title", "tags", "authors" and
    "publishing_date". A field whose markup is not found stays an empty
    string. Spaces inside a single tag or author name are replaced by "_"
    and the names are then joined with spaces.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    parsed = {
        "text": "",
        "title": "",
        "tags": "",
        "authors": "",
        "publishing_date": "",
    }

    # Publishing date: the datetime attribute of the metadata <time> element.
    date_node = soup.find("time", class_="metadata__date")
    if date_node:
        parsed["publishing_date"] = date_node.get("datetime", "")

    # Title: the first <span> that matches one of the heading class names,
    # tried in this order.
    title_classes = (
        "article-heading__title",
        "column-heading__title",
        "headline__title",
        "article-header__title article-header__title--default",
    )
    for css_class in title_classes:
        title_node = soup.find("span", class_=css_class)
        if title_node:
            parsed["title"] = title_node.text.strip()
            break

    # Tags: one token per tag link.
    tag_links = soup.find_all("a", class_="article-tags__link")
    parsed["tags"] = " ".join(
        link.text.replace(" ", "_") for link in tag_links
    )

    # Authors: the name spans inside the byline block.
    byline = soup.find("div", class_="byline")
    if byline:
        name_spans = byline.find_all("span", itemprop="name")
        parsed["authors"] = " ".join(
            span.text.replace(" ", "_") for span in name_spans
        )

    # Article text: the concatenated content of all body paragraphs.
    paragraphs = soup.find_all("p", class_="paragraph article__item")
    body = "".join(extract_content(paragraph) for paragraph in paragraphs)
    parsed["text"] = clean_text(body)

    return parsed


async def fetch_html(url, session):
    """GET ``url`` through ``session``; return the body text or None on failure.

    The listed aiohttp errors (including those raised by
    ``raise_for_status()``) are not propagated: the traceback is printed and
    ``None`` is returned. This definition replaces the ``fetch_html`` defined
    in an earlier cell, which lets such errors propagate.
    """
    try:
        reply = await session.request(method="GET", url=url)
        reply.raise_for_status()
        body = await reply.text()
    except (
        aiohttp.ClientError,
        aiohttp.ClientResponseError,
        aiohttp.http_exceptions.HttpProcessingError,
    ):
        traceback.print_exc()
        return None
    return body


async def download_and_parse_article_url(url, article_id, session, db):
    """Download one article, parse it and store the result in its row.

    When the page links to a single-page version ("komplettansicht"), that
    version is downloaded and parsed instead. If any download yields nothing,
    the row is left untouched, so it keeps ``downloaded = 'false'``.
    The caller is responsible for committing.
    """
    page = await fetch_html(url=url, session=session)
    if not page:
        return

    # Marker of the link to the complete single-page view of the article.
    if 'komplettansicht" data-ct-label="all"' in str(page):
        page = await fetch_html(url=url + "/komplettansicht", session=session)
        if not page:
            return

    fields = parse_article(page)
    update_sql = """
                UPDATE articles SET
                    title = ?,
                    text = ?,
                    authors = ?,
                    publishing_date = ?,
                    tags = ?,
                    downloaded = ?
                WHERE article_id = ?
                """
    values = (
        fields["title"],
        fields["text"],
        fields["authors"],
        fields["publishing_date"],
        fields["tags"],
        "true",
        article_id,
    )
    await db.execute(update_sql, values)


async def download_and_parse_text_of_all_article_urls(articles_db):
    """Download and parse every article that is still marked as not downloaded.

    All rows of ``articles`` with ``downloaded = 'false'`` are fetched
    concurrently (at most 30 connections). The results are committed once
    after all downloads have finished. An unexpected error in a single
    download is printed and does not prevent the others from being committed.
    """
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=30),
        cookie_jar=aiohttp.CookieJar(),
        headers={"User-Agent": "python-requests/2.21.0"},
        timeout=aiohttp.ClientTimeout(total=None),
    ) as session:

        # Run initial request to set a cookie. Only the cookie is needed, so
        # the response is released right away.
        async with session.request(method="GET", url="https://www.zeit.de"):
            pass

        async with aiosqlite.connect(articles_db) as db:
            db.row_factory = aiosqlite.Row
            # Select article_id by name: the rows are read with
            # row["article_id"] below, which should not depend on how SQLite
            # names a `rowid` result column.
            cursor = await db.execute(
                "SELECT article_id, url FROM articles WHERE downloaded = 'false'"
            )
            pending = await cursor.fetchall()
            tasks = [
                download_and_parse_article_url(
                    url=row["url"],
                    article_id=row["article_id"],
                    session=session,
                    db=db
                )
                for row in pending
            ]
            for finished in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks)):
                try:
                    await finished
                except Exception:
                    # Errors that fetch_html does not handle (e.g. a body
                    # that cannot be decoded) must not discard the work of
                    # all other downloads; the row stays 'false' and can be
                    # retried by re-running the cell.
                    traceback.print_exc()
            await db.commit()

In [None]:
# Download all pending articles on the notebook's event loop and report the
# wall-clock time. The cell can be re-run: only rows still marked
# downloaded = 'false' are requested.
start_time = time.time()
loop = asyncio.get_event_loop()
result = loop.run_until_complete(download_and_parse_text_of_all_article_urls(articles_db=article_db))
duration = time.time() - start_time
print(f"\nDownloaded sites in {duration:.2f} seconds")

Depending on the time of the day and the capacity of the server, we might trigger some 503 (or 104) errors, meaning the server ran out of resources to fulfill our request. However, we can just rerun the cell above to redownload those that failed in the first run. There also might be some 404 errors indicating dead links. 

In [None]:
# Count downloaded and pending URLs. COUNT(*) avoids loading every row,
# including the full article text, into memory just to count.
db = sqlite3.connect(str(article_db))
try:
    cursor = db.cursor()
    num_downloaded = cursor.execute(
        "SELECT COUNT(*) FROM articles WHERE downloaded = 'true'"
    ).fetchone()[0]
    num_not_downloaded = cursor.execute(
        "SELECT COUNT(*) FROM articles WHERE downloaded = 'false'"
    ).fetchone()[0]
finally:
    db.close()

print(f"URLs downloaded: {num_downloaded}")
print(f"URLs not downloaded: {num_not_downloaded}")

In [None]:
db = sqlite3.connect(str(article_db))
cursor = db.cursor()

# COUNT(*) avoids loading all article texts into memory just to count rows.
num_rows_before = cursor.execute("SELECT COUNT(*) FROM articles").fetchone()[0]

# Delete everything that has not been downloaded at this point
# (probably only dead URLs are left):
cursor.execute("DELETE FROM articles WHERE downloaded = 'false'")

# Delete articles without text:
cursor.execute("DELETE FROM articles WHERE downloaded = 'true' AND text = '' ")

# Delete articles in English (they carry a link to the German version):
cursor.execute("DELETE FROM articles WHERE text LIKE '%Lesen Sie diesen Text auf Deutsch%'")

db.commit()

num_rows_after = cursor.execute("SELECT COUNT(*) FROM articles").fetchone()[0]
db.close()

print(num_rows_before)
print(num_rows_after)

#3 Topic Modelling using Latent Dirichlet Allocation

##3.1 Preparations:

In [None]:
!python -m spacy download de_core_news_sm
nltk.download('stopwords')

In [None]:
import de_core_news_sm
nlp = de_core_news_sm.load()

In [None]:
# Let's download some extra stopwords:
!wget https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt

In [None]:
# Combine stopwords from spacy and nltk and solariz:
stopwords_solariz = set()
# The list contains German words with umlauts; state the encoding explicitly
# so that reading does not depend on the platform default.
# NOTE(review): assumes the downloaded file is UTF-8 encoded - confirm.
with open('german_stopwords_full.txt', encoding='utf-8') as f:
    for line in f:
        word = line.strip()
        # Lines starting with ';' are skipped (presumably comments of the
        # list); blank lines would otherwise add the empty string.
        if word and not word.startswith(';'):
            stopwords_solariz.add(word)

stopwords_spacy = spacy.lang.de.STOP_WORDS

stopwords_nltk = nltk.corpus.stopwords.words('german')

# Additional words chosen for this corpus:
own_stopwords = {
    'hauptsache',
    'jetzig',
    'mittlerweile',
    'freilich',
    'zuvor',
    'fortan',
    'vorab',
    'einzig',
    'bloß',
    'worauf',
}

stopwords = stopwords_spacy | set(stopwords_nltk) | stopwords_solariz | own_stopwords

## 3.2 Data preprocessing:

Data preprocessing steps:

1. remove newline characters and multiple consecutive whitespaces
2. remove quotation marks
3. remove punctuation
4. remove numerals
5. lowercase
6. tokenization
7. remove stopwords
8. Lemmatization
9. bigram and trigram collocation detection

In [None]:
# Add new column to table. The column list is checked first so that the
# cell can be re-run: ALTER TABLE ... ADD fails with "duplicate column name"
# when the column already exists.
db = sqlite3.connect(str(article_db))
try:
    # PRAGMA table_info yields one row per column; index 1 is the column name.
    existing_columns = {
        column[1] for column in db.execute("PRAGMA table_info(articles)")
    }
    if "preprocessed_text" not in existing_columns:
        db.execute("ALTER TABLE articles ADD preprocessed_text TEXT")
        db.commit()
finally:
    db.close()

In [None]:
def preprocess_text(text, stopwords):
    """Turn raw article text into a list of lemmas.

    The text is tokenized with gensim's ``simple_preprocess`` (steps 1.-6. of
    the list above), tokens contained in ``stopwords`` are dropped, and the
    remaining tokens are run through the spaCy pipeline ``nlp``. Only the
    lemmas of tokens tagged as noun, adjective, verb or adverb are returned.
    """
    kept_tokens = [
        token
        for token in gensim.utils.simple_preprocess(text, deacc=False)
        if token not in stopwords
    ]

    # The pipeline works on a string, so the tokens are re-joined first.
    parsed = nlp(" ".join(kept_tokens))

    content_pos_tags = {'NOUN', 'ADJ', 'VERB', 'ADV'}
    return [
        token.lemma_
        for token in parsed
        if token.pos_ in content_pos_tags
    ]

In [None]:
# Preprocess the text of every article and store the result, tokens joined
# by single spaces, in the preprocessed_text column.
db = sqlite3.connect(str(article_db))
db.row_factory = sqlite3.Row
cursor_iter = db.cursor()
cursor_writer = db.cursor()
# COUNT(*) gives the progress-bar total without materialising the whole
# table (including all article texts) in memory.
num_rows = cursor_iter.execute("SELECT COUNT(*) FROM articles").fetchone()[0]
# Only the two columns that are actually used are read.
for row in tqdm.tqdm(
    cursor_iter.execute("SELECT article_id, text FROM articles"),
    total=num_rows
):
    preprocessed_text = preprocess_text(row["text"], stopwords)
    data = (" ".join(preprocessed_text), row["article_id"],)
    cursor_writer.execute(
        "UPDATE articles SET preprocessed_text = ? WHERE article_id = ?", data
    )

db.commit()
db.close()

In [None]:
def iterate_over_preprocessed_documents(article_db):
    """Yield the token list of each article's preprocessed text.

    Opens the SQLite database at ``article_db``, reads the
    ``preprocessed_text`` column of every row in ``articles`` and yields it
    split on single spaces. Every call opens its own connection, so the
    documents can be streamed several times.
    """
    db = sqlite3.connect(str(article_db))
    try:
        # Only the needed column is read, not the full article rows.
        for (preprocessed_text,) in db.execute(
            "SELECT preprocessed_text FROM articles"
        ):
            yield preprocessed_text.split(" ")
    finally:
        # Also runs when the consumer abandons the generator early or a row
        # raises, so the connection is never left open.
        db.close()

Bigram and Trigram collocation detection:

In [None]:
# Learn bigram collocations from the preprocessed documents. The documents
# are streamed from the database by the generator.
bigram = gensim.models.Phrases(
    iterate_over_preprocessed_documents(article_db),
    min_count=10,
    threshold=0.6,
    scoring="npmi"
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Learn a second phrase model on the bigram-transformed documents, so that a
# detected bigram can be combined with a further token.
trigram = gensim.models.Phrases(
    bigram_mod[iterate_over_preprocessed_documents(article_db)],
    min_count=10,
    threshold=0.8,
    scoring="npmi"
)  
trigram_mod = gensim.models.phrases.Phraser(trigram)

Mark bigrams and trigrams:

In [None]:
# Apply the bigram and trigram models to every document and write the
# result back to the preprocessed_text column.
db = sqlite3.connect(str(article_db))
db.row_factory = sqlite3.Row
cursor_iter = db.cursor()
cursor_writer = db.cursor()
# COUNT(*) gives the progress-bar total without materialising the whole
# table in memory.
num_rows = cursor_iter.execute("SELECT COUNT(*) FROM articles").fetchone()[0]
# Only the two columns that are actually used are read.
for row in tqdm.tqdm(
    cursor_iter.execute("SELECT article_id, preprocessed_text FROM articles"),
    total=num_rows
):
    text = row['preprocessed_text'].split(" ")
    trigrammed_text = trigram_mod[bigram_mod[text]]
    data = (" ".join(trigrammed_text), row['article_id'], )
    cursor_writer.execute(
        "UPDATE articles SET preprocessed_text = ? WHERE article_id = ?",
        data
    )

db.commit()
db.close()

Load some data from database into pandas table:

In [None]:
db = sqlite3.connect(str(article_db))
df = pd.read_sql('SELECT title, preprocessed_text, publishing_date FROM articles', con=db)
db.close()

In [None]:
# Drop rows without title and publishing date:
df = df.drop(df[(df['title'] == '') | (df['publishing_date'] == '')].index)

In [None]:
df['preprocessed_text'] = df['preprocessed_text'].str.split(' ')

In [None]:
df['publishing_date'] = pd.to_datetime(df['publishing_date'])

Create Dictionary

In [None]:
dictionary = gensim.corpora.Dictionary(df['preprocessed_text'])

To improve the overall quality of the model, we can remove words that only occur a few times and words that occur in almost all documents. Let's look at those words first:

In [None]:
# Words that occur in fewer than 10 documents. Document frequencies (dfs)
# are used because that is what filter_extremes(no_below=10) acts on: it
# keeps tokens contained in at least 10 documents.
for token_id, n_docs in dictionary.dfs.items():
    if n_docs < 10:
        print(dictionary[token_id])

In [None]:
# Words that occur in more than half of all documents. The comparison is
# strict so that a word found in exactly half of the documents is not listed.
ratio = int(0.5 * df.shape[0])
for token_id, n_docs in dictionary.dfs.items():
    if n_docs > ratio:
        print(dictionary[token_id])

In [None]:
# Remove words that appear in fewer than 10 documents and words that appear
# in more than 50% of all documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)

In [None]:
# Create corpus:
corpus = [dictionary.doc2bow(text) for text in df['preprocessed_text']]

In [None]:
# Human readable format of corpus (term-frequency)
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[0:1]]

In [None]:
print(f"Number of unique tokens: {len(dictionary)}")
print(f"Number of documents: {df.shape[0]}")

In [None]:
%matplotlib inline
# Histogram of the number of distinct dictionary words per document.
plt.hist([len(text) for text in corpus], bins=200)
plt.ylabel('Number of Documents')
plt.xlabel('Number of Words (after preprocessing)')
# Save before show(): once the inline backend has displayed the figure it is
# closed, and a later savefig() would write an empty image.
plt.savefig(plots_dir / "n_words_histrogram.png")
plt.show()

## 3.3 Training the LDA model:

In [None]:
# Everything gensim logs during training goes to lda_model.log; a later cell
# parses the per-epoch scores out of that file.
logging.basicConfig(filename='lda_model.log', level=logging.DEBUG)

# Log the perplexity and coherence score at the end of each epoch:
perplexity_logger = gensim.models.callbacks.PerplexityMetric(
    corpus=corpus,
    logger="shell"
)
coherence_logger = gensim.models.callbacks.CoherenceMetric(
    corpus=corpus,
    texts=df['preprocessed_text'],
    coherence="c_v",
    logger="shell"
)

# NOTE(review): the class is reached through the ldamulticore module, but it
# is named LdaModel, not LdaMulticore - presumably the single-process
# implementation; confirm this is intended.
lda_model = gensim.models.ldamulticore.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20, 
    random_state=100,
    chunksize=1500,
    passes=20,
    alpha="auto",
    per_word_topics=True,
    iterations=2000,
    callbacks=[coherence_logger, perplexity_logger]
)

# Persist the trained model so that it can be reloaded without retraining.
lda_model.save(str(models_dir / "lda_model"))

In [None]:
# Parse logs for coherence score, plot over epochs:
with open('lda_model.log', 'r') as f:
    log = f.read()

coherence_score = re.findall(
    r'INFO:gensim\.models\.ldamodel:Epoch (\d+): Coherence estimate: (\d+\.\d+)',
    log
)

perplexity_score = re.findall(
    r'INFO:gensim\.models\.ldamodel:Epoch (\d+): Perplexity estimate: (\d+\.\d+)',
    log
)

In [None]:
# Coherence score vs epochs:
ax = plt.figure().gca()
plt.plot([int(x[0]) for x in coherence_score], [float(x[1]) for x in coherence_score])
plt.grid(True)
plt.ylabel('Coherence Score')
plt.xlabel('Epoch')
# Epochs are whole numbers, so only integer ticks are wanted on the x axis.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
# Save before show(): once the inline backend has displayed the figure it is
# closed, and a later savefig() would write an empty image.
plt.savefig(plots_dir / "coherence_score_vs_epochs.png")
plt.show()

In [None]:
# Perplexity score vs epochs:
ax = plt.figure().gca()
plt.plot([int(x[0]) for x in perplexity_score], [float(x[1]) for x in perplexity_score])
plt.grid(True)
plt.ylabel('Perplexity Score')
plt.xlabel('Epoch')
# Epochs are whole numbers, so only integer ticks are wanted on the x axis.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
# Save before show(): once the inline backend has displayed the figure it is
# closed, and a later savefig() would write an empty image.
plt.savefig(plots_dir / "perplexity_score_vs_epochs.png")
plt.show()

Alternatively, we could load a previously trained model:

In [None]:
lda_model = gensim.models.ldamulticore.LdaModel.load(str(models_dir / "lda_model"))

In [None]:
# Compute Perplexity
print(f"Perplexity: {lda_model.log_perplexity(corpus):.3f}")

# Compute Coherence Score
coherence_model_lda = gensim.models.CoherenceModel(
    model=lda_model,
    texts=df['preprocessed_text'],
    dictionary=dictionary,
    coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda:.3f}")

## 3.4 Training the LDA model with mallet:

Mallet is a different LDA implementation that uses Gibbs sampling, which is a bit more accurate:


In [None]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip -P "$EXTRA_LIBRARIES"
!unzip -a "$EXTRA_LIBRARIES/mallet-2.0.8.zip" -d "$EXTRA_LIBRARIES/mallet-2.0.8"

In [None]:
# Path of the Mallet launcher inside the directory the archive was
# extracted to.
# NOTE(review): the archive is unzipped into ".../mallet-2.0.8"; if the zip
# itself contains a top-level "mallet-2.0.8" folder, the binary ends up one
# level deeper than this path - verify against the extracted files.
mallet_path = extra_libraries_dir / "mallet-2.0.8" / "bin" / "mallet"
# Train with the same number of topics and iterations as the gensim model.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path=str(mallet_path),
    corpus=corpus,
    num_topics=20,
    id2word=dictionary,
    iterations=2000,
)
# Transform mallet model to gensim compatible model: 
# (requires gensim > 3.7.0, before it will work, but has some bugs)
lda_mallet_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
lda_mallet_model.save(str(models_dir / "lda_mallet_model"))

Alternatively, we could load a previously trained model:

In [None]:
lda_mallet_model = gensim.models.LdaModel.load(str(models_dir / "lda_mallet_model"))

In [None]:
# Compute Coherence Score
coherence_model_lda_mallet = gensim.models.CoherenceModel(
    model=lda_mallet_model,
    texts=df['preprocessed_text'],
    dictionary=dictionary,
    coherence="c_v"
)
coherence_lda_mallet = coherence_model_lda_mallet.get_coherence()
print(f"Coherence Score: {coherence_lda_mallet:.3f}.")

## 3.5 Compare both models

Display topics and keywords:

In [None]:
# LDA model (trained with Gensim)
pprint(lda_model.show_topics(20, 30))

In [None]:
# Lda mallet model (trained with mallet via Gensim wrapper)
pprint(lda_mallet_model.show_topics(20, 30))

Find most representative article for each topic:

In [None]:
def find_most_representative_article_for_each_topic(model, bow_corpus=None):
    """Print, for every topic, the document in which its weight is highest.

    Args:
        model: Topic model that can be indexed with a bag-of-words corpus and
            then yields the topic distribution of each document.
        bow_corpus: Corpus to evaluate. Defaults to the notebook-level
            ``corpus``.

    A topic that has no weight above zero in any document is not listed.
    """
    documents = corpus if bow_corpus is None else bow_corpus

    # topic number -> (highest weight seen so far, index of that document)
    best_by_topic = {}

    for document_index, topic_results in enumerate(model[documents]):
        # A model trained with per_word_topics=True returns a tuple whose
        # first element is the list of (topic, weight) pairs. Checking the
        # type instead of `len(...) == 3` keeps a plain list that happens to
        # hold exactly three topics from being mistaken for that tuple.
        if isinstance(topic_results, tuple):
            topic_results = topic_results[0]
        for topic_number, topic_weight in topic_results:
            best_weight = best_by_topic.get(topic_number, (0, None))[0]
            if topic_weight > best_weight:
                best_by_topic[topic_number] = (topic_weight, document_index)

    for topic_num, (weight, document_index) in sorted(best_by_topic.items()):
        # The weight is a fraction between 0 and 1; the % format converts it
        # to an actual percentage.
        print(f"Topic Number {topic_num}: Most representative article {document_index} with {weight:.2%}")

In [None]:
# LDA model (trained with Gensim)
find_most_representative_article_for_each_topic(lda_model)

In [None]:
# Lda mallet model (trained with mallet via Gensim wrapper)
find_most_representative_article_for_each_topic(lda_mallet_model)

#3 LDA Visualization

First, let's calculate visually distinct colors for each topic:

In [None]:
num_topics = 20

In [None]:
!git clone https://github.com/taketwo/glasbey "$EXTRA_LIBRARIES/glasbey"

In [None]:
sys.path.append(str(extra_libraries_dir / 'glasbey'))
from glasbey import Glasbey
gb = Glasbey()
p = gb.generate_palette(size=num_topics + 1)
rgb = gb.convert_palette_to_rgb(p)
# Transform to hex colors:
colors = ['#%02x%02x%02x' % color for color in rgb]
# Exclude 1st color since it is white
colors = colors[1:]

In [None]:
sys.path

##3.1 Word clouds

In [None]:
# NOTE(review): both models above are trained with num_topics=20 and the
# colour palette is built for 20 topics; with 19 here the word clouds cover
# one topic fewer than the models contain - confirm this is intended.
num_topics = 19

In [None]:
def generate_word_clouds(model, num_topics, result_filepath):
    """Draw one word cloud per topic on a four-column grid and save the figure.

    Args:
        model: Topic model providing ``show_topics``.
        num_topics: Number of topics to request from the model; also
            determines the number of grid rows.
        result_filepath: Where the figure is written.

    Each cloud is titled and coloured by the topic id reported by the model,
    using the notebook-level ``colors`` palette.
    """
    topics = model.show_topics(num_topics=num_topics, num_words=20, formatted=False)

    # The colour function reads this holder, so all words of the cloud that
    # is currently being generated get the colour of its topic.
    active_color = {"value": None}
    cloud = WordCloud(
        background_color="white",
        width=1000,
        height=1000,
        max_words=15,
        colormap="tab10",
        color_func=lambda *args, **kwargs: active_color["value"],
        prefer_horizontal=1.0
    )

    n_rows = math.ceil(num_topics / 4)
    fig, axes = plt.subplots(n_rows, 4, figsize=(20, n_rows * 5), sharex=True, sharey=True)

    for position, ax in enumerate(axes.flatten()):
        # Grid cells beyond the returned topics stay empty.
        if position < len(topics):
            # show_topics returns (topic id, word weights) pairs. When fewer
            # topics are requested than the model has, the returned subset is
            # not guaranteed to be topics 0..n-1 in order, so the grid
            # position must not be used as the topic number.
            topic_id, word_weights = topics[position]
            active_color["value"] = colors[topic_id]
            cloud.generate_from_frequencies(dict(word_weights), max_font_size=150)
            ax.imshow(cloud)
            ax.set_title(f"Topic {topic_id}", fontdict=dict(size=25))
        ax.axis("off")

    plt.tight_layout(h_pad=10, w_pad=10)

    plt.savefig(result_filepath)

In [None]:
generate_word_clouds(
    model=lda_mallet_model,
    num_topics=num_topics,
    result_filepath=plots_dir / 'word_cloud_lda_mallet.png'
)

In [None]:
generate_word_clouds(
    model=lda_model,
    num_topics=num_topics,
    result_filepath=plots_dir / 'word_cloud_lda.png'
)

##3.2 pyLDAvis


In [None]:
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

LDA model:

In [None]:
p_lda = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(p_lda, 'pyLDAvis_lda.html')

LDA mallet model:

In [None]:
p_mallet = pyLDAvis.gensim.prepare(lda_mallet_model, corpus, dictionary)
pyLDAvis.save_html(p_mallet, 'pyLDAvis_lda_mallet.html')

##3.3 t-SNE

In [None]:
model = lda_model
#model = lda_mallet_model

In [None]:
# Build a (documents x topics) matrix of topic weights; topics a document
# has no reported weight for stay at 0.
# The row width comes from the model itself: the notebook-level num_topics
# may be smaller than the number of topics the model was trained with, which
# would make the assignment below fail for the highest topic numbers.
n_model_topics = model.num_topics
array = []
for topic_results in model[corpus]:
    # A model trained with per_word_topics=True returns a tuple whose first
    # element is the list of (topic, weight) pairs. Checking the type instead
    # of `len(...) == 3` keeps a plain list of exactly three topics from
    # being mistaken for that tuple.
    if isinstance(topic_results, tuple):
        topic_results = topic_results[0]
    topic_weights = [0] * n_model_topics
    for topic_n, topic_weight in topic_results:
        topic_weights[topic_n] = topic_weight
    array.append(topic_weights)

topic_weights_array = np.array(array)

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(
    n_components=2,
    random_state=0,
    perplexity=20,
    early_exaggeration=120,
)
embedding = tsne.fit_transform(topic_weights_array)

In [None]:
df["x"], df["y"] = embedding[:,0], embedding[:,1]
df['dominant_topic'] = topic_weights_array.argmax(axis=1)

In [None]:
from sklearn.manifold import TSNE

from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.models.widgets import DateRangeSlider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()

# One entry per document: t-SNE position, colour of the dominant topic,
# title and publishing date, plus the glyph alpha/size that the slider
# callback modifies.
source = ColumnDataSource(
        data=dict(
            x = df.x,
            y = df.y,
            colors = [colors[i] for i in df['dominant_topic']],
            title = df.title,
            day = df['publishing_date'],
            day_humanreadable = [x.strftime("%d.%m.%Y") for x in df.publishing_date],
            alpha = [0.9] * df.shape[0],
            size = [7] * df.shape[0]
        )
    )

# Tooltip with title and date; restricted to the glyph renderer named "df".
hover_tsne = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Title:</span>
            <span style="font-size: 12px">@title (@day_humanreadable)</span>
        </div>
    </div>
    """)

tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(
    plot_width=1000,
    plot_height=700,
    tools=tools_tsne,
    title='t-SNE Visualization'
)
# The t-SNE axes have no interpretable unit, so the tick labels are hidden.
plot_tsne.xaxis.major_label_text_color = None
plot_tsne.yaxis.major_label_text_color = None

plot_tsne.circle(
    x='x',
    y='y',
    size='size',
    fill_color='colors', 
    alpha='alpha',
    line_alpha=0,
    line_width=0.01,
    source=source,
    name="df"
)

# Slider callback: documents published inside the selected date range are
# drawn opaque, all others are faded out and shrunk.
# Every JavaScript variable is declared with `var`; assigning to undeclared
# names creates implicit globals and raises a ReferenceError when the code
# is executed in strict mode.
callback = CustomJS(
    args=dict(source=source),
    code="""
        var data = source.data;
        var start_date = cb_obj.value[0]
        var end_date = cb_obj.value[1]
        var x = data['x']
        var y = data['y']
        var colors = data['colors']
        var alpha = data['alpha']
        var title = data['title']
        var day = data['day']
        var size = data['size']
        for (var i = 0; i < x.length; i++) {
            if (day[i] <= end_date && day[i] >= start_date) {
                alpha[i] = 0.9
                size[i] = 4
            } else {
                alpha[i] = 0.01
                size[i] = 2
            }
        }
        source.change.emit();
        """
)
date_range_slider = DateRangeSlider(
    title="Date Range: ",
    start=min(df['publishing_date']),                     
    end=max(df['publishing_date']),
    value=(min(df['publishing_date']), max(df['publishing_date'])),
    step=1,
    width=980
)
date_range_slider.js_on_change('value', callback)

layout = column(date_range_slider, plot_tsne)
show(layout)

In [None]:
from bokeh.plotting import output_file, save
output_file(plots_dir / "tsne-lda_mallet.html")
save(layout)

In [None]:
# Create data for Tensorflow's Embedding Projector:

# Tensor: one line per document, topic weights separated by tabs. Joining
# the values avoids a dangling tab at the end of every line.
with open(plots_dir / 'lda_mallet_tensor.tsv', 'w', encoding='utf-8') as f:
    for document_topics in topic_weights_array:
        f.write("\t".join(str(topic_weight) for topic_weight in document_topics))
        f.write("\n")

# Metadata: one line per document, in the same order as the tensor file.
# The titles are German text, so the encoding is stated explicitly instead
# of relying on the platform default.
with open(plots_dir / 'lda_mallet_metadata_title.tsv', 'w', encoding='utf-8') as f:
    for _, row in df.iterrows():
        f.write(f"{row['title'].strip()} ({row['publishing_date'].strftime('%d.%m.%Y')})\n")