# Topic Modelling with Province News Releases

In [None]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL)

# LDA
import gensim
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.phrases import Phrases, Phraser
from gensim.utils import simple_preprocess

# Stopwords
import nltk
from nltk.corpus import stopwords

# Regex
import re

# Lemmatization
import spacy
import fr_core_news_sm

# Printing model topics
from pprint import pprint

# Model Visualization
import pyLDAvis
import pyLDAvis.gensim

# For plotting coherence values
import matplotlib.pyplot as plt

# From source_scraping.py
from source_scraping import load_province

# Utils
import numpy as np
import pandas as pd

## Preprocessing Text

In order to prepare the articles for our LDA model, they are split into individual words, stripped of "stop words" that contribute little information, and lemmatized (each word is replaced by its root; “swimming” becomes “swim”, for instance).

In [None]:
def remove_stopwords(texts, stop_words):
    """Drop every stop word from each tokenized document.

    Args:
        texts: Iterable of documents, each an iterable of word tokens.
        stop_words: Iterable of words to discard. It is consumed exactly
            once, so lists, sets and one-shot iterators all work.

    Returns:
        A list holding one list of surviving tokens per document, with the
        original document and token order preserved.
    """
    # Build the lookup a single time: testing membership directly against a
    # list is linear per token, and against an iterator it is simply wrong
    # after the first probe has exhausted it.
    excluded = frozenset(stop_words)
    return [[word for word in doc if word not in excluded] for doc in texts]

def clean_(doc):
    """Collapse each run of whitespace inside every token of ``doc`` to one space.

    Returns a new list; ``doc`` itself is not modified.
    """
    collapsed = []
    for token in doc:
        collapsed.append(re.sub(r'\s+', ' ', token))
    # More filters if necessary
    return collapsed

def clean(texts):
    """Run ``clean_`` over every document in ``texts`` and return the results as a list."""
    cleaned = []
    for doc in texts:
        cleaned.append(clean_(doc))
    return cleaned

def texts_to_words(corpus):
    """Lazily tokenize each raw document in ``corpus``.

    Yields the result of gensim's ``simple_preprocess`` (called with
    ``deacc=True``) for one document at a time, in corpus order.
    """
    yield from (gensim.utils.simple_preprocess(doc, deacc=True) for doc in corpus)

def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB'], lang='english'):
    """Replace tokens by their lemmas, keeping only the requested parts of speech.

    Each document is re-joined with single spaces and run through a spaCy
    pipeline loaded with the parser and NER components disabled. The English
    pipeline is used when ``lang == 'english'``; any other value selects the
    French one.

    Args:
        texts: Iterable of tokenized documents (iterables of strings).
        allowed_postags: Coarse POS tags (``token.pos_``) a token must carry
            to be kept.
        lang: ``'english'`` or anything else for French.

    Returns:
        A list with one list of lemmas per document.
    """
    if lang == 'english':
        nlp = spacy.load('en', disable=['parser', 'ner'])
    else:
        # French for Quebec
        nlp = fr_core_news_sm.load(disable=['parser', 'ner'])

    lemmatized = []
    for doc in texts:
        parsed = nlp(" ".join(doc))
        lemmatized.append([token.lemma_ for token in parsed if token.pos_ in allowed_postags])
    return lemmatized

def make_bigrams(texts):
    """Apply a bigram phrase model, fitted on ``texts``, to every document.

    A gensim ``Phrases`` model is trained on ``texts`` (``min_count=5``,
    ``threshold=100``), wrapped in a ``Phraser``, and then used to transform
    each document. ``texts`` is traversed twice, so it must be re-iterable.

    Returns:
        A list with one transformed document per input document.
    """
    phrase_model = Phrases(texts, min_count=5, threshold=100)
    phraser = Phraser(phrase_model)
    merged = []
    for doc in texts:
        merged.append(phraser[doc])
    return merged
    
# def make_trigrams(texts):
#     return make_bigrams(make_bigrams(texts)) I'm not sure if this one behaves correctly

def custom_preprocess(texts, stop_words, allowed_postags, bigrams=True, lang='english'):
    """Turn raw documents into lemmatized token lists ready for LDA.

    Steps, in order: tokenize (``texts_to_words``), normalize whitespace
    (``clean``), remove stop words (``remove_stopwords``), optionally apply
    bigram detection (``make_bigrams``), and lemmatize (``lemmatize``).

    Args:
        texts: Iterable of raw document strings.
        stop_words: Iterable of words to discard; ``None`` or empty disables
            stop-word removal.
        allowed_postags: POS tags forwarded to ``lemmatize``.
        bigrams: Whether to run ``make_bigrams`` before lemmatizing.
        lang: Language selector forwarded to ``lemmatize``.

    Returns:
        Whatever ``lemmatize`` returns: one list of lemmas per document.
    """
    tokenized = clean(list(texts_to_words(texts)))

    # texts_to_words tokenizes with deacc=True, so tokens carry no accents.
    # Strip accents from the stop words as well, otherwise accented entries
    # (common in French) could never match a token.
    normalized_stops = {gensim.utils.deaccent(word) for word in (stop_words or ())}
    filtered = remove_stopwords(tokenized, normalized_stops)

    if bigrams:
        filtered = make_bigrams(filtered)
    return lemmatize(filtered, allowed_postags=allowed_postags, lang=lang)

def dict_corpus(texts):
    """Build the gensim dictionary and bag-of-words corpus for ``texts``.

    Args:
        texts: Re-iterable collection of tokenized documents (it is
            traversed once for the dictionary and once for the corpus).

    Returns:
        ``(id2word, corpus)``: the ``corpora.Dictionary`` built from
        ``texts`` and a list holding ``id2word.doc2bow(text)`` for each
        document.
    """
    id2word = corpora.Dictionary(texts)
    # Same documents as `texts`, each re-expressed through the dictionary's doc2bow encoding.
    corpus = list(map(id2word.doc2bow, texts))
    return id2word, corpus

## Finding a Good Topic Number

In order to create a model that produces understandable topics, the number of topics should be tweaked so as to produce topics that are coherent. The following function tries various values for the topic number (specified by the <code>n_topic_range</code> parameter) and outputs the best model, a list of all the models, and a list of all the $C_v$ coherence values.

The <code>plot</code> parameter specifies whether or not the function should also graph the coherence values against the number of topics for each model.

In [None]:
def find_best_model(n_topic_range, texts, id2word, corpus, random_state=42, plot=True, verbose=False):
    """Train one LDA model per candidate topic count and pick the most coherent.

    Args:
        n_topic_range: Iterable of topic counts to try (any iterable; it is
            materialized once).
        texts: Tokenized documents, passed to ``CoherenceModel``.
        id2word: Dictionary passed to both ``LdaModel`` and ``CoherenceModel``.
        corpus: Bag-of-words corpus passed to ``LdaModel``.
        random_state: Seed forwarded to every ``LdaModel``.
        plot: If true, plot coherence against topic count and show the figure.
        verbose: If true, print the percentage of candidates completed after
            each model has been trained and scored.

    Returns:
        ``(best_model, models, coherence_vals)``: the model with the highest
        ``c_v`` coherence (first one on ties), every trained model, and the
        coherence value of each, both in candidate order.

    Raises:
        ValueError: If ``n_topic_range`` is empty.
    """
    # Materialize once so a one-shot iterable survives the length check,
    # the training loop and the plot.
    topic_counts = list(n_topic_range)
    if not topic_counts:
        raise ValueError("n_topic_range must contain at least one topic count")
    total = len(topic_counts)

    trained_models = []
    coherence_vals = []

    for done, n_topics in enumerate(topic_counts, start=1):
        candidate = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=n_topics,
                             random_state=random_state,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True
                             )
        co_model = CoherenceModel(candidate, texts=texts, dictionary=id2word, coherence="c_v")

        trained_models.append(candidate)
        coherence_vals.append(co_model.get_coherence())

        # Report progress after the work is done, based on position rather
        # than on the topic-count values: this cannot divide by zero for a
        # single candidate and never claims 100% before the last model exists.
        if verbose:
            print(str(int(1000 * done / total) / 10) + "% done")

    if plot:
        plt.plot(topic_counts, coherence_vals, 'b')
        plt.show()

    return trained_models[np.argmax(coherence_vals)], trained_models, coherence_vals

## Model Visualization

The <code>pyLDAvis</code> library provides neat visualizations of LDA models.

In [None]:
def visualize_model(model, corpus, id2word):
    """Render the pyLDAvis visualization for an LDA model.

    Enables pyLDAvis notebook mode, then prepares the visualization data from
    ``model``, ``corpus`` and ``id2word`` and hands it to ``pyLDAvis.show``.
    Returns nothing.
    """
    pyLDAvis.enable_notebook()
    pyLDAvis.show(pyLDAvis.gensim.prepare(model, corpus, id2word))

# Full LDA Pipeline

All together now!

In [None]:
def lda_model(province, doc_attrib='source_full_text', bigram=True, allowed_postags=['NOUN', 'VERB', 'ADJ'], n_topic_range=range(2, 40, 3), plot=True, random_state=42, verbose=False, vis=True):
    """Run the full LDA pipeline for one province's news releases.

    Loads the documents, preprocesses them, builds the dictionary and corpus,
    searches ``n_topic_range`` for the most coherent model, and optionally
    visualizes it.

    Args:
        province: Province name; compared case-insensitively. ``'quebec'``
            selects French processing, anything else English.
        doc_attrib: Key used to pull the documents out of the object returned
            by ``load_province`` (presumably a column name; verify against
            ``source_scraping``).
        bigram: Whether bigram detection is applied during preprocessing.
        allowed_postags: POS tags kept by lemmatization.
        n_topic_range: Candidate topic counts passed to ``find_best_model``.
        plot: Whether ``find_best_model`` plots the coherence values.
        random_state: Seed passed to ``find_best_model``.
        verbose: Whether to print progress messages.
        vis: Whether to call ``visualize_model`` on the best model.

    Returns:
        ``(model, model_list, co_vals)`` exactly as returned by
        ``find_best_model``.
    """
    province = province.lower()
    lang = 'french' if province == 'quebec' else 'english'

    texts = load_province(province, verbose)[doc_attrib]

    if verbose:
        print("\nPreprocessing Texts\n")
    # Forward the caller's `bigram` choice; it used to be overridden by a
    # hard-coded True, which made the parameter a no-op.
    texts = custom_preprocess(texts, stopwords.words(lang), allowed_postags=allowed_postags, bigrams=bigram, lang=lang)

    id2word, corpus = dict_corpus(texts)

    if verbose:
        print("\nFinding Best n_topics Values\n")
    model, model_list, co_vals = find_best_model(n_topic_range, texts, id2word, corpus, random_state, plot, verbose)

    if vis:
        visualize_model(model, corpus, id2word)

    return model, model_list, co_vals

In [None]:
%%time
    
# Run the whole pipeline on the Quebec releases (the French-language path) with
# progress output; only the best model is kept, the model list and coherence
# values are discarded.
q_model, _, _ = lda_model('quebec', verbose=True) # Achieves coherence score of 0.60

Note that while a warning will suggest using <code>pyLDAvis.display()</code> instead of <code>pyLDAvis.show()</code>, it doesn't seem to work. Click [here](https://github.com/bmabey/pyLDAvis/issues/101) for more info.