In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

import warnings

import spacy
from spacy_lefff import LefffLemmatizer, POSTagger

import gensim
import gensim.corpora as corpora
from pprint import pprint
from gensim.models import CoherenceModel
import os

from matplotlib import pyplot as plt
%matplotlib inline

import gensim
from gensim.utils import simple_preprocess

import os
from gensim.models.wrappers import LdaMallet


# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns



from threading import Thread, RLock
from wordcloud import WordCloud


import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

# Importing data

In [None]:
%%time
# Load the article corpus from a local pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
article_df = pd.read_pickle("data//articles")
article_df.head()

In [None]:
article_df.shape

In [None]:
%%time
# Drop duplicated rows (no column subset is given) and show the resulting shape.
article_df = article_df.drop_duplicates()
article_df.shape

# Randomly selecting 50,000 articles to reduce the computation time of the LDA process

In [None]:
# Seed NumPy's global generator so that the random sample below is reproducible.
np.random.seed(1)

# Draw up to 50,000 distinct articles. Capping at the corpus size avoids the
# ValueError pandas raises when more rows are requested than exist
# (sampling is done without replacement).
sample_size = min(50000, len(article_df))
df = article_df.sample(sample_size, replace=False).reset_index(drop=True)

# Strip basic punctuation (double quote, comma, period, "!", "?") and lower-case
# the text. The pattern is a raw string so that the backslash reaches the regex
# engine untouched instead of being an invalid string escape sequence.
df["article"] = df["article"].str.replace(r'[",\.!?]', '', regex=True).str.lower()
df.head()

# Text preprocessing

## Tokenizing and stopwords

In [None]:

# Corpus-specific stop words added on top of NLTK's French list.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python (e.g. "sous" "ainsi" becomes "sousainsi").
extra_stopwords = [
    "plus", "cette", "comme", "depuis", "être", "fait", "deux", "entre", "aussi",
    "si", "ans", "tout", "après", "faire", "sans", "bien", "très", "leurs", "où", "dont",
    "contre", "selon", "encore", "moins", "alors", "premier", "peut", "000",
    "tous", "toutes", "années", "année", "mois", "autres", "avant", "après", "avoir", "non", "autre", "peu",
    "un", "deux", "trois", "quatre", "cinq", "sous",
    "ainsi", "fois", "fin", "aujourd", "hui", "ainsi", "déjà", "cela", "dit", "quelques", "toujours", "lors", "faut",
    "jusqu", "plusieurs", "donc", "là", "doit", "celui", "quand", "elles", "cas", "ceux", "va", "cet", "celle", "celles",
    "après", "était", "être", "été", "même", "très", "ca", "dire", "ni",
    "sous", "vers", "ici", "car", "trop", "beaucoup", "grand", "père", "dernier", "devant", "près", "heures", "jour", "jours", "chaque",
]

# Keep a plain list (it is passed to CountVectorizer as `stop_words` below),
# but drop repeated entries while preserving the original order.
french_stopwords = list(dict.fromkeys(stopwords.words('french') + extra_stopwords))


In [None]:
%%time
#Most common words

sns.set_style('whitegrid')
%matplotlib inline
# Helper function
def plot_most_common_words(count_data, count_vectorizer, n=10, save_path="Redaction/pics/most_common_words.png"):
    """Draw a bar chart of the ``n`` most frequent vocabulary terms and save it.

    Parameters
    ----------
    count_data : document-term count matrix
        One row per document and one column per vocabulary term, e.g. the
        result of ``count_vectorizer.fit_transform``. Anything supporting
        ``.sum(axis=0)`` works (SciPy sparse matrix or dense array).
    count_vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``, returning the terms in column order.
    n : int, default 10
        Number of terms to display.
    save_path : str
        Where the figure is written. Missing parent directories are created.

    Returns
    -------
    None. The figure is saved to ``save_path`` and shown.
    """
    # Newer scikit-learn releases only provide get_feature_names_out(); older
    # ones only get_feature_names(). Support both.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        vocabulary = np.asarray(count_vectorizer.get_feature_names_out())
    else:
        vocabulary = np.asarray(count_vectorizer.get_feature_names())

    # Corpus-wide count of every term: one column sum instead of densifying
    # the matrix row by row in a Python loop.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    # Indices of the n largest counts; the stable sort keeps vocabulary order
    # among terms with equal counts.
    top_idx = np.argsort(-total_counts, kind="stable")[:n]
    words = vocabulary[top_idx].tolist()
    counts = total_counts[top_idx]
    x_pos = np.arange(len(words))

    # Set the plotting context before any artist is created.
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    fig, ax = plt.subplots(figsize=(15, 15/1.6180))
    ax.set_title('%s most common words' % n)
    # x and y are passed by keyword: recent seaborn versions reject positional data.
    sns.barplot(x=x_pos, y=counts, palette='husl', ax=ax)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(words, rotation=60)
    ax.set_xlabel('words')
    ax.set_ylabel('counts')

    # Make sure the target directory exists before writing the file.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(save_path)
    plt.show()
    
# Bag-of-words vectorizer that ignores the French stop-word list built above
count_vectorizer = CountVectorizer(stop_words=french_stopwords)
# Learn the vocabulary from the cleaned articles and build the document-term count matrix
count_data = count_vectorizer.fit_transform(df.article)
# Plot (and save to the function's default path) the 40 most common words
plot_most_common_words(count_data, count_vectorizer, n=40)

In [None]:
%%time 

df["preprocessed_article"] = df.article.apply(lambda article: [ word for word in simple_preprocess(article) if word not in french_stopwords])


## Phrase Modeling: Bi-grams and Tri-grams

In [None]:
%%time
# Build the bigram model (only bigrams are trained here; no trigram model is built)
bigram = gensim.models.Phrases(df.preprocessed_article, min_count=5, threshold=100) # higher threshold fewer phrases.

# Object used below to apply the trained phrase model to token lists
# (presumably a lighter, frozen view of `bigram` -- see the gensim docs).
bigram_mod = gensim.models.phrases.Phraser(bigram)


In [None]:
%%time
# Run every tokenised article through the bigram model and store the result
# in a new column, leaving `preprocessed_article` untouched.
df["bigram_article"] = df.preprocessed_article.apply(lambda article: bigram_mod[article])

In [None]:
df.head()

In [None]:
#df.to_pickle("data/sampleLDA_preprocessed_articles")

## Creating word dictionary and corpus (using words and bigrams)

In [None]:
#df = pd.read_pickle("data/sampleLDA_preprocessed_articles")

In [None]:
%%time

# Create Dictionary
id2word = corpora.Dictionary(df.bigram_article)


# Term Document Frequency
corpus = [id2word.doc2bow(article) for article in df.bigram_article]

# Optimal number of topics for LDA

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1, iterations=100, mallet_home=None):
    """
    Train one LDA Mallet model per candidate number of topics and compute its c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to train the models and to score them
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) of the number of topics
    start : First number of topics to try
    step : Increment between two successive numbers of topics
    iterations : Number of iterations passed to each LdaMallet model
    mallet_home : Path of the MALLET installation directory. When None, the
        MALLET_HOME environment variable is used if it is set, otherwise the
        historical hard-coded location of this project.

    Returns:
    -------
    model_list : List of LDA topic models, one per value of range(start, limit, step)
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    if mallet_home is None:
        mallet_home = os.environ.get('MALLET_HOME', r'C:/Users/hgill/Documents/Etudes/DABSA/data/mallet-2.0.8/')
    os.environ.update({'MALLET_HOME': mallet_home})
    # Built with forward slashes so that the default location yields exactly
    # the executable path that was previously hard-coded.
    mallet_path = mallet_home.rstrip('/\\') + '/bin/mallet'

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary received as argument, not a notebook global.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary, iterations=iterations)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
%%time
# Train and score one model for each topic count in range(2, 20, 2),
# i.e. 2, 4, ..., 18, with 100 iterations each.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=df.bigram_article, start=2, limit=20, step=2, iterations=100)


In [None]:
import pickle

# Persist the trained models together with their coherence scores.
file_path = "model_list_coherence_values.pkl"

# 'wb' overwrites the file. In append mode every re-run would add another
# pickle to the end of the file, and a later pickle.load would still return
# the first (stale) object.
with open(file_path, 'wb') as dbfile:
    pickle.dump((model_list, coherence_values), dbfile)

In [None]:
sns.set(style="whitegrid")

# Topic counts on the x axis; these must mirror the start/limit/step values
# given to compute_coherence_values when `coherence_values` was produced.
start, limit, step = 2, 20, 2
x = range(start, limit, step)

# Coherence score as a function of the number of topics
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(x, coherence_values)
ax.set_xlabel("Number of topics")
ax.set_ylabel("Coherence score")
fig.savefig("Redaction/pics/coherence_score.png")
plt.show()

# LDA Mallet Model

In [None]:
%%time

mallet_home = path_to_mallet #you have to replace it by the path to the mallet directory
os.environ.update({'MALLET_HOME':mallet_home}) 
mallet_path = os.path.join(mallet_home, "/bin/mallet")

num_topics = 14
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus , num_topics=num_topics, id2word=id2word, iterations=1000, workers=16)


In [None]:
# Saving the model

file_path = "ldamallet.pkl"

# 'wb' overwrites the file. In append mode every re-run would add another
# pickle to the end of the file, and a later pickle.load would still return
# the first (stale) model.
with open(file_path, 'wb') as dbfile:
    pickle.dump(ldamallet, dbfile)

In [None]:
%%time
# Same number of topics as the one used to train `ldamallet`
num_topics = 14

# Print every topic in unformatted form
pprint(ldamallet.show_topics(num_topics=num_topics, formatted=False))

# Compute the c_v coherence score of the final model on the bigram-merged texts
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=df.bigram_article, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


In [None]:
# Top 100 entries of every topic, one row per topic: (topic id, list of pairs)
aspects = ldamallet.show_topics(num_topics=num_topics, formatted=False, num_words=100)
aspects_df = pd.DataFrame(aspects, columns=["aspects", "words_imp"])

# Split each pair into its first element (word) and second element (importance)
aspects_df["words"] = aspects_df["words_imp"].map(lambda pairs: [pair[0] for pair in pairs])
aspects_df["imp"] = aspects_df["words_imp"].map(lambda pairs: [pair[1] for pair in pairs])
aspects_df["nwords"] = aspects_df["words"].map(len)

# Order the rows by topic id and renumber the index from 0
aspects_df = aspects_df.sort_values(by=["aspects"], axis=0).reset_index(drop=True)
aspects_df

## Naming each topic

In [None]:
def topic_name(words):
    """Return a short label for a topic described by its list of words.

    The rules below are checked from top to bottom and the first one whose
    keywords are all contained in ``words`` gives the label. When no rule
    matches, the first word of the topic is returned (so an empty ``words``
    that matches no rule raises IndexError).
    """
    # (label, required keywords) -- order matters: several rules share a
    # keyword (e.g. "politique"), and the earliest matching rule wins.
    rules = (
        ("United States", ("américain", "etats_unis")),
        ("government", ("gouvernement", "négociations")),
        ("culture", ("film",)),
        ("sport", ("france", "équipe")),
        ("politics", ("politique", "ps")),
        ("society", ("vie", "monde")),
        ("employment", ("travail", "jeunes")),
        ("justice", ("justice", "loi")),
        ("health", ("santé", "eau")),
        ("firms", ("groupe", "marché")),
        ("Europe", ("politique", "europe")),
        ("media", ("télévision", "internet")),
        ("growth", ("économie", "croissance")),
        ("cities", ("ville", "personnes")),
    )

    for label, required in rules:
        if all(keyword in words for keyword in required):
            return label

    return words[0]
    
# Label every topic from its word list and display the resulting table
aspects_df["aspect_name"] = aspects_df.words.apply(topic_name)
aspects_df

In [None]:
# Saving the aspect dataframe without the row index.
# `index` is a boolean flag; pass False explicitly instead of relying on None being falsy.
aspects_df.to_csv("data/aspects.csv", index=False)

# Word clouds

In [None]:

%%time
sns.set(style="white")

aspect_list = aspects_df.aspect_name.values
nrows=4
ncols=4

fig, ax = plt.subplots(nrows=nrows,ncols=ncols, figsize=(14,18))

line=0
col =0
pos = 0

for i,aspect in enumerate(aspect_list):
    
    
    if pos==12:
        col+=1
        pos+=1
        
    text = " ".join(aspects_df.words[i])

    wc = WordCloud(random_state=2,
                   relative_scaling=0.2, min_font_size =10 ,background_color='white', width=1000, height=1000)

    frequences = dict(aspects_df.words_imp[i] )
    wc.generate_from_frequencies(frequences)
    
    
    ax[line,col].imshow(wc, interpolation='bilinear')

    ax.flat[pos].set_title(aspects_df.aspect_name[i])
    
    ax.flat[pos].label_outer()
    
    
    if ((col+1)%ncols==0):
        line+=1
        col=0

    else:
        col+=1

    pos+=1

    
for ax in fig.axes:
    plt.sca(ax)
    plt.axis("off")

plt.savefig("pics/aspects.png")
plt.show()