In [None]:
import pandas as pd
import re

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# Read the scraped articles and discard every row that has a missing value
# in any column.
articles = pd.read_csv('scraper/canebiere.csv').dropna()


In [None]:

# Tokenise every article: lower-case the text, blank out everything that is
# not a letter, split on whitespace and keep only words longer than three
# characters. `[\W\d_]` matches non-letters in a Unicode-aware way, so
# accented French letters (é, è, ç, œ, ...) stay inside their word instead of
# cutting it into fragments as an ASCII-only class would.
all_tokens = [
    [w for w in re.sub(r"[\W\d_]", " ", t.lower()).split() if len(w) > 3]
    for t in articles['full_text'].values
]
dictionary = Dictionary(all_tokens)

# remove stopwords
from six import iteritems
from stop_words import get_stop_words

stoplist = set(get_stop_words('fr'))

# Ids of the French stop words that are actually present in the dictionary.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
# Ids of tokens whose document frequency is exactly one.
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
# Both id lists have to be passed; filtering on stop_ids alone would leave the
# single-document tokens in the vocabulary.
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed

# Bag-of-words corpus: one doc2bow result per article, built from the
# filtered dictionary.
counter = [dictionary.doc2bow(text) for text in all_tokens]




In [None]:
num_topics = 10
# LDA training is stochastic; a fixed random_state makes the topics
# reproducible across kernel restarts.
lda = LdaModel(corpus=counter, id2word=dictionary, num_topics=num_topics, random_state=42)
topics = lda.show_topics(num_topics=num_topics, log=False, formatted=False)

print('Dictionary: {} docs, {} terms'.format(dictionary.num_docs, len(dictionary)))

# With formatted=False each entry of `topics` is indexed here as
# (topic_id, [(word, weight), ...]); print one line per topic listing its words.
for topic_id, topic_terms in topics:
    words = "".join("{} ".format(word) for word, _ in topic_terms)
    print("Topic {}: ".format(topic_id) + words)


In [None]:
import pyLDAvis
import warnings

# pyLDAvis 3.x renamed the gensim adapter module to `gensim_models`; fall
# back to the old module name so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

warnings.filterwarnings("ignore", category=FutureWarning)

# Visualization of the LDA model on the data
LDA_visualization = gensimvis.prepare(lda, counter, dictionary)
pyLDAvis.display(LDA_visualization)

In [None]:
import matplotlib.pyplot as plt

# Word cloud of the articles written by a single author.
stoplist = set(get_stop_words('fr'))

# Select that author's texts and merge them into one space-separated string.
is_author = articles['author'] == 'Blaah'
author_text = ' '.join(articles.loc[is_author, 'full_text'].values)

cloud = WordCloud(
    stopwords=stoplist,
    max_words=20,
    background_color='white',
).generate(author_text)
# To write the image to disk instead: cloud.to_file('cloud.png')

# Render the cloud on a square figure with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
