In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Fetch the GSDMM implementation into ./gsdmm; the import cell loads
# MovieGroupProcess from it as gsdmm.gsdmm.
# NOTE(review): re-running this cell once ./gsdmm exists makes git report an error.
!git clone https://github.com/rwalk/gsdmm

In [None]:
#imports
import os
import sys

sys.path.append("../")

import pandas as pd
import numpy as np
import operator
import pyLDAvis
import pyLDAvis.gensim
import gensim
from gensim.models.coherencemodel import CoherenceModel

from gsdmm.gsdmm import MovieGroupProcess

from src.text_preprocessor import TextPreprocessor
from src.embeddor import Embeddor

In [None]:
# Folder holding the scraped CSV files, given relative to the working directory.
DATA_FOLDER = "../data"
# File names of the two review datasets read from DATA_FOLDER.
DATA1 = "seatguru_python_scraping.csv"
DATA2 = "skytrax_scraping_2.csv"

In [None]:
# Read both scraped datasets from DATA_FOLDER, in the order (DATA1, DATA2).
df1, df2 = (
    pd.read_csv(os.path.join(DATA_FOLDER, file_name))
    for file_name in (DATA1, DATA2)
)

In [None]:
# Preview the first rows of the frame loaded from DATA1.
df1.head()

In [None]:
# Preview the first rows of the frame loaded from DATA2.
df2.head()

In [None]:
# Work on the first 5000 rows of df2 only. Slice first and copy second, so that
# only the retained rows are duplicated and df2_small owns its own data instead
# of being a slice of a hidden full-size copy of df2.
df2_small = df2.iloc[:5000].copy()

# Clean the 'body' column with the project preprocessor (stop words removed,
# no n-grams) and keep the resulting corpus for the modelling cells.
preprocessor = TextPreprocessor(df2_small, column_to_clean='body')
preprocessor.transform(n_grams=False, remove_stopwords=True)
corpus = preprocessor.corpus

# Embedding

In [None]:
# Run the project Embeddor on the cleaned corpus with vec_method="word2vec",
# how="PCA", n=3 (presumably word2vec vectors reduced to 3 PCA components —
# confirm in src/embeddor.py).
embeddor = Embeddor(corpus=corpus)
embeddor.transform(vec_method="word2vec", how="PCA", n=3)
word2vec_embed = embeddor.description_embedding
word2vec_model = embeddor.model
# The two columns below are written onto the very object exposed as
# embeddor.description_embedding; no copy is taken here.
word2vec_embed['corpus'] = corpus
# NOTE(review): if word2vec_embed is a DataFrame, assigning a Series aligns on
# index — confirm it shares df2_small's index, otherwise ratings come out as NaN.
word2vec_embed['rating'] = df2_small['rating']
word2vec_embed.head()

# LDA

In [None]:
# Store the cleaned corpus on the frame; the LDA and GSDMM cells read it back
# as df2_small.corpus.
df2_small["corpus"] = corpus

In [None]:
# Dictionary
tokens = df2_small.corpus
dictionary = gensim.corpora.Dictionary(tokens)

# filter_extremes expects `no_below` as an ABSOLUTE number of documents and
# `no_above` as a FRACTION of the corpus. Passing 0.05 directly as `no_below`
# keeps every token (each one appears in at least 1 >= 0.05 documents), so the
# 5% threshold is converted to a document count first.
min_doc_count = max(1, int(0.05 * dictionary.num_docs))
dictionary.filter_extremes(no_below=min_doc_count, no_above=0.9)

# Bag-of-words representation of every document, restricted to kept tokens.
corpus_lda = [dictionary.doc2bow(tok) for tok in tokens]

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
ldaModel = gensim.models.ldamodel.LdaModel(
    # data
    corpus=corpus_lda,
    id2word=dictionary,
    # model size and reproducibility
    num_topics=10,
    random_state=42,
    # symmetric priors on document-topic (alpha) and topic-word (eta) distributions
    alpha=0.1,
    eta=0.1,
    per_word_topics=True,
)

In [None]:
# Print each of the 10 topics as "<id>: <formatted top-20 words>", one blank
# line between topics.
for i, topic in ldaModel.show_topics(formatted=True, num_topics=10, num_words=20):
    print(f"{i}: {topic}\n")

In [None]:
# c_v coherence is estimated from the tokenised texts, so the model is given
# `texts` and the dictionary. The previous `corpus=corpus` argument passed
# token lists where a bag-of-words corpus is expected; c_v does not use that
# argument, so it is dropped rather than corrected.
cm = CoherenceModel(
    model=ldaModel,
    texts=tokens,
    dictionary=dictionary,
    coherence="c_v",
)
cm.get_coherence()

In [None]:
# Switch pyLDAvis to notebook rendering, then build and display the
# visualisation for the fitted LDA model.
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
vis = pyLDAvis.gensim.prepare(ldaModel, corpus_lda, dictionary)
vis

# GSDMM

In [None]:
# Number of tokens in each cleaned document, kept next to the corpus column.
df2_small['nb_token'] = [len(doc_tokens) for doc_tokens in df2_small['corpus']]

# Documents as a plain list of token lists, and the size of their vocabulary.
docs = df2_small.corpus.to_list()
vocab = {token for doc_tokens in docs for token in doc_tokens}
n_terms = len(vocab)

In [None]:
# GSDMM hyper-parameters: upper bound on the number of clusters and the two
# priors passed to MovieGroupProcess.
nb_topic = 10
alpha = 0.1
beta = 0.1

# MovieGroupProcess takes no random_state argument here, so NumPy's global RNG
# is seeded to make the cluster assignments repeatable across runs (same seed
# as the LDA model).
# NOTE(review): assumes gsdmm samples through numpy.random — confirm in gsdmm/mgp.py.
np.random.seed(42)

mgpModel = MovieGroupProcess(K=nb_topic, alpha=alpha, beta=beta, n_iters=20)
# Fit on `docs`, the plain list of token lists that `n_terms` was computed
# from, rather than on the pandas Series `tokens`.
mgpModelFit = mgpModel.fit(docs, n_terms)

In [None]:
def topWordsPerTopic(clusterDistrib, topIndex, nbWord):
    """Print the most frequent words of each requested cluster.

    Args:
        clusterDistrib: indexable collection mapping a cluster id to a
            ``{word: count}`` dictionary.
        topIndex: iterable of cluster ids, processed in the given order.
        nbWord: number of (word, count) pairs to show per cluster.

    Returns:
        None. For every cluster one line with its top pairs is printed,
        followed by a line of 20 asterisks.
    """
    separator = '*' * 20
    for cluster_id in topIndex:
        word_counts = clusterDistrib[cluster_id]
        # Highest counts first; ties keep the dictionary's own ordering.
        ranked = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)
        print(f"Cluster {cluster_id} : {ranked[:nbWord]}")
        print(separator)

In [None]:
# Cluster sizes as an array so they can be ranked.
docCount = np.array(mgpModel.cluster_doc_count)
print(f"Number of documents per topic : {docCount}")
print('*' * 20)

# Cluster ids ordered from the most to the least populated.
topIndex = np.argsort(docCount)[::-1]
print(f"Most important clusters (by number of docs inside): {topIndex}")
print('*' * 20)

# For every cluster, in that order, list its 30 most frequent words.
topWordsPerTopic(mgpModel.cluster_word_distribution, topIndex, 30)