In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
!git clone https://github.com/rwalk/gsdmm

In [None]:
#imports
import os
import sys

sys.path.append("../")

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import pandas as pd
import numpy as np
import pickle

from gensim.models.coherencemodel import CoherenceModel
from gsdmm.gsdmm import MovieGroupProcess
from nltk.corpus import stopwords
from wordcloud import WordCloud
from PIL import Image
import pyLDAvis.gensim
import operator
import pyLDAvis
import gensim

from src.meta_data_preprocessor import MetaDataPreprocessor
from src.text_preprocessor import TextPreprocessor
from src.embeddor import Embeddor


%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 6)
sns.set_palette("husl")

# Load data

In [None]:
# Location and file names of the two scraped review datasets.
DATA_FOLDER = "../data/inputs"
DATA1 = "seatguru_python_scraping.csv"
DATA2 = "skytrax_scraping_2.csv"

# Both CSV files are read with their first column as the index.
df1, df2 = (
    pd.read_csv(os.path.join(DATA_FOLDER, file_name), index_col=0)
    for file_name in (DATA1, DATA2)
)

# Stack both sources; reset_index() moves the old index into a column
# and renumbers the rows.
concat_df = pd.concat([df1, df2]).reset_index()

In [None]:
# Preview the first rows of the dataset read from DATA1.
df1.head()

In [None]:
# Preview the first rows of the dataset read from DATA2.
df2.head()

In [None]:
# Load the preprocessed corpus / n-gram data that later cells use as `data`.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project.
with open('../data/preprocessed/corpus_ngram_data.pickle', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Preview the first rows of the unpickled preprocessed data.
data.head()

# Adding meta-data

In [None]:
# Run the project's MetaDataPreprocessor over the concatenated reviews and
# pickle whatever it returns next to the other preprocessed artefacts.
metadata_path = os.path.join("..", "data", "preprocessed", "reviews_metadata.pickle")

meta_data_preprocessor = MetaDataPreprocessor()
df_with_metadata = meta_data_preprocessor.preprocess(concat_df)
df_with_metadata.to_pickle(metadata_path)

In [None]:
# Preview the first rows of the metadata-preprocessed reviews.
df_with_metadata.head()

# Embedding

In [None]:
# Work on an independent copy of the first 5000 rows of df2 only.
df2_small = df2.iloc[:5000, :].copy()

# Clean the 'body' column with stop-word removal and without n-grams,
# then keep the resulting corpus for the embedding step.
preprocessor = TextPreprocessor(df2_small, column_to_clean='body')
preprocessor.transform(remove_stopwords=True, n_grams=False)
corpus = preprocessor.corpus

In [None]:
# Embed the corpus with the project's Embeddor, configured for word2vec + PCA with n=3.
embeddor = Embeddor(corpus=corpus)
embeddor.transform(vec_method="word2vec", how="PCA", n=3)
# transform() returns nothing that is used here; its results are read from attributes.
word2vec_embed = embeddor.description_embedding
word2vec_model = embeddor.model
# Attach the source corpus and the review ratings next to the embedding.
# NOTE(review): if word2vec_embed is a DataFrame, the 'rating' assignment aligns on
# index labels -- confirm its index matches df2_small, otherwise ratings end up
# NaN or misaligned.
word2vec_embed['corpus'] = corpus
word2vec_embed['rating'] = df2_small['rating']
word2vec_embed.head()

# LDA

In [None]:
# Build the gensim dictionary and the bag-of-words corpus used by LDA.
tokens = data.corpus
dictionary = gensim.corpora.Dictionary(tokens)

# filter_extremes: `no_below` is an absolute document COUNT (int), whereas
# `no_above` is a fraction of the corpus. A fractional no_below such as 0.05
# never removes anything (every token occurs in at least one document), so the
# effective threshold is spelled out as 1.
# NOTE(review): raise it to an integer > 1 if rare tokens were meant to be pruned.
dictionary.filter_extremes(no_below=1, no_above=0.9)

# One bag-of-words vector (list of (token_id, count) pairs) per document.
corpus_lda = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus.
# random_state pins the seed; alpha and eta are passed as scalars.
ldaModel = gensim.models.ldamodel.LdaModel(
    corpus=corpus_lda,
    id2word=dictionary,
    num_topics=5,
    alpha=0.1,
    eta=0.1,
    per_word_topics=True,
    random_state=42,
)

In [None]:
# Print the 10 top words of each of the 5 topics, one blank line between topics.
for i, topic in ldaModel.show_topics(num_topics=5, num_words=10, formatted=True):
    print(f"{i}: {topic}\n")

In [None]:
# c_v coherence is estimated from the tokenised texts and the dictionary.
# `corpus` must be a bag-of-words corpus, not raw token lists, and is not
# needed for c_v, so it is no longer passed.
cm = CoherenceModel(model=ldaModel, texts=tokens, dictionary=dictionary, coherence="c_v")
print(f'Model coherence: {cm.get_coherence()}')

In [None]:
def get_lda_topic(review, ldaModel=ldaModel, dictionary=dictionary):
    """Return the id of the most probable LDA topic for one tokenised review.

    Args:
        review: list of tokens of a single document.
        ldaModel: model queried via ``get_document_topics``; defaults to the
            notebook-level model as it exists when this cell is run.
        dictionary: dictionary used to turn the tokens into a bag of words.

    Returns:
        The topic id with the highest strictly positive probability (the first
        one on ties), or ``None`` when no such topic is reported.
    """
    bag_of_words = dictionary.doc2bow(review)
    scored_topics = [
        (topic_id, probability)
        for topic_id, probability in ldaModel.get_document_topics(bag_of_words)
        if probability > 0
    ]
    if not scored_topics:
        return None
    # max() keeps the first of several equal maxima.
    best_topic, _ = max(scored_topics, key=lambda pair: pair[1])
    return best_topic

In [None]:
# Label every document with its dominant LDA topic id.
data['lda_topic'] = data['corpus'].apply(get_lda_topic)

In [None]:
# Render the interactive pyLDAvis view of the fitted LDA model in the notebook.
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldaModel, corpus_lda, dictionary)
vis

# Wordclouds

In [None]:
# Keep only the documents whose dominant LDA topic is 0.
# NOTE(review): the name implies topic 0 is the "interior design" topic, presumably
# decided by inspecting the topics above -- re-check the id whenever the model is refit.
interior_design_reviews = data[data['lda_topic']==0]

In [None]:
# Token lists of the topic-0 reviews, split on the 'bin' column
# (2 feeds the "good" cloud, 0 the "bad" one).
corpus_good = interior_design_reviews.loc[interior_design_reviews['bin']==2, 'corpus'].tolist()
corpus_bad = interior_design_reviews.loc[interior_design_reviews['bin']==0, 'corpus'].tolist()

# Load the word-cloud masks. The context manager closes each image file as soon
# as its pixels have been copied into the array, instead of leaving the handle open.
with Image.open("../images/mask2.PNG") as mask_image:
    mask_good = np.array(mask_image)
with Image.open("../images/mask3.PNG") as mask_image:
    mask_bad = np.array(mask_image)

def build_wordcloud(corpus, mask, colormap="viridis"):
    """Build a word cloud from a corpus of tokenised reviews.

    Args:
        corpus: iterable of reviews, each an iterable of string tokens.
        mask: array forwarded to ``WordCloud`` as ``mask``.
        colormap: colormap name forwarded to ``WordCloud``.

    Returns:
        The generated ``WordCloud`` (white background, at most 50 words,
        collocations disabled).
    """
    # Put a space BETWEEN reviews as well as between tokens; without it the last
    # token of one review is glued to the first token of the next and counted
    # as a bogus word.
    text = " ".join(" ".join(review) for review in corpus)
    wordcloud = WordCloud(
        collocations=False,
        background_color="white",
        max_words=50,
        mask=mask,
        colormap=colormap,
    ).generate(text)

    return wordcloud


In [None]:
# Word cloud of the "good" reviews, using the first mask and the default colormap.
good_wc = build_wordcloud(corpus_good, mask_good)

In [None]:
# Word cloud of the "bad" reviews, using the second mask and the 'inferno' colormap.
bad_wc = build_wordcloud(corpus_bad, mask_bad, colormap='inferno')

In [None]:
# Display the "good" word cloud inline.
plt.imshow(good_wc)

In [None]:
# Display the "bad" word cloud inline.
plt.imshow(bad_wc)

In [None]:
# Write both word clouds to PNG files in the images folder.
good_wc.to_file('../images/good_wc.png')
bad_wc.to_file('../images/bad_wc.png')

# Explore n-grams

In [None]:
# Row selectors for the two 'bin' values compared below (2 -> "good", 0 -> "bad").
good_rows = interior_design_reviews['bin'] == 2
bad_rows = interior_design_reviews['bin'] == 0

# Token lists per review.
corpus_good = interior_design_reviews.loc[good_rows, 'corpus'].tolist()
corpus_bad = interior_design_reviews.loc[bad_rows, 'corpus'].tolist()

# n-gram lists per review.
n_grams_good = interior_design_reviews.loc[good_rows, 'n_grams'].tolist()
n_grams_bad = interior_design_reviews.loc[bad_rows, 'n_grams'].tolist()

In [None]:
# Flatten every list-of-lists into one flat list of items ...
flatenned_corpus_good = [token for review in corpus_good for token in review]
flatenned_corpus_bad = [token for review in corpus_bad for token in review]
flatenned_n_grams_good = [n_gram for review in n_grams_good for n_gram in review]
flatenned_n_grams_bad = [n_gram for review in n_grams_bad for n_gram in review]

# ... then count how often each token / n-gram occurs.
good_c = Counter(flatenned_corpus_good)
bad_c = Counter(flatenned_corpus_bad)
good_n = Counter(flatenned_n_grams_good)
bad_n = Counter(flatenned_n_grams_bad)

In [None]:
# 20 most frequent tokens in the "good" reviews.
good_c.most_common(20)

In [None]:
# 20 most frequent tokens in the "bad" reviews.
bad_c.most_common(20)

In [None]:
# 50 most frequent n-grams in the "good" reviews.
good_n.most_common(50)

In [None]:
# 50 most frequent n-grams in the "bad" reviews.
bad_n.most_common(50)

# Time-series analysis

In [None]:
# Parse the review dates once, then derive calendar year and month columns from them.
review_dates = pd.to_datetime(data['date'])
data['date'] = review_dates
data['year'] = review_dates.dt.year
data['month'] = review_dates.dt.month

In [None]:
# Monthly document counts for LDA topics 0-2, restricted to 2014 and later.
recent_topic_reviews = data[(data['lda_topic'].isin([0, 1, 2])) & (data['year'] >= 2014)]
monthly_topic_counts = recent_topic_reviews.pivot_table(
    values='corpus', columns='lda_topic', index=['year', 'month'], aggfunc='count'
)

fig, ax = plt.subplots()
monthly_topic_counts.plot(ax=ax)
# The labels are positional: they assume the pivot has one column per topic in
# the order 0, 1, 2. NOTE(review): they shift if a topic has no rows after filtering.
ax.legend(["Interior Design", "Positive", "Negative"])
ax.set_title('Topic Evolution over time', fontsize=20)
ax.set_xlabel('Year, Month', fontsize=16)
ax.set_ylabel('Count', fontsize=16)


# SHAP analysis

In [None]:
# Fit a bag-of-words logistic regression that the SHAP explainer below inspects.
import shap
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
vectorizer = CountVectorizer()

# NOTE(review): `df_log_reg` is not defined in any cell visible here; this cell
# fails on a fresh kernel unless it is created earlier -- confirm where it comes from.
# Each token list is joined back into one string because CountVectorizer is fed raw text.
untokenized_corpus = [" ".join(words) for words in df_log_reg.corpus]
X = vectorizer.fit_transform(untokenized_corpus)
model.fit(X, df_log_reg.target.tolist())

In [None]:
# Explain the fitted logistic regression with SHAP's linear explainer.
explainer = shap.LinearExplainer(model, X, feature_perturbation="interventional")
shap_values = explainer.shap_values(X)
# Dense copy of the document-term matrix for the summary plot.
X_array = X.toarray()

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

shap.summary_plot(shap_values, X_array, feature_names=feature_names)

# GSDMM

In [None]:
# Documents as a plain list, their token counts, and the vocabulary size.
# NOTE(review): this reads df2_small['corpus'], which no visible cell assigns --
# presumably TextPreprocessor adds it to the frame; confirm.
docs = df2_small['corpus'].tolist()
df2_small['nb_token'] = [len(doc) for doc in docs]
vocab = {term for doc in docs for term in doc}
n_terms = len(vocab)

In [None]:
# GSDMM settings: number of clusters handed to the model and its two priors.
nb_topic = 10
alpha = 0.1
beta = 0.1

mgpModel = MovieGroupProcess(K=nb_topic, alpha=alpha, beta=beta, n_iters=20)
# Fit on `docs`, the corpus that `n_terms` (the vocabulary size) was computed
# from. Fitting on the LDA `tokens` would pair the model with a vocabulary size
# taken from a different corpus.
mgpModelFit = mgpModel.fit(docs, n_terms)

In [None]:
def topWordsPerTopic(clusterDistrib, topIndex, nbWord):
    """Print the highest-count words of the selected clusters.

    Args:
        clusterDistrib: sequence indexed by cluster id; each entry maps a word
            to its count in that cluster.
        topIndex: iterable of cluster ids, printed in the given order.
        nbWord: number of (word, count) pairs shown per cluster.

    Returns:
        None. One line per cluster is printed, followed by a line of 20 asterisks.
    """
    separator = '*' * 20
    for index in topIndex:
        # Highest counts first; the stable sort keeps equal counts in their original order.
        ranked = sorted(clusterDistrib[index].items(), key=lambda pair: pair[1], reverse=True)
        clusterTopWords = ranked[:nbWord]
        print(f"Cluster {index} : {clusterTopWords}")
        print(separator)

In [None]:
separator = '*' * 20

# Number of documents assigned to each cluster.
docCount = np.array(mgpModel.cluster_doc_count)
print('Number of documents per topic :', docCount)
print(separator)

# Cluster ids ordered from the most to the fewest documents.
topIndex = np.argsort(docCount)[::-1]
print('Most important clusters (by number of docs inside):', topIndex)
print(separator)

# Show the 30 highest-count words of every cluster, largest cluster first.
topWordsPerTopic(mgpModel.cluster_word_distribution, topIndex, 30)