In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Fetch the GSDMM implementation into ./gsdmm; the import cell loads MovieGroupProcess from gsdmm.gsdmm.
# NOTE(review): the clone is unpinned and presumably fails when ./gsdmm already exists — confirm behaviour on re-run.
!git clone https://github.com/rwalk/gsdmm

In [None]:
#imports
import os
import sys

sys.path.append("../")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import operator
import pyLDAvis
import pyLDAvis.gensim
import gensim
from gensim.models.coherencemodel import CoherenceModel

from gsdmm.gsdmm import MovieGroupProcess

from src.text_preprocessor import TextPreprocessor
from src.embeddor import Embeddor
from src.utils.plotting_utils import tf_idf

%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 6)
sns.set_palette("husl")

In [None]:
# Where the scraped review CSVs live (path is relative to the notebook's working directory).
DATA_FOLDER = "../data"
# File names of the two scrapes; judging by the names, one from SeatGuru and one from Skytrax.
DATA1 = "seatguru_python_scraping.csv"
DATA2 = "skytrax_scraping_2.csv"

In [None]:
# Load both scrapes with pandas defaults (no explicit dtypes or date parsing at this point).
df1 = pd.read_csv(os.path.join(DATA_FOLDER, DATA1))
df2 = pd.read_csv(os.path.join(DATA_FOLDER, DATA2))

In [None]:
# Preview the first rows of the DATA1 frame.
df1.head()

In [None]:
# Preview the first rows of the DATA2 frame (the one used for the rest of the analysis).
df2.head()

# EDA

## Balance

In [None]:
# Class balance: how many reviews carry each raw rating value.
ax = sns.countplot(data=df2, x='rating')
ax.set_title('Number of reviews per rating')
ax.set_xlabel('Rating')
ax.set_ylabel(' ')
plt.show()

In [None]:
def bin(rating):
    """Map a numeric review rating to a coarse sentiment label.

    Parameters
    ----------
    rating : number
        The review rating. May be missing (NaN / None).

    Returns
    -------
    str or float
        "Bad" for ratings <= 3, "Medium" for ratings above 3 and below 8,
        "Good" for ratings >= 8, and NaN when the rating is missing.
    """
    # NaN fails every comparison, so without this guard a missing rating
    # would fall through to the last branch and be counted as "Good".
    if pd.isna(rating):
        return np.nan
    if rating <= 3:
        return "Bad"
    # rating > 3 is already guaranteed here, only the upper bound is needed.
    if rating < 8:
        return "Medium"
    return "Good"

In [None]:
# Label every review with its coarse rating bin; this adds a 'bin' column to df2 in place.
df2['bin'] = df2['rating'].apply(bin)

In [None]:
# Class balance after binning.
# The bins are ordinal, so fix the bar order explicitly; otherwise the bars
# follow the order in which the labels happen to appear in the data.
sns.countplot(x='bin', data=df2, order=["Bad", "Medium", "Good"])
plt.title('Number of reviews per bin')
plt.xlabel('Rating')
plt.ylabel(' ')
plt.show()

## Time series analysis

In [None]:
# Mean rating per calendar month for American Airlines reviews.
df3 = df2.copy()
df3["date"] = pd.to_datetime(df3["date"])
df3 = df3[df3["airline"] == "american-airlines"]
df3["month"] = df3["date"].dt.month
df3["year"] = df3["date"].dt.year

rat_by_month = (
    df3.dropna(subset=["rating"])
    .groupby(["month", "year"])
    .agg({"rating": "mean"})
    .sort_index(level=[1, 0])  # chronological: year first, then month
)

# Flatten the (month, year) MultiIndex into "month-year" strings for the x axis.
month_labels = rat_by_month.index.get_level_values(0).astype(str)
year_labels = rat_by_month.index.get_level_values(1).astype(str)
rat_by_month.index = month_labels + '-' + year_labels
rat_by_month.plot()

In [None]:
# Number of rated reviews per calendar month for American Airlines.
df4 = df2.copy()
df4["date"] = pd.to_datetime(df4["date"])
df4 = df4[df4["airline"] == "american-airlines"]
df4["month"] = df4["date"].dt.month
df4["year"] = df4["date"].dt.year

rat_by_month = (
    df4.dropna(subset=["rating"])
    .groupby(["month", "year"])
    .agg({"rating": "count"})
    .sort_index(level=[1, 0])  # chronological: year first, then month
)

# Flatten the (month, year) MultiIndex into "month-year" strings for the x axis.
month_labels = rat_by_month.index.get_level_values(0).astype(str)
year_labels = rat_by_month.index.get_level_values(1).astype(str)
rat_by_month.index = month_labels + '-' + year_labels
rat_by_month.plot()

# TF-IDF

In [None]:
# Build one tokenised corpus per extreme bin so their vocabularies can be compared.
# NOTE(review): TextPreprocessor is a project helper; presumably transform() cleans the
# 'body' column and populates .corpus — confirm, and check whether it mutates the frame it is given.
df_good = df2.loc[df2.bin == "Good"]
preprocessor = TextPreprocessor(df_good, column_to_clean='body')
preprocessor.transform(n_grams=False, remove_stopwords=True)
corpus_good = preprocessor.corpus

# Same steps for the "Bad" reviews; `preprocessor` is rebound, so read .corpus before reusing the name.
df_bad = df2.loc[df2.bin == "Bad"]
preprocessor = TextPreprocessor(df_bad, column_to_clean='body')
preprocessor.transform(n_grams=False, remove_stopwords=True)
corpus_bad = preprocessor.corpus

In [None]:
# TF-IDF over the "Good" corpus via the project helper; presumably also draws a word cloud (wordcloud=True) — confirm.
tf_idf_matrix = tf_idf(corpus_good, wordcloud=True, rating="Good")

In [None]:
# Same for the "Bad" corpus. Note this rebinds tf_idf_matrix, so the "Good" result is no longer reachable by name.
tf_idf_matrix = tf_idf(corpus_bad, wordcloud=True, rating="Bad")

# Embedding

In [None]:
# Work on the first 5000 reviews only — presumably to keep the embedding and
# topic-model cells fast (TODO confirm the intended sample size).
# Slice first, then copy: this yields an independent frame without deep-copying
# all of df2 just to throw most of it away.
df2_small = df2.iloc[:5000].copy()
preprocessor = TextPreprocessor(df2_small, column_to_clean='body')
preprocessor.transform(n_grams=False, remove_stopwords=True)
corpus = preprocessor.corpus

In [None]:
# Embed the reviews with the project Embeddor; the arguments request word2vec vectors
# reduced with PCA to n=3 (presumably three components per review — confirm in src.embeddor).
# NOTE(review): despite its name, `lsi` holds the word2vec + PCA output, not an LSI model.
embeddor = Embeddor(corpus=corpus)
embeddor.transform(vec_method="word2vec", how="PCA", n=3)
lsi = embeddor.description_embedding
word2vec_model = embeddor.model
# Attach the token lists; assumes one row of `lsi` per entry of `corpus` — TODO confirm.
lsi['corpus'] = corpus
# Assigning a Series aligns on index, so this assumes `lsi` shares df2_small's index — TODO confirm.
lsi['rating'] = df2_small['rating']
lsi.head()

In [None]:
# For each embedding dimension, show the five reviews that load most strongly
# on it and the mean rating of those reviews.
for i in range(3):
    dim = f'Dimension_{i+1}'
    # Discard the sign so "top" means largest magnitude. The column is overwritten
    # in place, so later cells keep seeing the same notebook state as before.
    lsi[dim] = np.abs(lsi[dim])
    # These are index *labels*, so they are used with label-based .loc below;
    # positional .iloc / list indexing is only correct for a default 0..n-1 index.
    top_words = lsi.sort_values(dim, ascending=False).index[:5]
    print((f"Top reviews for topic {i} are : "))
    # x[2:] drops the first two tokens of each review — NOTE(review): reason not visible here.
    print(lsi.loc[top_words, 'corpus'].apply(lambda x: x[2:]))
    print(f"Average rating for topic {i} are : ")
    # Series.mean skips missing ratings instead of turning the whole average into NaN.
    print(lsi.loc[top_words, 'rating'].mean())
    print()

In [None]:
# Scatter the reviews in the (Dimension_1, Dimension_3) plane, coloured by rating.
fig, ax = plt.subplots(figsize=(10,10))

# One colour per rating 1..10, mirroring the Bad (1-3) / Medium (4-7) / Good (8-10) bins.
colors = ['red', 'red', 'red', 'yellow', 'yellow', 'yellow', 'yellow', 'green', 'green', 'green']

# sorted() so the legend lists ratings in ascending order rather than in
# order of first appearance in the data.
for val in sorted(lsi['rating'].dropna().astype(int).unique()):
    subset = lsi[lsi['rating'] == val]
    topic_1 = np.abs(subset['Dimension_1'].values)
    topic_2 = np.abs(subset['Dimension_3'].values)
    # NOTE(review): assumes ratings lie in 1..10; a rating of 0 would wrap to the last colour.
    color = colors[val-1]
    ax.scatter(topic_1, topic_2, alpha=0.7, label=val, color=color)

ax.set_xlabel('First Topic')
# The y axis shows Dimension_3, so label it as the third dimension.
# NOTE(review): if Dimension_2 was intended, change the column above instead.
ax.set_ylabel('Third Topic')
ax.axvline(linewidth=0.5)
ax.axhline(linewidth=0.5)
ax.legend()

# LDA

In [None]:
# Keep the token lists next to the raw rows; assumes `corpus` has exactly one entry per row of df2_small — TODO confirm.
df2_small["corpus"] = corpus

In [None]:
#Dictionary
tokens = df2_small.corpus
dictionary = gensim.corpora.Dictionary(tokens)
# gensim reads `no_below` as an absolute number of documents (only `no_above`
# is a fraction), so a fractional value such as 0.05 never removes anything.
# The effective value is spelled out here; results are unchanged.
# To really drop tokens seen in fewer than 5% of the documents, use
# no_below=int(0.05 * len(tokens)) instead.
dictionary.filter_extremes(no_below=1, no_above=0.9)
# Bag-of-words representation of every review, in the same order as `tokens`.
corpus_lda = [dictionary.doc2bow(tok) for tok in tokens]

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus, with a fixed random_state.
# alpha and eta are passed as scalars; presumably gensim treats these as symmetric priors — confirm in the LdaModel docs.
ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpus_lda,
                                           id2word=dictionary,
                                           num_topics=10, 
                                           random_state=42,
                                           alpha=0.1,
                                           eta=0.1,
                                           per_word_topics=True)

In [None]:
# Print the 20 highest-weighted words of each of the 10 topics, one topic per paragraph.
for i, topic in ldaModel.show_topics(num_topics=10, num_words=20, formatted=True):
    print(f"{i}: {topic}\n")

In [None]:
# Score the fitted topics with the c_v coherence measure.
# `corpus` must be the bag-of-words corpus the model was trained on (`corpus_lda`);
# the bare `corpus` name holds the raw token lists from the preprocessing cell.
# The tokenised documents themselves are supplied through `texts`.
cm = CoherenceModel(model=ldaModel, corpus=corpus_lda, texts=tokens, coherence="c_v")
cm.get_coherence()

In [None]:
# Render the interactive pyLDAvis view of the fitted LDA model inline.
# NOTE(review): newer pyLDAvis releases reportedly expose this module as `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldaModel, corpus_lda, dictionary)
vis

# GSDMM

In [None]:
# Inputs for GSDMM: token count per review, the documents as a plain list, and the vocabulary size.
df2_small['nb_token'] = [len(doc) for doc in df2_small['corpus']]
docs = df2_small.corpus.to_list()
vocab = {token for doc in docs for token in doc}
n_terms = len(vocab)

In [None]:
# GSDMM hyper-parameters: upper bound on the number of clusters and the two priors.
nb_topic = 10
alpha = 0.1
beta = 0.1

# Seed NumPy's global RNG so the clustering is repeatable, matching the fixed
# random_state of the LDA model. NOTE(review): presumably MovieGroupProcess samples
# from numpy.random's global state — confirm in the gsdmm source.
np.random.seed(42)

mgpModel = MovieGroupProcess(K=nb_topic, alpha=alpha, beta=beta, n_iters=20)
# Fit on `docs`, the plain list built together with `n_terms`, rather than on the
# pandas Series left over from the LDA section: integer indexing into a list is
# always positional, whereas on a Series it depends on the index labels.
mgpModelFit = mgpModel.fit(docs, n_terms)

In [None]:
def topWordsPerTopic(clusterDistrib, topIndex, nbWord):
    """Print the highest-count words of the selected clusters.

    Parameters
    ----------
    clusterDistrib : sequence of dict
        One word -> count mapping per cluster, indexable by cluster id.
    topIndex : iterable of int
        Cluster ids to report, in the order they should be printed.
    nbWord : int
        Maximum number of (word, count) pairs shown per cluster.

    Returns
    -------
    None
        The report is written to stdout, each cluster followed by a line of 20 asterisks.
    """
    separator = '*' * 20
    for index in topIndex:
        # Highest counts first; sorted() is stable, so ties keep dict order.
        ranked = sorted(clusterDistrib[index].items(), key=lambda pair: pair[1], reverse=True)
        print(f"Cluster {index} : {ranked[:nbWord]}")
        print(separator)

In [None]:
# Size of every cluster after fitting.
docCount = np.array(mgpModel.cluster_doc_count)
separator = '*' * 20
print('Number of documents per topic :', docCount)
print(separator)
# Cluster ids ordered from the most to the fewest assigned documents.
topIndex = np.argsort(docCount)[::-1]
print('Most important clusters (by number of docs inside):', topIndex)
print(separator)
# Report the 30 highest-count words of each cluster, largest cluster first.
topWordsPerTopic(mgpModel.cluster_word_distribution, topIndex, 30)