In [1]:
#https://www.kaggle.com/roshansharma/amazon-alexa-reviews

# Disclaimer
# I don't have any connection with Glassdoor and this
# project is neither approved nor endorsed by them.
# The data collected and made available here was publicly accessible
# (without even logging in to the website) at the moment it was collected.
# This dataset was created for educational purposes.

In [7]:
# General imports
import pandas as pd
import numpy as np
from datetime import datetime

# libraries for visualization
from matplotlib import pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim

# For latent dirichlet allocation
import spacy
import gensim
from gensim import corpora

# For modelling and ELI5 analysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
import eli5

# Set width of notebook
from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:60% !important; }</style>"))

import os
# Work from the pytrust repository root so the relative `data/` and
# `clean_data/` paths (and the `pytrust.*` imports below) resolve.
# Set the PYTRUST_ROOT environment variable to run this on another machine;
# the original author's checkout location is kept as the fallback.
os.chdir(os.environ.get('PYTRUST_ROOT', '/Users/hectortemp/github/pytrust'))
# Used for plotting visualisations
from pytrust.examine.plotting_trust import *

# Used for cleaning (lemmatizing, removing punctuation, lower case etc...)
from pytrust.examine.clean_words import *

In [8]:
# Step 1: find all csvs in data folder
import glob

# glob returns files in arbitrary, OS-dependent order; sorting keeps the row
# order of `data` (and therefore the seeded train/validation split further
# down) reproducible across machines.
list_of_csvs = sorted(glob.glob("data/*.csv"))

# Company name = second '_'-separated token of the path once 'www_' is removed.
# NOTE(review): this relies on the scraper's file-naming scheme, which is not
# visible in this notebook - confirm before adding new files.
comp_names = [i.replace('www_', '').split('_')[1] for i in list_of_csvs]

# Read them all in.
# Collect the per-company frames and concatenate once: calling pd.concat inside
# the loop re-copies every previously loaded row on each iteration.
frames = []
for comp, path in zip(comp_names, list_of_csvs):
    df_hold = pd.read_csv(path)
    df_hold.insert(0, 'company', comp)
    frames.append(df_hold)

# ignore_index gives every review a unique row label (each CSV otherwise
# contributes its own 0..n-1 index). pd.concat raises on an empty list, so an
# empty data folder still yields an empty DataFrame as before.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [12]:
# Number of reviews loaded for each company (value_counts sorts descending).
reviews_per_company = data['company'].value_counts()
reviews_per_company

dfs                45137
amazon             14612
ubereats           12732
ocado              11274
just-eat            7427
deliveroo           7040
asda                6259
tesco               5886
sainsburys          4243
morrisons           3593
marksandspencer     3186
aldi                2457
iceland             2359
waitrose            1876
lidl                1510
co-operative         558
farmfoods            160
booths                20
Name: company, dtype: int64

In [None]:
# Step 2: clean data
# clean_data comes from the pytrust helper imports at the top of the notebook;
# the later cells rely on it having added `title_clean` / `content_clean`.
data = data.pipe(clean_data)

In [None]:
# Step 3a: How do we know which words to remove
# Count how often each token appears across all cleaned review bodies and show
# the 20 most frequent ones as candidates for the custom stop-word list.
# explode() turns every token into its own row; rows whose content is missing
# (NaN) or empty are dropped by value_counts() instead of raising a TypeError
# the way iterating over a NaN would.
word_counts = (
    data['content_clean']
    .str.split()
    .explode()
    .value_counts()
)
word_counts.head(20)

In [None]:
# Step 3b: remove custom stopwords
# Maps each cleaned column to the raw column it replaces in step 4.
clean_to_raw = {'title_clean': 'title', 'content_clean': 'content'}

# The company names themselves are used as the custom stop words.
# NOTE(review): some names are hyphenated (e.g. 'just-eat'); whether they still
# match text after the punctuation stripping done during cleaning is not
# visible here - confirm.
custom_stopwords = comp_names
data = remove_custom_stopwords(
    df=data,
    custom_stop=custom_stopwords,
    cols=list(clean_to_raw),
)

# Step 4: replace content and title with the clean versions
# Once the *_clean columns are dropped this cell cannot be re-run without
# re-running the cleaning step first.
REPLACE = True
if REPLACE:
    for clean_col, raw_col in clean_to_raw.items():
        data[raw_col] = data[clean_col]
    data = data.drop(columns=list(clean_to_raw))

# Step 5: Create quantitative features
# Later cells read `content_num_words` and `content_num_char` from the result.
data = create_fea(data)

# Step 6: Fix spelling - not yet implemented

In [None]:
# Persist a dated snapshot of the fully cleaned data set (taken before any of
# the trimming / filtering done in the cells below).
today = datetime.today().strftime('%Y_%m_%d')

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('clean_data', exist_ok=True)
data.to_csv(f'clean_data/{today}_FULL_clean.csv')

print(data.shape)
data.head()

In [None]:
# Date range covered by the reviews.
# NOTE(review): min/max follow the dtype of `date`; if it is still a string
# column the comparison is lexicographic - confirm.
oldest_review = data['date'].min()
newest_review = data['date'].max()
print("Oldest review:", oldest_review, ", Newest review:", newest_review)

# Discard the longest reviews (by word count) as outliers.
# `data` is overwritten, so re-running this cell trims a further 2% each time.
top_pct_to_drop = 0.02
n_longest_to_drop = round(data.shape[0] * top_pct_to_drop)
data = data.sort_values('content_num_words', ascending=False).iloc[n_longest_to_drop:, :]

In [None]:
# Let's also remove reviews with no content
# "No content" here means a body of zero or exactly one character; the shape is
# printed before and shown after so the number of dropped rows is visible.
print(data.shape)
content_len = data['content'].str.len()
is_too_short = (content_len == 0) | (content_len == 1)
data = data[~is_too_short]
data.shape

In [None]:
# Pie chart over the filtered reviews.
# NOTE(review): plot_pie_chart is not defined in this notebook (presumably it
# arrives via the star import from pytrust.examine.plotting_trust); which
# column it summarises is not visible here - confirm.
plot_pie_chart(df = data)

In [None]:
# Funnel plot over the filtered reviews.
# NOTE(review): plot_star_funnel is not defined in this notebook; judging by
# its name it breaks reviews down by star rating - confirm against the helper.
plot_star_funnel(data)

In [None]:
# Distribution of review length in characters, for 5-star ("positive") and
# 1-star ("negative") reviews. The titles previously had an unbalanced
# double quote around the sentiment word.
five_star_reviews = data[data['num_stars'] == 5]
one_star_reviews = data[data['num_stars'] == 1]

plot_dist3(five_star_reviews, 'content_num_char',
       'Characters Per "Positive" review')
plot_dist3(one_star_reviews, 'content_num_char',
       'Characters Per "Negative" review')

In [None]:
# Compare the review bodies of 5-star reviews (first argument) with those of
# 1-star reviews (second argument).
five_star_content = data.loc[data['num_stars'] == 5, 'content']
one_star_content = data.loc[data['num_stars'] == 1, 'content']
plot_word_len_histogram(five_star_content, one_star_content)

In [None]:
# Distribution of review length in words, for 5-star ("positive") and
# 1-star ("negative") reviews. The titles previously had an unbalanced
# double quote around the sentiment word.
five_star_reviews = data[data['num_stars'] == 5]
one_star_reviews = data[data['num_stars'] == 1]

plot_dist3(five_star_reviews, 'content_num_words',
       'Words Per "Positive" review')
plot_dist3(one_star_reviews, 'content_num_words',
       'Words Per "Negative" review')

In [None]:
# One histogram of review length (characters) per star rating, side by side.
# sns.set changes the global seaborn theme, so the larger font also applies to
# every plot drawn after this cell.
sns.set(font_scale = 2)
g = sns.FacetGrid(data, col='num_stars', height=4).map(plt.hist, 'content_num_char')

# Leave headroom above the facets for the figure-level title.
plt.subplots_adjust(top=0.8)
g.fig.suptitle('Size of review distribution, by number of stars')
plt.show()

In [None]:
# Most common single words, first in the review bodies, then in the titles.
unigram_opts = dict(df = data, n = 1, title = 'Most Common Unigrams', mx_df = 0.9)
ngrams(content_or_title = 'content', **unigram_opts)
ngrams(content_or_title = 'title', **unigram_opts)

In [None]:
# Most common two-word sequences, first in the review bodies, then in the titles.
bigram_opts = dict(df = data, n = 2, title = 'Most Common Bigrams', mx_df = 0.9)
ngrams(content_or_title = 'content', **bigram_opts)
ngrams(content_or_title = 'title', **bigram_opts)

In [None]:
# Most common three-word sequences, first in the review bodies, then in the titles.
trigram_opts = dict(df = data, n = 3, title = 'Most Common Trigrams', mx_df = 0.9)
ngrams(content_or_title = 'content', **trigram_opts)
ngrams(content_or_title = 'title', **trigram_opts)

In [None]:
# Use the same package path as the plotting import at the top of the notebook;
# a bare `plotting_trust` module is only importable when pytrust/examine itself
# is on sys.path.
from pytrust.examine.plotting_trust import display_topics

In [None]:
# Topics for 5-star reviews: titles first, then review bodies.
five_star_reviews = data[data['num_stars'] == 5]
section_break = '\n======================================\n'

print('Topics for TITLE of review')
display_topics(five_star_reviews['title'],
               no_top_words = 5,
               topic = 'Positive review topics \n',
               components = 10)

# The separator is deliberately printed twice between the two sections.
print(section_break)
print(section_break)

print('Topics for BODY of review')
display_topics(five_star_reviews['content'],
               no_top_words = 5,
               topic = 'Positive review topics \n',
               components = 10)

In [None]:
# Topics for 1-star reviews: titles first, then review bodies.
one_star_reviews = data[data['num_stars'] == 1]
section_break = '\n======================================\n'

print('Topics for TITLE of review')
display_topics(one_star_reviews['title'],
               no_top_words = 5,
               topic = 'Negative review topics \n',
               components = 10)

# The separator is deliberately printed twice between the two sections.
print(section_break)
print(section_break)

print('Topics for BODY of review')
display_topics(one_star_reviews['content'],
               no_top_words = 5,
               topic = 'Negative review topics \n',
               components = 10)

In [None]:
# Time-series view of the reviews with the variance series switched on and the
# count / kurtosis series switched off.
# NOTE(review): time_series_slider is not defined in this notebook; `window = 30`
# is presumably a rolling-window length, but its unit (days vs. rows) is not
# visible here - confirm against the helper.
time_series_slider(df = data, window = 30, add_count = False, add_var = True, add_kurt = False)

In [None]:
# Prep data
# Binary sentiment target: neutral 3-star reviews are excluded, 1-2 stars are
# negative (0) and 4-5 stars are positive (1). -9999 is a sentinel that only
# survives if a row matches neither rule.
data_for_reg = data[data['num_stars'] != 3].copy()
data_for_reg.loc[:,'target'] = -9999
data_for_reg.loc[data_for_reg['num_stars'] < 3, 'target'] = 0 # 0 negative
data_for_reg.loc[data_for_reg['num_stars'] > 3, 'target'] = 1 # 1 positive

X_full = data_for_reg['content']
y_full = data_for_reg['target']
y = y_full

# Split the raw text BEFORE vectorising so the TF-IDF vocabulary and IDF
# weights are learned from the training reviews only; fitting the vectoriser
# on all reviews leaks validation-set statistics into the features.
text_train, text_valid, y_train, y_valid = train_test_split(
    X_full, y, test_size=0.20, random_state=23, stratify=y)

vect = TfidfVectorizer()
X_train = vect.fit_transform(text_train)
X_valid = vect.transform(text_valid)

model = LogisticRegression()
model.fit(X_train, y_train)

# sklearn metrics take (y_true, y_pred / y_score) in that order. ROC AUC needs
# a continuous score, so use the predicted probability of the positive class
# (column 1 of predict_proba, i.e. model.classes_[1]) rather than hard labels.
train_pred = model.predict(X_train)
train_score = model.predict_proba(X_train)[:, 1]
print("Train Set Accuracy: {}".format(metrics.accuracy_score(y_train, train_pred)))
print("Train Set ROC: {}\n".format(metrics.roc_auc_score(y_train, train_score)))

valid_pred = model.predict(X_valid)
valid_score = model.predict_proba(X_valid)[:, 1]
print("Validation Set Accuracy: {}".format(metrics.accuracy_score(y_valid, valid_pred)))
print("Validation Set ROC: {}".format(metrics.roc_auc_score(y_valid, valid_score)))

In [None]:
# classification_report expects (y_true, y_pred); with the arguments reversed
# the precision and recall columns of the report are swapped.
print(metrics.classification_report(y_valid, model.predict(X_valid)))

In [None]:
# Confusion matrix on the validation set: normalised over the true labels
# (left) and raw counts (right).
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement when upgrading.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
axes = axes.flatten()
ax_normalised, ax_raw = axes

sns.set(font_scale=2.0)
plot_confusion_matrix(model, X_valid, y_valid, normalize = 'true', ax = ax_normalised)
plot_confusion_matrix(model, X_valid, y_valid, normalize = None, ax = ax_raw)

ax_normalised.set_title('Normalised confusion matrix', fontsize = 24)
ax_raw.set_title('Raw confusion matrix', fontsize = 24)
plt.show()

In [None]:
# Show the 100 highest-weighted features of the fitted model, mapped back to
# tokens through the fitted vectoriser. Classes are labelled with the raw
# target values used when building the target column (0 negative, 1 positive).
target_names = [0, 1]
weights_view = eli5.show_weights(
    model,
    vec=vect,
    top=100,
    target_names=target_names,
)
weights_view