In [76]:
%load_ext autoreload
%autoreload 1

import pandas as pd
import numpy as np
import pickle
import math
import os
import re

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm

import seaborn as sns

import umap
from kneed import KneeLocator
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, centroid
from tabulate import tabulate
from sklearn.metrics import confusion_matrix, classification_report, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
url      = 'https://github.com/MaartenGr/cTFIDF/archive/refs/tags/v0.1.1.tar.gz'
version  = re.search(r'/v(.+?)\.tar\.gz', url).group(1)
dir_name = f'cTFIDF-{version}'
dir_path = os.path.join(os.getcwd(),dir_name)
import sys
if sys.path[-1] != dir_path:
    sys.path.append(dir_path)
from ctfidf import CTFIDFVectorizer

import matplotlib.font_manager
fonts = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
print("Possible fonts: ", end="")
for f in sorted(fonts):
    if 'Narrow' in f:
        print(f.split(os.path.sep)[-1], end=", ")
    elif 'Sans' in f:
        print(f.split(os.path.sep)[-1], end=", ")
print()

Possible fonts: DejaVuSans-Bold.ttf, DejaVuSans-BoldOblique.ttf, DejaVuSans-ExtraLight.ttf, DejaVuSans-Oblique.ttf, DejaVuSans.ttf, DejaVuSansCondensed-Bold.ttf, DejaVuSansCondensed-BoldOblique.ttf, DejaVuSansCondensed-Oblique.ttf, DejaVuSansCondensed.ttf, DejaVuSansMono-Bold.ttf, DejaVuSansMono-BoldOblique.ttf, DejaVuSansMono-Oblique.ttf, DejaVuSansMono.ttf, GillSansBoItNova.ttf, GillSansBoNova.ttf, GillSansCondBoItNova.ttf, GillSansCondBoNova.ttf, GillSansCondExtraItNova.ttf, GillSansCondExtraNova.ttf, GillSansCondItNova.ttf, GillSansCondLightItNova.ttf, GillSansCondLightNova.ttf, GillSansCondNova.ttf, GillSansCondUltraBoNova.ttf, GillSansItNova.ttf, GillSansLightItNova.ttf, GillSansLightNova.ttf, GillSansNova.ttf, GillSansUltraBoNova.ttf, SansSerifCollection.ttf, 


In [78]:
fp = fonts[0] # Ensure at least _something_ is set here
for f in fonts:
    if 'LiberationSansNarrow-Regular' in f:
        fp = f.split(os.path.sep)[-1].split('.')[0]
        break
    elif 'Arial Narrow.ttf' in f:
        fp = f.split(os.path.sep)[-1].split('.')[0]
        break
    elif 'Narrow' in f:
        fp = f.split(os.path.sep)[-1].split('.')[0]
print(f"Using font: {fp}")

fname = ''.join([f' {x}' if x==x.upper() else x for x in fp.split('-')[0]]).strip().replace('  ','')
print(f"  Guessing at font name: {fname}")

# These are font dictionaries for the 's'uper-title, 't'itle, 
# 'a'xis, and 'l'abels.
sfont = {'fontname':fname, 'fontsize':16}
tfont = {'fontname':fname, 'fontsize':12}
afont = {'fontname':fname, 'fontsize':10}
lfont = {'fontname':fname, 'fontsize':8}

Using font: C:\Windows\Fonts\Candaral.ttf
  Guessing at font name: C : \ Windows \ Fonts \ Candaral .ttf


In [79]:
# Random seed
rs = 43

# Which embeddings to use
src_embeddings = 'doc_vec'  #'word_vec'

In [80]:
# Name of the file
fn = 'Amazon_Unlocked_Mobile.csv'

# See if the data has already been downloaded, and
# if not, download it from the web site. We save a
# copy locally so that you can run this tutorial
# offline and also spare the host the bandwidth costs
if os.path.exists(os.path.join('data',fn)):
    df = pd.read_csv(os.path.join('data',fn))
else:
    # We will look for/create a 'data' directory
    if not os.path.exists('data'):
        os.makedirs('data')
   
    # Download and save
    df = pd.read_parquet(f'http://orca.casa.ucl.ac.uk/~jreades/data/{fn}')
    df.to_parquet(os.path.join('data',fn))
    

In [81]:
print(f"Loading columns: {', '.join(df.columns.tolist())}")
print(df.describe())
print(df.shape[0])

df_small = df.head(1000)

Loading columns: Product Name, Brand Name, Price, Rating, Reviews, Review Votes
               Price         Rating   Review Votes
count  407907.000000  413840.000000  401544.000000
mean      226.867155       3.819578       1.507237
std       273.006259       1.548216       9.163853
min         1.730000       1.000000       0.000000
25%        79.990000       3.000000       0.000000
50%       144.710000       5.000000       0.000000
75%       269.990000       5.000000       1.000000
max      2598.000000       5.000000     645.000000
413840


In [82]:
#Load open ai data. 

# Name of the file
fn = 'reviews_summaries.xlsx'

if os.path.exists(os.path.join('data',fn)):
    df_ai = pd.read_excel(os.path.join('data',fn))
    df_ai.to_csv('reviews_summaries.csv', index=False) # in place

print(f"Loading columns: {', '.join(df_ai.columns.tolist())}")
# print(df.describe())
print(df_ai.head(2))

Loading columns: Product Name, Brand Name, Price, Rating, Reviews, Review Votes, Summary
                                        Product Name Brand Name   Price  \
0  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   

   Rating                                            Reviews  Review Votes  \
0       5  I feel so LUCKY to have found this used (phone...             1   
1       4  nice phone, nice up grade from my pantach revu...             0   

                                             Summary  
0  [('positive', 'found this used phone in good c...  
1  [('positive', 'nice upgrade from my Pantach Re...  


In [83]:
print(df_ai.columns)
# df_ai[["Reviews", "Summary"]]
# df_ai[["Reviews"]]

Index(['Product Name', 'Brand Name', 'Price', 'Rating', 'Reviews',
       'Review Votes', 'Summary'],
      dtype='object')


In [84]:
# Python program to generate word vectors using Word2Vec
 
# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
# nltk.download('punkt')
import warnings

warnings.filterwarnings(action = 'ignore')
 
import gensim
from gensim.models import Word2Vec
 
data_reviews = []
# iterate through each sentence in the file
for i in df_ai[["Reviews"]].to_numpy():
    
    temp = []
    # tokenize the sentence into words
    for j in word_tokenize(i[0]):
        temp.append(j.lower())
 
    data_reviews.append(temp)
 
# Create CBOW model
model1 = gensim.models.Word2Vec(data_reviews, min_count = 1, 
                              vector_size = 100, window = 5)
 
# Print results
print("Cosine similarity between 'samsung' " +
               "and 'galaxy' - CBOW : ",
    model1.wv.similarity('samsung', 'galaxy'))
     
print("Cosine similarity between 'samsung' " +
                 "and 'great' - CBOW : ",
      model1.wv.similarity('samsung', 'great'))
 
# Create Skip Gram model
model2 = gensim.models.Word2Vec(data_reviews, min_count = 1, vector_size = 100,
                                             window = 5, sg = 1)
 
# Print results
print("Cosine similarity between 'samsung' " +
          "and 'galaxy' - Skip Gram : ",
    model2.wv.similarity('samsung', 'galaxy'))
     
print("Cosine similarity between 'samsung' " +
            "and 'great' - Skip Gram : ",
      model2.wv.similarity('samsung', 'great'))

Cosine similarity between 'samsung' and 'galaxy' - CBOW :  -0.18181361
Cosine similarity between 'samsung' and 'great' - CBOW :  0.2900546
Cosine similarity between 'samsung' and 'galaxy' - Skip Gram :  0.7133872
Cosine similarity between 'samsung' and 'great' - Skip Gram :  0.908905


In [85]:
data_summary = []
# iterate through each sentence in the file
for i in df_ai[["Summary"]].to_numpy():
    
    temp = []
    # tokenize the sentence into words
    for j in word_tokenize(i[0]):
        temp.append(j.lower())
 
    data_summary.append(temp)

# Create CBOW model
model1_summary = gensim.models.Word2Vec(data_summary, min_count = 1, 
                              vector_size = 100, window = 5)
 
 
# Create Skip Gram model
model2_summary = gensim.models.Word2Vec(data_summary, min_count = 1, vector_size = 100,
                                             window = 5, sg = 1)

In [86]:
import gensim 
from gensim.models import word2vec,KeyedVectors 
from sklearn.metrics.pairwise import cosine_similarity

import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')
glove_vectors.


['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [105]:
# glove_vectors["samsung"]
def review_embedding(text, model = glove_vectors):
    mean = 0
    for j in word_tokenize(text):
        mean += model[j.lower()]
    return mean

def embedding_list(df, col, model = glove_vectors):
    for text in df[[col]].to_numpy():
        text = text[0]
        embeddings = []

        embeddings.append(review_embedding(text, model = glove_vectors))
        # tokenize the sentence into words


# embeddings_summary = embedding_list(df_ai, "Summary") # requires more segmenting.
# df_ai["embeddings_summary"] = embeddings_summary

embeddings_reviews = embedding_list(df_ai, "Reviews") # A bunch of weird characters in so doesn't quite work.
df_ai["embeddings_reviews"] = embeddings_reviews

# fn = "embeddings.csv"
# embeddings = pd.read_csv(os.path.join('data',fn))["embedding"]
# df_ai_embedded = pd.concat([df_ai, embeddings], axis=1)
# df_ai_embedded["embedding"]
# df_ai_embedded["embedding"].iloc[0,]


KeyError: "Key '2.5+' not present"

In [52]:
dmeasure = 'euclidean'
rdims    = 4 # r-dims == Reduced dimensionality
print(f"UMAP dimensionality reduction to {rdims} dimensions with '{dmeasure}' distance measure.")


def x_from_df(df:pd.DataFrame, col:str='Embedding') -> pd.DataFrame:
    cols = ['E'+str(x) for x in np.arange(0,len(df[col].iloc[0]))]
    return pd.DataFrame(df[col].tolist(), columns=cols, index=df.index)


UMAP dimensionality reduction to 4 dimensions with 'euclidean' distance measure.
