In [1]:
%load_ext autoreload
%autoreload 1

import pandas as pd
import numpy as np
import pickle
import math
import os
import re

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm

import seaborn as sns

import umap
from kneed import KneeLocator
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, centroid
from tabulate import tabulate
from sklearn.metrics import confusion_matrix, classification_report, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud

In [2]:
url      = 'https://github.com/MaartenGr/cTFIDF/archive/refs/tags/v0.1.1.tar.gz'
version  = re.search(r'/v(.+?)\.tar\.gz', url).group(1)
dir_name = f'cTFIDF-{version}'
dir_path = os.path.join(os.getcwd(),dir_name)
import sys
if sys.path[-1] != dir_path:
    sys.path.append(dir_path)
from ctfidf import CTFIDFVectorizer

import matplotlib.font_manager
fonts = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
print("Possible fonts: ", end="")
for f in sorted(fonts):
    if 'Narrow' in f:
        print(f.split(os.path.sep)[-1], end=", ")
    elif 'Sans' in f:
        print(f.split(os.path.sep)[-1], end=", ")
print()

Possible fonts: DejaVuSans-Bold.ttf, DejaVuSans-BoldOblique.ttf, DejaVuSans-ExtraLight.ttf, DejaVuSans-Oblique.ttf, DejaVuSans.ttf, DejaVuSansCondensed-Bold.ttf, DejaVuSansCondensed-BoldOblique.ttf, DejaVuSansCondensed-Oblique.ttf, DejaVuSansCondensed.ttf, DejaVuSansMono-Bold.ttf, DejaVuSansMono-BoldOblique.ttf, DejaVuSansMono-Oblique.ttf, DejaVuSansMono.ttf, GillSansBoItNova.ttf, GillSansBoNova.ttf, GillSansCondBoItNova.ttf, GillSansCondBoNova.ttf, GillSansCondExtraItNova.ttf, GillSansCondExtraNova.ttf, GillSansCondItNova.ttf, GillSansCondLightItNova.ttf, GillSansCondLightNova.ttf, GillSansCondNova.ttf, GillSansCondUltraBoNova.ttf, GillSansItNova.ttf, GillSansLightItNova.ttf, GillSansLightNova.ttf, GillSansNova.ttf, GillSansUltraBoNova.ttf, SansSerifCollection.ttf, 


In [3]:
fp = fonts[0] # Ensure at least _something_ is set here
for f in fonts:
    if 'LiberationSansNarrow-Regular' in f:
        fp = f.split(os.path.sep)[-1].split('.')[0]
        break
    elif 'Arial Narrow.ttf' in f:
        fp = f.split(os.path.sep)[-1].split('.')[0]
        break
    elif 'Narrow' in f:
        fp = f.split(os.path.sep)[-1].split('.')[0]
print(f"Using font: {fp}")

fname = ''.join([f' {x}' if x==x.upper() else x for x in fp.split('-')[0]]).strip().replace('  ','')
print(f"  Guessing at font name: {fname}")

# These are font dictionaries for the 's'uper-title, 't'itle, 
# 'a'xis, and 'l'abels.
sfont = {'fontname':fname, 'fontsize':16}
tfont = {'fontname':fname, 'fontsize':12}
afont = {'fontname':fname, 'fontsize':10}
lfont = {'fontname':fname, 'fontsize':8}

Using font: C:\Windows\Fonts\Candaral.ttf
  Guessing at font name: C : \ Windows \ Fonts \ Candaral .ttf


In [4]:
# Random seed
rs = 43

# Which embeddings to use
src_embeddings = 'doc_vec'  #'word_vec'

In [5]:
# Name of the file
fn = 'Amazon_Unlocked_Mobile.csv'

# See if the data has already been downloaded, and
# if not, download it from the web site. We save a
# copy locally so that you can run this tutorial
# offline and also spare the host the bandwidth costs
if os.path.exists(os.path.join('data',fn)):
    df = pd.read_csv(os.path.join('data',fn))
else:
    # We will look for/create a 'data' directory
    if not os.path.exists('data'):
        os.makedirs('data')
   
    # Download and save
    df = pd.read_parquet(f'http://orca.casa.ucl.ac.uk/~jreades/data/{fn}')
    df.to_parquet(os.path.join('data',fn))
    

In [10]:
print(f"Loading columns: {', '.join(df.columns.tolist())}")
print(df.describe())
print(df.shape[0])

df_small = df.head(1000)

Loading columns: Product Name, Brand Name, Price, Rating, Reviews, Review Votes
               Price         Rating   Review Votes
count  407907.000000  413840.000000  401544.000000
mean      226.867155       3.819578       1.507237
std       273.006259       1.548216       9.163853
min         1.730000       1.000000       0.000000
25%        79.990000       3.000000       0.000000
50%       144.710000       5.000000       0.000000
75%       269.990000       5.000000       1.000000
max      2598.000000       5.000000     645.000000
413840
