In [1]:
import json
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import gensim
import spacy
from numpy import linalg as LA
from tqdm import tqdm
import matplotlib.pyplot as plt

In [3]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
# This needs the Colab runtime and an interactive authorization step.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


# Product classification based on title and description

In [4]:
# Import meta data: the file is parsed line by line, one JSON document per line.
# The context manager closes the file handle as soon as parsing is done
# (the bare open() call used before left it open for the life of the kernel).
with open('/content/drive/My Drive/Data Mining/meta_Luxury_Beauty.json', 'rt') as meta_file:
    meta_data = [json.loads(line) for line in meta_file]

In [5]:
# Keep only the product id ('asin') and the product title,
# then discard rows in which either of the two is missing
meta = pd.DataFrame(meta_data).loc[:, ['asin', 'title']].dropna()

In [7]:
# Preview the first rows of the selected columns
meta.head()

Unnamed: 0,asin,title
0,B00004U9V2,Crabtree &amp; Evelyn - Gardener's Ultra-Moist...
1,B0000531EN,AHAVA Bath Salts
2,B0000532JH,"AHAVA Dead Sea Mineral Mud, 8.5 oz, Pack of 4"
3,B00005A77F,"Crabtree &amp; Evelyn Hand Soap, Gardeners, 10..."
4,B00005NDTD,Soy Milk Hand Crme


In [8]:
# Work on an independent, de-duplicated copy so that `meta` stays untouched
df_meta = meta.drop_duplicates().copy()

1. Preprocess Data

In [9]:
# Build the set of English stop words from the NLTK corpus;
# it is used below to filter tokens out of the product titles
stopset = set(stopwords.words('english'))

In [10]:
# Remove newline characters and the '&amp;' HTML entity from the raw title,
# then convert the result to lower case
df_meta['title_lower'] = (
    df_meta['title']
    .str.replace('\n', '', regex=False)
    .str.replace('&amp;', '', regex=False)
    .str.lower()
)

In [11]:
# Strip punctuation, split on whitespace and drop every stop word,
# leaving one list of tokens per title
df_meta['preprocess_title'] = (
    df_meta['title_lower']
    .str.translate(str.maketrans('', '', string.punctuation))
    .str.split()
    .apply(lambda tokens: [token for token in tokens if token not in stopset])
)

In [12]:
# spaCy expects plain text, so glue each token list back into one space-separated string
df_meta['preprocess_title_str'] = df_meta['preprocess_title'].map(' '.join)

2. Classify products into different categories - makeup, skin care, hair care, fragrance, hand care, nail care, oral care

In [13]:
# Load word2vec model
# Reads the GoogleNews vectors stored in word2vec binary format (binary = True) from Drive.
# `model[word]` is used below to look up a word's vector and `model.vocab` to test membership.
model = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/Data Mining/GoogleNews-vectors-negative300.bin', binary = True) 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [14]:
# Choose seven categories
product = ['makeup', 'skin', 'hair', 'fragrance', 'hand', 'nail', 'tooth']

# Convert the seven words to vectors.
# Each word is paired with its own vector in a single pass. Filtering the vectors
# by vocabulary and then zipping them against the unfiltered word list would shift
# every later name onto the wrong vector as soon as one word is out of vocabulary.
prod_vec = {word: model[word] for word in product if word in model.vocab}
prod_vec_list = list(prod_vec.values())

# One row per category word, one column per embedding dimension
df_prod = pd.DataFrame(prod_vec).transpose()
df_prod

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
makeup,0.162109,0.180664,0.099121,-0.043701,-0.203125,0.166992,-0.09375,-0.498047,-0.008728,-0.032227,0.148438,-0.077148,0.008057,-0.106445,-0.025635,0.193359,0.050781,0.332031,0.102539,-0.308594,-0.245117,0.028076,-0.138672,0.183594,-0.01178,0.40625,0.02771,-0.112793,-0.111816,0.028809,-0.289062,0.169922,0.023438,-0.053711,-0.086914,0.175781,0.083984,0.160156,-0.003113,-0.057617,...,0.146484,-0.104004,-0.277344,-0.146484,0.412109,0.269531,-0.045898,0.177734,-0.09082,-0.241211,-0.035645,0.214844,-0.054932,0.308594,0.245117,-0.037842,-0.002808,-0.031738,0.064941,0.111328,0.12793,0.287109,0.103516,0.431641,-0.08252,-0.145508,0.039795,-0.039795,0.279297,-0.161133,-0.333984,0.093262,-0.087891,0.047363,0.045654,0.181641,0.267578,0.157227,0.163086,-0.137695
skin,-0.077148,0.339844,0.120117,-0.277344,-0.273438,0.185547,0.040283,-0.165039,-0.022583,-0.023682,-0.170898,-0.131836,-0.143555,-0.137695,-0.091309,0.146484,-0.164062,0.582031,-0.185547,0.107422,-0.160156,-0.092773,0.149414,-0.09668,-0.353516,-0.255859,0.048828,0.209961,0.081055,-0.228516,-0.287109,0.014709,-0.033936,-0.138672,-0.105469,-0.014404,-0.118652,0.103027,0.058594,-0.049805,...,0.139648,-0.318359,-0.306641,-0.013245,0.257812,0.031738,-0.121094,-0.180664,-0.059326,-0.204102,-0.057861,0.208008,-0.046631,0.136719,0.212891,-0.162109,0.005096,0.019165,-0.15918,0.053223,0.287109,0.031982,-0.081055,0.018433,-0.083496,-0.194336,-0.034424,0.204102,-0.062988,-0.123047,0.148438,0.131836,-0.046875,0.074219,0.228516,0.359375,0.099121,0.089355,-0.043945,0.109375
hair,-0.146484,0.186523,0.219727,-0.080566,-0.128906,0.091797,0.164062,-0.421875,0.170898,0.035156,-0.182617,-0.198242,-0.108887,0.183594,-0.057129,-0.004272,0.124512,0.369141,0.1875,-0.277344,-0.429688,-0.136719,-0.000637,0.029175,-0.135742,-0.071777,-0.132812,-0.050781,0.361328,-0.126953,-0.117676,0.249023,0.119629,-0.210938,-0.170898,-0.265625,0.117676,-0.123535,0.054688,0.151367,...,0.202148,0.07959,-0.285156,0.040283,0.141602,0.328125,-0.273438,0.068359,0.095703,-0.105957,-0.279297,0.161133,-0.063965,0.283203,0.447266,-0.296875,0.100586,-0.011902,0.054443,-0.120605,0.245117,-0.037109,0.03125,0.209961,-0.241211,0.033936,0.016113,-0.04248,-0.036377,0.140625,0.067871,0.21875,-0.144531,0.128906,0.137695,0.016724,0.233398,0.289062,-0.105957,0.166016
fragrance,0.118164,0.072266,0.087891,0.051514,0.021606,0.077148,-0.101074,-0.503906,0.031982,0.180664,-0.3125,-0.234375,0.125,0.087891,-0.194336,0.12207,0.175781,0.261719,0.129883,0.115723,-0.1875,0.061279,0.165039,-0.291016,-0.15625,0.277344,0.193359,0.28125,-0.034424,0.116699,-0.201172,0.089355,-0.271484,0.103516,-0.193359,-0.051025,0.124023,-0.302734,-0.0625,-0.15332,...,0.131836,-0.016724,-0.237305,0.046875,0.216797,-0.088379,-0.371094,-0.137695,-0.006531,0.133789,-0.061768,0.008911,-0.253906,0.133789,0.511719,0.197266,-0.014038,-0.142578,0.011658,0.380859,-0.185547,0.205078,0.186523,0.037842,0.185547,-0.061523,-0.199219,0.062012,0.095215,0.314453,-0.105957,0.058838,0.025513,-0.205078,0.106445,-0.037598,0.047363,0.417969,0.098145,0.115723
hand,0.093262,-0.046387,-0.133789,0.048096,-0.316406,0.10791,0.300781,-0.28125,-0.018311,0.048584,-0.044434,-0.182617,-0.117188,0.137695,-0.178711,0.003342,-0.107422,0.013428,0.089844,0.072266,0.216797,0.103516,0.189453,0.057617,0.037598,-0.251953,-0.059814,-0.010071,-0.028442,0.12793,0.013,0.014343,-0.10791,0.12793,-0.178711,0.046143,0.006409,0.116699,-0.086914,-0.15332,...,-0.1875,-0.083496,-0.07959,-0.091797,-0.121582,0.077637,-0.091309,0.069824,-0.138672,5.2e-05,-0.146484,0.160156,-0.065918,0.071289,0.171875,-0.091309,0.060547,-0.138672,-0.144531,-0.000641,0.148438,-0.069824,0.197266,0.109863,0.077148,0.124023,-0.232422,-0.066406,0.107422,0.017456,-0.057617,-0.012451,-0.068359,-0.070312,0.176758,-0.039062,0.067383,-0.071777,-0.207031,-0.060303
nail,0.047607,0.197266,-0.060791,-0.376953,-0.054199,0.077637,0.175781,-0.10791,0.238281,0.182617,-0.180664,-0.101562,-0.259766,0.097168,0.025757,0.244141,0.273438,0.244141,-0.207031,-0.304688,0.002213,0.036621,0.248047,-0.022095,-0.203125,0.164062,-0.03125,-0.105957,0.065918,-0.114258,-0.128906,0.160156,0.226562,0.206055,-0.220703,-0.11084,-0.169922,0.208984,0.287109,-0.006256,...,-0.046631,0.101562,-0.236328,0.058105,0.339844,0.199219,-0.164062,-0.025269,0.207031,0.056641,0.131836,0.132812,0.28125,0.041992,0.032471,0.15625,-0.157227,-0.124023,0.019165,0.214844,-0.203125,-0.018677,-0.081055,0.213867,-0.032227,-0.048584,-0.228516,-0.120605,-0.304688,0.124512,0.194336,0.167969,-0.209961,0.105957,-0.167969,0.044922,0.029297,-0.003494,0.101074,0.122559
tooth,-0.022095,0.061035,-0.037109,-0.223633,-0.589844,0.199219,0.392578,-0.165039,0.126953,0.145508,-0.261719,-0.059326,-0.283203,0.016235,-0.025879,0.086426,-0.026733,0.300781,-0.180664,-0.261719,-0.079102,-0.030762,0.114258,-0.171875,-0.34375,0.117188,-0.316406,-0.073242,0.382812,-0.120117,0.037598,0.112793,0.384766,0.044922,0.058105,0.026001,-0.057861,0.355469,-0.041992,-0.023071,...,0.054199,0.051025,-0.439453,0.198242,0.093262,0.209961,-0.248047,-0.045898,0.158203,0.328125,-0.135742,0.253906,0.103516,0.131836,0.400391,-0.164062,0.107422,-0.337891,-0.144531,0.138672,0.000431,-0.273438,0.060791,0.197266,-0.095703,-0.28125,0.006744,0.020142,-0.243164,0.071777,0.177734,0.060303,-0.052002,0.116211,-0.057373,0.208984,-0.087402,0.279297,-0.188477,0.03833


In [15]:
# Identify nouns from product title
# Load spaCy's 'en_core_web_sm' pipeline; pos_tagging() below reads this module-level `nlp`
nlp = spacy.load('en_core_web_sm')

def pos_tagging(data):
    """Collect the lemmas of the noun tokens found in an iterable of texts.

    Each text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept when its fine-grained tag is 'NN' or 'NNS' and its shape is not
    'x', 'xx' or 'xxx' (spaCy's shape strings for all-lowercase tokens of one
    to three letters).

    Entries that are not strings (for example NaN placeholders for missing
    text) and texts the pipeline fails to parse are skipped individually, and
    a single warning reports how many were skipped.

    Parameters
    ----------
    data : iterable
        Texts to tag, e.g. a pandas Series of strings.

    Returns
    -------
    list of str
        Lemmas of the selected tokens in encounter order; repeats are kept.
    """
    import warnings

    req_tag = ('NN', 'NNS')
    short_shapes = ('x', 'xx', 'xxx')
    extracted_words = []
    skipped = 0
    for x in data:
        # A non-string entry cannot be parsed; skip it rather than abort.
        if not isinstance(x, str):
            skipped += 1
            continue
        # Guard each text on its own: a blanket handler around the whole loop
        # would silently drop every text that follows the first failure.
        try:
            doc = nlp(x)
        except Exception:
            skipped += 1
            continue
        for token in doc:
            if token.tag_ in req_tag and token.shape_ not in short_shapes:
                extracted_words.append(token.lemma_)
    if skipped:
        warnings.warn('pos_tagging skipped %d unparsable entries' % skipped)
    return extracted_words
                
# Noun lemmas gathered from every preprocessed product title
extract_words = pos_tagging(df_meta['preprocess_title_str'])

In [16]:
# Keep the extracted nouns that the word2vec model knows, and look up their vectors
words_filtered = [noun for noun in extract_words if noun in model.vocab]
vector_list = [model[noun] for noun in words_filtered]

# Map each distinct word to its vector
word_vec_zip = zip(words_filtered, vector_list)
word_vec = dict(word_vec_zip)

# One row per distinct title word, one column per embedding dimension
df_tv = pd.DataFrame(word_vec).T
df_tv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
hand,0.093262,-0.046387,-0.133789,0.048096,-0.316406,0.10791,0.300781,-0.28125,-0.018311,0.048584,-0.044434,-0.182617,-0.117188,0.137695,-0.178711,0.003342,-0.107422,0.013428,0.089844,0.072266,0.216797,0.103516,0.189453,0.057617,0.037598,-0.251953,-0.059814,-0.010071,-0.028442,0.12793,0.013,0.014343,-0.10791,0.12793,-0.178711,0.046143,0.006409,0.116699,-0.086914,-0.15332,...,-0.1875,-0.083496,-0.07959,-0.091797,-0.121582,0.077637,-0.091309,0.069824,-0.138672,5.2e-05,-0.146484,0.160156,-0.065918,0.071289,0.171875,-0.091309,0.060547,-0.138672,-0.144531,-0.000641,0.148438,-0.069824,0.197266,0.109863,0.077148,0.124023,-0.232422,-0.066406,0.107422,0.017456,-0.057617,-0.012451,-0.068359,-0.070312,0.176758,-0.039062,0.067383,-0.071777,-0.207031,-0.060303
therapy,0.058105,0.28125,-0.02478,0.310547,0.080566,0.225586,0.075195,-0.453125,-0.142578,-0.167969,-0.060547,-0.142578,0.100098,0.057861,0.271484,0.455078,0.005798,0.088379,0.12793,-0.289062,0.515625,0.302734,-0.072754,0.103516,-0.296875,-0.068359,-0.498047,0.135742,-0.086914,-0.089844,0.099609,-0.135742,-0.369141,0.058105,-0.484375,-0.168945,-0.126953,0.130859,0.119629,-0.367188,...,-0.21875,-0.263672,-0.017334,-0.158203,0.057373,0.112793,0.188477,-0.175781,-0.166992,0.117676,0.111328,0.228516,0.064453,0.02356,-0.053711,-0.028564,-0.361328,0.046631,0.124023,0.036621,0.198242,0.355469,0.273438,-0.146484,-0.066895,-0.202148,0.394531,-0.080078,-0.135742,0.110352,-0.19043,-0.030762,0.002487,0.094727,0.176758,0.124023,-0.09082,0.335938,0.22168,0.085449
pump,-0.022217,0.228516,-0.166016,0.061035,-0.324219,0.097656,-0.026367,-0.170898,0.104004,0.185547,-0.021606,0.077148,-0.121094,0.149414,-0.106445,0.182617,-0.298828,0.05957,0.110352,0.030151,0.04834,0.241211,-0.009827,0.170898,-0.009155,-0.038818,0.024902,0.095703,0.01062,0.119629,-0.318359,-0.019409,0.005829,0.001129,-0.223633,-0.271484,0.283203,0.140625,-0.082031,-0.027588,...,0.132812,0.054688,-0.154297,0.277344,0.149414,0.144531,-0.019165,-0.185547,0.21875,0.177734,0.022827,0.25,-0.029297,-0.149414,-0.078613,0.038086,-0.121582,-0.052246,0.191406,-0.053223,0.033936,0.170898,0.232422,-0.050049,0.068359,-0.267578,0.150391,-0.047363,0.216797,-0.148438,-0.041016,0.092285,-0.22168,0.005249,-0.098633,0.081543,-0.19043,0.061523,0.138672,-0.161133
salt,-0.057373,0.222656,0.204102,0.046631,-0.027344,0.296875,-0.102539,-0.172852,0.024414,0.310547,0.051514,-0.177734,-0.04126,0.143555,-0.082031,-0.060547,-0.310547,0.037842,-0.073242,-0.043213,0.013611,0.028076,0.079102,-0.172852,0.283203,0.128906,-0.013306,0.279297,-0.298828,-0.253906,-0.121582,0.34375,0.244141,-0.169922,-0.257812,-0.082031,0.410156,-0.008728,0.291016,0.002487,...,-0.036133,-0.149414,-0.246094,0.000603,0.132812,0.182617,-0.027466,-0.033691,0.318359,-0.355469,-0.044434,-0.036865,0.179688,-0.179688,-0.142578,-0.03833,0.034424,0.043701,0.041504,0.166992,0.009216,-0.016846,0.318359,0.072266,-0.186523,-0.151367,-0.227539,0.386719,0.070801,0.326172,-0.357422,-0.478516,-0.131836,0.162109,0.167969,0.054199,0.166016,0.197266,-0.053955,-0.161133
pack,0.169922,0.157227,-0.273438,0.059082,0.067871,0.074707,-0.035156,-0.019409,-0.162109,0.253906,0.227539,0.081543,-0.132812,-0.039795,-0.163086,0.052002,0.210938,0.108887,-0.00116,0.039062,0.263672,0.009766,-0.02832,-0.027954,0.018311,0.026733,-0.157227,0.324219,0.071777,0.017456,0.131836,0.084473,-0.198242,0.211914,-0.149414,-0.143555,-0.063477,0.024048,-0.114746,0.076172,...,-0.02124,0.072754,-0.269531,0.027344,-0.074219,-0.057373,0.026733,0.018066,-0.170898,0.064941,-0.102051,-0.183594,-0.045166,0.059326,0.164062,0.206055,0.205078,-0.34375,-0.062988,-0.22168,0.101074,-0.064941,0.203125,-0.137695,-0.304688,0.088379,-0.251953,0.28125,0.28125,-0.193359,0.223633,-0.115234,-0.121582,0.077637,-0.031738,-0.135742,-0.013855,0.114258,-0.065918,-0.026123


In [17]:
# Cosine similarity between every category word (columns) and every title word (rows)
similarity_title = pd.DataFrame(columns = df_prod.index)
for prod_word, prod_vector in df_prod.iterrows():
    for title_word, title_vector in df_tv.iterrows():
        norm_product = LA.norm(prod_vector) * LA.norm(title_vector)
        similarity_title.loc[title_word, prod_word] = prod_vector.dot(title_vector) / norm_product

similarity_title.head()

Unnamed: 0,makeup,skin,hair,fragrance,hand,nail,tooth
hand,0.103074,0.14309,0.146204,0.00478804,1.0,0.254954,0.187303
therapy,0.113341,0.243095,0.121042,0.161655,-0.0433404,0.0905111,0.11565
pump,0.0158298,0.0528402,-0.00383429,0.0825247,0.144317,0.113884,0.0338933
salt,0.192497,0.196738,0.158162,0.172761,0.0698339,0.0696595,0.176711
pack,0.0285315,0.0698041,0.0156931,0.0496076,0.115804,0.0975997,0.159012


In [19]:
# For each category word, rank the title words by cosine similarity (highest first)
# and keep at most 15 of those whose similarity exceeds 0.3

# makeup
a = similarity_title['makeup'].sort_values(ascending=False)
makeup_top15 = a[a > 0.3].head(15).index.values

# skin
b = similarity_title['skin'].sort_values(ascending=False)
skin_top15 = b[b > 0.3].head(15).index.values

# hair
c = similarity_title['hair'].sort_values(ascending=False)
hair_top15 = c[c > 0.3].head(15).index.values

# fragrance
d = similarity_title['fragrance'].sort_values(ascending=False)
fragrance_top15 = d[d > 0.3].head(15).index.values

# hand
e = similarity_title['hand'].sort_values(ascending=False)
hand_top15 = e[e > 0.3].head(15).index.values

# nail
f = similarity_title['nail'].sort_values(ascending=False)
nail_top15 = f[f > 0.3].head(15).index.values

# tooth (used for the oral care category)
g = similarity_title['tooth'].sort_values(ascending=False)
oral_top15 = g[g > 0.3].head(15).index.values

In [20]:
# Drop inaccurate word in each category and keep top words
# The positions passed to np.delete index into the similarity-ranked *_top15 arrays.
# NOTE(review): these positions look hand-picked from one particular run's ranking;
# if the data or the ranking changes they will point at different words - confirm before reuse.
makeup_tw = np.delete(makeup_top15, [7,11,13])
skin_tw = np.delete(skin_top15, 1)
hair_tw = np.delete(hair_top15, [1,5,9,13])
fragrance_tw = np.delete(fragrance_top15, [5,11])
# For 'hand' only the first four ranked words are kept
hand_tw = hand_top15[0:4]
nail_tw = np.delete(nail_top15, 1)
oral_tw = np.delete(oral_top15,[3,5,6,7])

In [21]:
# Classify products to different categories based on the title of the product
df_category = pd.DataFrame(columns = ['makeup', 'skin_care', 'hair_care', 'fragrance', 'hand_care', 'nail_care', 'oral_care'])

# Top words of each category, listed in the same order as the df_category columns.
# The list does not depend on the product, so it is built once outside the loop.
category_group = [makeup_tw, skin_tw, hair_tw, fragrance_tw, hand_tw, nail_tw, oral_tw]

# Group the titles by product once instead of re-filtering the whole frame for
# every asin; sort = False keeps the products in first-appearance order.
titles_by_product = df_meta.groupby('asin', sort = False)['preprocess_title_str']

for p, titles in tqdm(titles_by_product):
    # Nouns extracted from this product's title(s); a set makes the lookups below cheap
    extract_noun = set(pos_tagging(titles))

    # One point per category top word that occurs among the product's nouns
    for column, top_words in zip(df_category.columns, category_group):
        score = sum(1 for k in top_words if k in extract_noun)
        # A cell is written only when at least one word matched, so products
        # without any match get no row and unmatched categories stay NaN.
        if score > 0:
            df_category.loc[p, column] = score

100%|██████████| 12111/12111 [02:42<00:00, 74.73it/s]


In [22]:
# Categories that never matched a product are still NaN; score them as 0
df_category = df_category.fillna(0)

In [23]:
# Cast the seven score columns to float in a single call
df_category = df_category.astype({
    score_name: 'float'
    for score_name in ['makeup', 'skin_care', 'hair_care', 'fragrance',
                       'hand_care', 'nail_care', 'oral_care']
})

In [24]:
# Choose the category with the largest points (on a tie the leftmost column wins).
# The result column itself is left out of the comparison so that re-running this
# cell does not feed the string 'category' column into idxmax.
df_category['category'] = (
    df_category
    .drop(columns='category', errors='ignore')
    .idxmax(axis="columns")
)

In [25]:
# Show rows 20-29 of the products labelled skin_care as a spot check
df_category[df_category['category'] == 'skin_care'][20:30]

Unnamed: 0,makeup,skin_care,hair_care,fragrance,hand_care,nail_care,oral_care,category
B00067WRC6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,skin_care
B000684GPQ,0.0,1.0,0.0,0.0,0.0,0.0,0.0,skin_care
B000684H82,0.0,1.0,0.0,0.0,0.0,0.0,0.0,skin_care
B0006FMUJ8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,skin_care
B0006FMQWO,0.0,1.0,0.0,0.0,0.0,0.0,0.0,skin_care
B0006FMR28,0.0,1.0,0.0,0.0,0.0,0.0,0.0,skin_care
B0006NY1AQ,0.0,1.0,0.0,1.0,0.0,0.0,0.0,skin_care
B0006VSVJ0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,skin_care
B0007RUV8M,0.0,1.0,0.0,1.0,0.0,0.0,0.0,skin_care
B00095LIP2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,skin_care


In [26]:
# Look up the original title of one product labelled skin_care
df_meta[df_meta['asin'] == 'B0006FMR28']['title'].values

array(['PCA SKIN C-Quench Antioxidant Serum, 1 fl. oz.'], dtype=object)

# Aspect-based Sentiment Analysis
Skin care products are used as the example.

In [27]:
# Import raw data: the review file is parsed line by line, one JSON document per line.
# The context manager closes the file handle as soon as parsing is done
# (the bare open() call used before left it open for the life of the kernel).
with open('/content/drive/My Drive/Data Mining/Luxury_Beauty_5.json', 'rt') as review_file:
    raw_data = [json.loads(line) for line in review_file]

In [28]:
# Keep the product id, reviewer id, review text and summary columns,
# then discard rows in which any of the four is missing
data = (
    pd.DataFrame(raw_data)
    .loc[:, ['asin', 'reviewerID', 'reviewText', 'summary']]
    .dropna()
)

In [29]:
# Work on an independent, de-duplicated copy so that `data` stays untouched
df_sub = data.drop_duplicates().copy()

1. Preprocess Data

In [30]:
# Preprocess data
# Rebuild the set of English stop words from the NLTK corpus
# (same expression as in the product-title section above)
stopset = set(stopwords.words('english'))

In [31]:
# Remove newline characters from the review text, then convert it to lower case
df_sub['preprocess_lower'] = (
    df_sub['reviewText']
    .str.replace('\n', '', regex=False)
    .str.lower()
)
df_sub.head()

Unnamed: 0,asin,reviewerID,reviewText,summary,preprocess_lower
0,B00004U9V2,A2HOI48JK8838M,This handcream has a beautiful fragrance. It d...,Beautiful Fragrance,this handcream has a beautiful fragrance. it d...
1,B00004U9V2,A1YIPEY7HX73S7,"wonderful hand lotion, for seriously dry skin,...",wonderful hand lotion,"wonderful hand lotion, for seriously dry skin,..."
2,B00004U9V2,A2QCGHIJ2TCLVP,"Best hand cream around. Silky, thick, soaks i...",Best hand cream around,"best hand cream around. silky, thick, soaks i..."
3,B00004U9V2,A2R4UNHFJBA6PY,Thanks!!,Five Stars,thanks!!
4,B00004U9V2,A2QCGHIJ2TCLVP,Great hand lotion. Soaks right in and leaves ...,Great hand lotion!,great hand lotion. soaks right in and leaves ...


In [32]:
# Strip punctuation, split on whitespace and drop every stop word,
# leaving one list of tokens per review
df_sub['preprocess_data'] = (
    df_sub['preprocess_lower']
    .str.translate(str.maketrans('', '', string.punctuation))
    .str.split()
    .apply(lambda tokens: [token for token in tokens if token not in stopset])
)

In [33]:
# spaCy expects plain text, so glue each token list back into one space-separated string
df_sub['preprocess_str'] = df_sub['preprocess_data'].map(' '.join)
df_sub.head()

Unnamed: 0,asin,reviewerID,reviewText,summary,preprocess_lower,preprocess_data,preprocess_str
0,B00004U9V2,A2HOI48JK8838M,This handcream has a beautiful fragrance. It d...,Beautiful Fragrance,this handcream has a beautiful fragrance. it d...,"[handcream, beautiful, fragrance, doesnt, stay...",handcream beautiful fragrance doesnt stay prot...
1,B00004U9V2,A1YIPEY7HX73S7,"wonderful hand lotion, for seriously dry skin,...",wonderful hand lotion,"wonderful hand lotion, for seriously dry skin,...","[wonderful, hand, lotion, seriously, dry, skin...",wonderful hand lotion seriously dry skin stays...
2,B00004U9V2,A2QCGHIJ2TCLVP,"Best hand cream around. Silky, thick, soaks i...",Best hand cream around,"best hand cream around. silky, thick, soaks i...","[best, hand, cream, around, silky, thick, soak...",best hand cream around silky thick soaks way l...
3,B00004U9V2,A2R4UNHFJBA6PY,Thanks!!,Five Stars,thanks!!,[thanks],thanks
4,B00004U9V2,A2QCGHIJ2TCLVP,Great hand lotion. Soaks right in and leaves ...,Great hand lotion!,great hand lotion. soaks right in and leaves ...,"[great, hand, lotion, soaks, right, leaves, sk...",great hand lotion soaks right leaves skin supe...


In [34]:
# (rows, columns) of the preprocessed review frame
df_sub.shape

(30038, 7)

2. Classify words from reviewText to 4 different aspects - price, moisture, scent, ingredient

In [35]:
# Choose four aspects
aspect = ['price', 'moisture', 'scent', 'ingredient']

# Use word2vec model to convert the four words to vectors.
# Each word is paired with its own vector in a single pass. Filtering the vectors
# by vocabulary and then zipping them against the unfiltered word list would shift
# every later name onto the wrong vector as soon as one word is out of vocabulary.
aspect_vec = {word: model[word] for word in aspect if word in model.vocab}
aspect_vec_list = list(aspect_vec.values())

# One row per aspect word, one column per embedding dimension
df_as = pd.DataFrame(aspect_vec).transpose()
df_as.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
price,0.141602,-0.0271,-0.18457,0.174805,0.103027,-0.170898,0.259766,-0.083496,0.273438,0.310547,-0.332031,-0.031982,-0.026855,0.024902,-0.22168,0.404297,-0.055176,-0.140625,0.246094,-0.223633,-0.057129,-0.016113,0.014282,-0.129883,-0.163086,0.170898,0.072266,0.263672,0.00705,-0.011536,0.128906,0.125977,-0.103027,-0.088379,-0.055176,-0.083496,0.04541,0.120605,0.066895,0.139648,...,0.069824,-0.359375,0.088379,-0.271484,0.166992,-0.056396,-0.302734,-0.414062,0.12793,0.050049,0.162109,0.175781,0.178711,0.012817,0.053223,0.5,0.026367,0.085938,0.045654,-0.099121,0.070312,0.145508,0.0354,-0.058594,0.100098,-0.071289,-0.152344,0.251953,-0.137695,-0.049805,-0.077148,0.022827,-0.310547,0.007568,0.369141,0.165039,0.193359,0.014343,0.148438,-0.03833
moisture,-0.021729,0.071289,-0.202148,-0.449219,0.182617,0.100098,-0.102539,0.038818,-0.3125,-0.057129,0.074707,-0.015991,0.21875,-0.147461,-0.125977,0.093262,-0.128906,0.550781,-0.066895,-0.298828,0.142578,0.025757,0.000182,0.001678,0.172852,-0.223633,-0.094238,0.269531,-0.008118,-0.464844,-0.351562,-0.104004,0.142578,-0.212891,-0.429688,-0.259766,0.091309,-0.025146,-0.041504,0.251953,...,0.028931,-0.253906,-0.433594,-0.279297,0.386719,-0.216797,-0.043701,-0.503906,0.229492,-0.386719,-0.089844,0.225586,0.173828,0.021118,-0.147461,0.145508,-0.04248,0.043457,-0.166992,-0.008606,0.03125,0.048584,0.371094,0.169922,-0.168945,-0.302734,0.46875,0.210938,-0.192383,0.177734,-0.023315,0.236328,0.3125,-0.243164,0.332031,0.375,-0.188477,0.032715,0.019897,0.093262
scent,0.410156,0.029419,0.020142,0.036377,-0.140625,0.011047,-0.255859,-0.235352,-0.240234,0.222656,-0.084961,-0.135742,0.205078,0.000782,-0.084961,0.067383,-0.11377,0.088379,0.243164,-0.244141,-0.044678,0.063965,0.103027,-0.048096,-0.208008,-0.091309,-0.19043,0.237305,-0.097168,0.07666,0.002838,-0.05249,-0.014282,-0.136719,-0.197266,0.078613,-0.024292,-0.02832,0.068359,0.113281,...,0.142578,-0.014832,-0.589844,0.166016,0.182617,-0.014099,-0.380859,-0.333984,-0.066406,0.160156,0.055908,0.039795,-0.24707,0.625,0.065918,0.056641,-0.120605,0.064453,0.143555,0.083008,-0.200195,-0.06543,0.318359,0.032959,-0.267578,-0.083008,-0.124512,-0.062256,0.056152,0.199219,-0.357422,0.229492,0.09082,-0.171875,0.28125,-0.25,-0.081055,-0.029053,-0.115723,0.326172
ingredient,0.005432,-0.01123,0.002487,0.101562,-0.020508,0.164062,0.271484,-0.300781,-0.026489,0.206055,-0.091797,-0.308594,0.150391,-0.214844,-0.089355,0.289062,0.034912,0.068848,0.007019,-0.298828,0.150391,0.145508,0.085449,-0.074707,0.021973,-0.077637,0.097168,0.128906,-0.208008,0.166992,0.206055,-0.15332,0.236328,0.217773,0.152344,0.275391,0.026733,0.115723,0.128906,-0.018677,...,0.176758,-0.135742,-0.097168,-0.013184,0.119141,-0.188477,-0.070801,-0.326172,-0.198242,-0.494141,-0.062988,0.134766,-0.006165,-0.075195,-0.052002,-0.075195,-0.185547,-0.160156,-0.111328,-0.119141,0.097656,-0.388672,-0.261719,0.21582,-0.359375,-0.427734,-0.136719,0.369141,-0.132812,0.115234,-0.037354,-0.306641,-0.130859,0.145508,-0.241211,-0.558594,0.052246,0.197266,0.292969,0.006042


In [36]:
# Load previously saved data
# NOTE(review): this rebinds df_sub, replacing the frame built in the preprocessing cells above.
# No visible cell writes df_sub.csv - confirm where the file comes from and that it matches
# the preprocessing shown here (text cells that were empty when saved may come back as NaN).
df_sub = pd.read_csv('/content/drive/My Drive/Data Mining/df_sub.csv')

In [37]:
# Noun lemmas gathered from every preprocessed review via pos_tagging()
extract_rwords = pos_tagging(df_sub['preprocess_str'])

# Keep the nouns that the word2vec model knows, and look up their vectors
words_filtered = [noun for noun in extract_rwords if noun in model.vocab]
vector_list = [model[noun] for noun in words_filtered]

# Map each distinct word to its vector
word_vec_zip = zip(words_filtered, vector_list)
word_vec = dict(word_vec_zip)

# One row per distinct review word, one column per embedding dimension
df_wv = pd.DataFrame(word_vec).T
df_wv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
fragrance,0.118164,0.072266,0.087891,0.051514,0.021606,0.077148,-0.101074,-0.503906,0.031982,0.180664,-0.3125,-0.234375,0.125,0.087891,-0.194336,0.12207,0.175781,0.261719,0.129883,0.115723,-0.1875,0.061279,0.165039,-0.291016,-0.15625,0.277344,0.193359,0.28125,-0.034424,0.116699,-0.201172,0.089355,-0.271484,0.103516,-0.193359,-0.051025,0.124023,-0.302734,-0.0625,-0.15332,...,0.131836,-0.016724,-0.237305,0.046875,0.216797,-0.088379,-0.371094,-0.137695,-0.006531,0.133789,-0.061768,0.008911,-0.253906,0.133789,0.511719,0.197266,-0.014038,-0.142578,0.011658,0.380859,-0.185547,0.205078,0.186523,0.037842,0.185547,-0.061523,-0.199219,0.062012,0.095215,0.314453,-0.105957,0.058838,0.025513,-0.205078,0.106445,-0.037598,0.047363,0.417969,0.098145,0.115723
hand,0.093262,-0.046387,-0.133789,0.048096,-0.316406,0.10791,0.300781,-0.28125,-0.018311,0.048584,-0.044434,-0.182617,-0.117188,0.137695,-0.178711,0.003342,-0.107422,0.013428,0.089844,0.072266,0.216797,0.103516,0.189453,0.057617,0.037598,-0.251953,-0.059814,-0.010071,-0.028442,0.12793,0.013,0.014343,-0.10791,0.12793,-0.178711,0.046143,0.006409,0.116699,-0.086914,-0.15332,...,-0.1875,-0.083496,-0.07959,-0.091797,-0.121582,0.077637,-0.091309,0.069824,-0.138672,5.2e-05,-0.146484,0.160156,-0.065918,0.071289,0.171875,-0.091309,0.060547,-0.138672,-0.144531,-0.000641,0.148438,-0.069824,0.197266,0.109863,0.077148,0.124023,-0.232422,-0.066406,0.107422,0.017456,-0.057617,-0.012451,-0.068359,-0.070312,0.176758,-0.039062,0.067383,-0.071777,-0.207031,-0.060303
size,0.012817,0.289062,0.083496,0.115723,0.032959,-0.265625,0.363281,-0.291016,0.116211,0.220703,-0.0625,-0.109863,-0.057373,-0.055908,-0.013916,-0.006592,-0.233398,0.347656,-0.137695,-0.168945,-0.289062,-0.196289,-0.261719,0.310547,-0.248047,0.014221,-0.134766,-0.014832,-0.004822,0.009094,-0.015076,0.224609,-0.126953,0.040771,-0.07373,-0.068848,0.083496,0.157227,-0.043701,0.324219,...,0.223633,-0.085449,-0.335938,0.079102,0.214844,-0.010437,0.048828,-0.033447,0.09668,0.02124,-0.213867,0.091797,0.061279,-0.140625,-0.059814,-0.111328,0.09375,-0.090332,0.083984,-0.141602,0.007812,0.269531,0.217773,0.09668,-0.212891,0.100586,-0.135742,-0.194336,-0.067383,0.016357,-0.277344,-0.025391,-0.084473,-0.398438,0.055908,0.071777,0.024658,0.066406,-0.152344,-0.185547
lotion,-0.025635,0.233398,-0.015625,0.092773,-0.003845,0.204102,-0.05127,-0.400391,-0.019897,-0.233398,0.092285,-0.40625,-0.174805,0.173828,0.013184,0.191406,-0.275391,0.21875,0.063477,-0.10498,-0.141602,0.474609,-0.006927,0.0019,-0.217773,0.113281,0.154297,0.178711,-0.199219,-0.078613,-0.168945,-0.098633,-0.023926,-0.196289,-0.241211,-0.100586,-0.111816,0.017944,0.132812,-0.016357,...,0.236328,-0.054932,-0.25,-0.223633,0.373047,-0.066406,-0.131836,-0.001564,-0.425781,0.051025,-0.099121,0.275391,-0.186523,0.347656,0.073242,-0.120605,-0.269531,-0.105469,0.034912,0.054688,-0.062012,0.257812,0.043701,0.314453,-0.057373,-0.097168,0.07373,0.294922,-0.171875,0.201172,0.09668,-0.171875,0.000233,-0.292969,0.073242,0.287109,-0.125,0.404297,0.119629,0.057373
skin,-0.077148,0.339844,0.120117,-0.277344,-0.273438,0.185547,0.040283,-0.165039,-0.022583,-0.023682,-0.170898,-0.131836,-0.143555,-0.137695,-0.091309,0.146484,-0.164062,0.582031,-0.185547,0.107422,-0.160156,-0.092773,0.149414,-0.09668,-0.353516,-0.255859,0.048828,0.209961,0.081055,-0.228516,-0.287109,0.014709,-0.033936,-0.138672,-0.105469,-0.014404,-0.118652,0.103027,0.058594,-0.049805,...,0.139648,-0.318359,-0.306641,-0.013245,0.257812,0.031738,-0.121094,-0.180664,-0.059326,-0.204102,-0.057861,0.208008,-0.046631,0.136719,0.212891,-0.162109,0.005096,0.019165,-0.15918,0.053223,0.287109,0.031982,-0.081055,0.018433,-0.083496,-0.194336,-0.034424,0.204102,-0.062988,-0.123047,0.148438,0.131836,-0.046875,0.074219,0.228516,0.359375,0.099121,0.089355,-0.043945,0.109375


In [38]:
# Cosine similarity between every aspect word (columns) and every review word (rows)
similarity = pd.DataFrame(columns = df_as.index)
for aspect_word, aspect_vector in df_as.iterrows():
    for review_word, review_vector in df_wv.iterrows():
        norm_product = LA.norm(aspect_vector) * LA.norm(review_vector)
        similarity.loc[review_word, aspect_word] = aspect_vector.dot(review_vector) / norm_product

similarity.head()

Unnamed: 0,price,moisture,scent,ingredient
fragrance,0.0694993,0.150226,0.631274,0.224314
hand,0.0442646,0.0493699,0.0956732,0.0106502
size,0.256229,0.114422,0.0341541,0.0526447
lotion,0.0719066,0.276864,0.353154,0.245955
skin,0.0614008,0.33829,0.240666,0.15311


In [39]:
# For each aspect word, rank the review words by cosine similarity (highest first)
# and keep at most 15 of those whose similarity exceeds 0.4.
# Every result is a NumPy array of words.

# Find top 15 words that are close to the word price
h = similarity['price'].sort_values(ascending = False)
price_top15 = h[h>0.4][0:15].index.values

# Find top 15 words that are close to the word moisture
l = similarity['moisture'].sort_values(ascending = False)
moisture_top15 = l[l>0.4][0:15].index.values

# Find top 15 words that are close to the word scent
m = similarity['scent'].sort_values(ascending = False)
scent_top15 = m[m>0.4][0:15].index.values

# Find top 15 words that are close to the word ingredient
# (.values added so this is an array like the three results above, not a pandas Index)
n = similarity['ingredient'].sort_values(ascending = False)
ingredient_top15 = n[n>0.4][0:15].index.values

3. Sentiment analysis on the four aspects

In [40]:
# Load positive and negative words from open source dataset
# Each CSV is read as latin1 and its 'word' column is taken as a NumPy array
pos_word = pd.read_csv('/content/drive/My Drive/Data Mining/positive.csv', encoding='latin1')
pos = pos_word['word'].values

neg_word = pd.read_csv('/content/drive/My Drive/Data Mining/negative.csv', encoding='latin1')
neg = neg_word['word'].values

In [41]:
# create a list of globally defined positive and negative words to identify sentiment
# sentiment score based on the lexicon of neg, pos words
def feature_sentiment(sentence, pos, neg):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature
              did not exist previously,then updates sentiment to
              each of the new or existing features
    output: updated dictionary
    '''
    sent_dict = dict()
    sentence = nlp(sentence)
    opinion_words = list(neg) + list(pos)
  
    debug = 0
    for token in sentence:
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:
            sentiment = 1 if token.text in pos else -1
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                continue
            elif (token.dep_ == "amod"):
                sent_dict[token.head.text] = sentiment
            # for opinion words that are adjectives, adverbs, verbs...
            else:
                for child in token.children:
                    # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if child.dep_ == "neg":
                        sentiment *= -1
                for child in token.children:
                    # if verb, check if there's a direct object
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):                        
                        sent_dict[child.text] = sentiment
                        # check for conjugates (a AND b), then add both to dictionary
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj=1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] = sentiment

                # check for negation
                for child in token.head.children:
                    noun = ""
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"): 
                        sentiment *= -1
                
                # check for nouns
                for child in token.head.children:
                    noun = ""
                    if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.text
                        # Check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] = sentiment
                    debug += 1
    return sent_dict

In [42]:
# Two-column lookup table: the index of df_category is moved into a regular
# column, and the columns are then labelled 'asin' and 'category'.
df_cat = df_category[['category']].reset_index()
df_cat.columns = ['asin', 'category']

In [43]:
# Left-join the category table onto df_sub, so every row of df_sub is kept.
# No join key is given, so pandas joins on the columns the two frames share.
df_sub = df_sub.merge(
    right=df_cat,
    how='left',
)

In [44]:
# Use the skin care products as the worked example: keep the rows whose
# category is 'skin_care' and renumber them from 0.
df_skin = (
    df_sub[df_sub['category'] == 'skin_care']
    .reset_index(drop=True)
)

In [45]:
# Aspect-based sentiment score of every skin care product: one row per
# product (asin), one column per aspect.
aspect_columns = ['price', 'moisture', 'scent', 'ingredient']
aspect_group = [price_top15, moisture_top15, scent_top15, ingredient_top15]

absa_index = []
absa_rows = []
# sort=False keeps the products in order of first appearance in df_skin
for p, df_skin1 in df_skin.groupby('asin', sort=False):
    # Conduct sentiment analysis for one product: add up, per word, the
    # scores returned for each of its 'preprocess_lower' texts
    dic = {}
    for i in df_skin1['preprocess_lower']:
        for key, value in feature_sentiment(i, pos, neg).items():
            dic[key] = dic.get(key, 0) + value

    # Sum up sentiment scores for four aspects. A score is recorded for every
    # aspect, including 0 when the aspect has no related words or none of
    # them was scored for this product.
    absa_index.append(p)
    absa_rows.append([sum(dic.get(k, 0) for k in words) for words in aspect_group])

# Build the frame once instead of growing it cell by cell
df_absa = pd.DataFrame(absa_rows, index=absa_index, columns=aspect_columns)

In [46]:
# Preview the aspect scores of the first ten products in df_absa
df_absa.head(10)

Unnamed: 0,price,moisture,scent,ingredient
B0001EL5OU,0,0,0.0,0
B0001EL5R2,-1,0,0.0,0
B0001EL5Q8,0,0,0.0,0
B0001UWRCI,0,0,-1.0,0
B00021D2Z2,0,-1,1.0,0
B00021D2TI,0,0,-1.0,0
B0002CEIQ8,-1,0,-1.5,0
B0002X4F0Q,0,0,0.0,0
B0002X3SH2,0,0,-1.0,0
B0002XBTVO,0,0,-1.0,0
