In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the scraped Best Buy reviews. Malformed rows (wrong number of fields)
# are skipped rather than aborting the load. `on_bad_lines='skip'` replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
df_bestbuy = pd.read_csv('bestbuy_dataset.csv', on_bad_lines='skip')

In [2]:
# Reduce the scraped score columns to their single score character.
#
# The previous version assigned to `row` inside `iterrows()`. Each `row` is a
# copy, so those assignments never reached `df_bestbuy`. Assigning whole
# columns back to the frame makes the clean-up actually take effect.
# Non-string values (NaN for missing scores) are passed through untouched.
for score_column in ['Quality', 'Value', 'Ease of Use']:
    df_bestbuy[score_column] = df_bestbuy[score_column].map(
        lambda value: value[0] if isinstance(value, str) else value
    )

# The rating digit is the 7th character of the scraped rating text
# (presumably something like "Rated 5 ..." -- TODO confirm against the raw data).
# The length guard leaves already-reduced values alone, so re-running the cell
# is harmless.
df_bestbuy['Rating'] = df_bestbuy['Rating'].map(
    lambda value: value[6] if isinstance(value, str) and len(value) > 6 else value
)


In [95]:
# General stats

# A product is identified by its listing URL ('Link'); the per-product loop
# further down iterates over the same key.
total_products = df_bestbuy['Link'].nunique()
prod_names = df_bestbuy['Name'].unique()
mod_names = df_bestbuy['Model'].unique()
print("Total number of reviews =",len(df_bestbuy))
# Print the count, not the array of names.
print("Total number of unique products = ", total_products)
print("Total number of unique reviewers =",df_bestbuy['User'].nunique())

Total number of reviews = 25644
Total number of unique products =  100
Total number of unique reviewers = 16738


## Aspect-based sentiment analysis

1) create list of features from Nikita's feature list

2) use word2vec to analyze similarity between current aspect and any of the features mentioned

3) if the similarity to a feature exceeds the threshold -> record the aspect together with that feature

In [4]:
import spacy
import en_core_web_sm

# Load the small English pipeline with the tagger and parser enabled and named
# entity recognition switched off. `parse` / `tag` / `entity` are spaCy v1
# keywords: v2 silently ignores them and v3 rejects them, so the component is
# disabled through the supported `disable` argument instead.
nlp = en_core_web_sm.load(disable=["ner"])

In [5]:
from gensim.models import KeyedVectors

# Pre-trained word vectors in word2vec text format (judging by the file name:
# English Wikipedia, 100 dimensions -- confirm against the download source).
word2vec_path = 'enwiki_20180420_nolg_100d.txt'
model = KeyedVectors.load_word2vec_format(word2vec_path)

In [67]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from unidecode import unidecode
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK part-of-speech tag to the corresponding WordNet POS constant.

    The decision is made on the first letter of the tag: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag yields None.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

def lemmatize_sentence(sentence):
    """Return `sentence` with every token replaced by its lemma.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet part of speech are lemmatized with that part of speech; all
    other tokens are kept unchanged. The result is the tokens joined by single
    spaces.
    """
    tokens = nltk.word_tokenize(sentence)
    lemmas = []
    for token, pos_tag in nltk.pos_tag(tokens):
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if wordnet_pos is not None:
            # a usable WordNet POS is available: lemmatize with it
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # no WordNet equivalent for this tag: keep the token as is
            lemmas.append(token)
    return " ".join(lemmas)

# Shared WordNet lemmatizer instance; referenced by lemmatize_sentence() and
# extract_aspects() at call time.
lemmatizer = WordNetLemmatizer()

def clean(text):
    """Lower-case `text`, drop stop words and punctuation, and transliterate to ASCII.

    The text is lower-cased, split into word tokens, and every token that is
    an English stop word or a single punctuation character is removed. The
    remaining tokens are joined with single spaces and passed through
    `unidecode`.
    """
    # convert text to lower case
    text = text.lower()
    # A set gives constant-time membership checks; the previous list was
    # scanned once per token.
    stopset = set(stopwords.words('english')).union(string.punctuation)
    # remove stop words and punctuations
    # word_tokenize is used to tokenize the input corpus in word tokens.
    kept_tokens = [token for token in word_tokenize(text) if token not in stopset]
    return unidecode(" ".join(kept_tokens))

def relevant(noun,features,similarity_threshold = 0.5):
    """
    Find the predefined feature that `noun` is most similar to.

    noun: candidate aspect word.
    features: iterable of feature words to compare against.
    similarity_threshold: the best similarity must be strictly greater than
        this value for the feature to count as a match.

    Returns the best-matching feature word, or False when `noun` is not in the
    word2vec vocabulary, when `features` is empty, or when no feature is
    similar enough. Feature words missing from the vocabulary are scored 0.
    On ties the feature listed first wins.
    """
    if noun not in model:
        return False

    best_feature = None
    best_similarity = None
    for word in features:
        similarity = model.similarity(noun, word) if word in model else 0
        # strict '>' keeps the earliest feature when several share the top score
        if best_similarity is None or similarity > best_similarity:
            best_feature, best_similarity = word, similarity

    # best_similarity stays None when `features` is empty; previously that
    # case crashed inside max() with a ValueError.
    if best_similarity is not None and best_similarity > similarity_threshold:
        return best_feature

    return False

def extract_aspects(text,features):
    """
    Extract "noun adjective" aspect phrases from one review sentence.

    text is a sentence from a review
    features is a list of feature words (strings) you are searching for;
    they are compared to each noun via relevant()

    Returns (irrelevant_aspects, relevant_aspects):
      * irrelevant_aspects: list of "noun adjective" strings whose noun did
        not match any feature
      * relevant_aspects: list of ["noun adjective", matched_feature] pairs
    """
    irrelevant_aspects = []
    relevant_aspects = []

    # NOTE(review): lemmatizer.lemmatize() is applied to the whole sentence as
    # if it were a single word, which looks like a no-op; lemmatize_sentence()
    # may have been intended here -- confirm before changing.
    doc = nlp(clean(lemmatizer.lemmatize(text)))

    for position, tok in enumerate(doc):
        # The first token has no predecessor. Indexing doc[-1] would wrap
        # around to the LAST token and could pair a leading adjective with a
        # trailing noun, so skip it.
        if position == 0:
            continue

        previous = doc[position - 1]
        if tok.pos_ == 'ADJ' and previous.pos_ == 'NOUN':
            noun = str(previous)
            phrase = noun + " " + str(tok)
            #CHECK HERE IF YOUR ASPECT IS SIMILAR TO ANY OF THE ASPECTS WE ARE SEARCHING FOR~~~
            relevant_feature = relevant(noun,features)
            if relevant_feature:
                # append the relevant aspect along with the predefined feature to which it is relevant
                relevant_aspects.append([phrase, relevant_feature])
            else:
                # store irrelevant aspects separately
                irrelevant_aspects.append(phrase)

    return irrelevant_aspects,relevant_aspects

In [152]:
# Df that stores aggregated scores for each model

# Read the feature list here so this cell does not rely on a later cell having
# been executed first (the header needs one column per feature).
with open('vc_features.txt') as f:
    features = f.read().splitlines()

# The handle is intentionally left open: the per-product loop in the next cell
# appends one row per product to it and closes it when done.
file = open("AGGREGATES_DF.csv",'a')
# In append mode the stream starts at the end of the file, so a position of 0
# means the file is new or empty. Only then write the header, so re-running
# the notebook does not insert duplicate header lines.
if file.tell() == 0:
    file.write(','.join(["model","website name"]+features))
    file.write('\n')

1

In [153]:
import networkx as nx
import matplotlib.pyplot as plt
import os
from afinn import Afinn

import csv  # stdlib; quotes product names that contain commas

afinn = Afinn(language='en')

# One feature word per line.
with open('vc_features.txt') as f:
    features = f.read().splitlines()

#Predefine list of relevant and irrelevant aspects
# These two lists keep accumulating over ALL products (one entry per sentence
# that produced aspects), exactly as before.
irrelevant_aspects = []
relevant_aspects = []

# to_csv() does not create missing directories.
os.makedirs("Extracted Aspects", exist_ok=True)

# `file` is the AGGREGATES_DF.csv handle opened in the previous cell.
aggregate_writer = csv.writer(file, lineterminator='\n')

try:
    for link in df_bestbuy['Link'].unique():
        # Take reviews relevant to the model in your dataframe
        product_df = df_bestbuy[df_bestbuy['Link'] == link]
        # "/" would be interpreted as a path separator in the output file name
        model_name = product_df['Model'].unique()[0].replace("/"," ")
        website_name = product_df['Name'].unique()[0].replace("/"," ")
        print(model_name,website_name)

        # Aspects of THIS product only. Previously the cumulative list was
        # used, so every product's CSV and averages also contained the aspects
        # of all products processed before it.
        product_aspects = []
        for review in product_df['Reviews']:
            if review != review:
                # NaN (missing review text) is the only value not equal to itself
                continue
            for sentence in review.split('.'):
                irr,rel = extract_aspects(sentence,features)

                if irr:
                    irrelevant_aspects.append(irr)
                if rel:
                    relevant_aspects.append(rel)
                    # keep every aspect of the sentence, not just the first one
                    product_aspects.extend(rel)

        # Create a dataframe with all the extracted aspects and their categories
        # ('aspect' holds the matched feature, 'review' the extracted phrase)
        path = os.path.join("Extracted Aspects", ' '.join([model_name,website_name]) +'.csv')
        df = pd.DataFrame(
            [[feature, phrase, afinn.score(phrase)] for phrase, feature in product_aspects],
            columns = ['aspect','review','score'],
        )
        df.to_csv(path)

        agg = [model_name,website_name]
        # Aggregate the scores, normalize and write them specific to the current model
        for feature in features:
            feature_scores = df.loc[df['aspect'] == feature, 'score']
            if len(feature_scores):
                agg.append(feature_scores.mean())
            else:
                # feature never mentioned for this product
                agg.append(None)
        aggregate_writer.writerow([str(x) for x in agg])
finally:
    # Close the aggregates file even if a product fails part-way through.
    file.close()
        
        

E515020 iRobot - Roomba e5 Wi-Fi Connected Robot Vacuum - Charcoal
S955020 iRobot - Roomba s9+ Wi-Fi Connected Robot Vacuum with Automatic Dirt Disposal - Java Black
I755020 iRobot - Roomba i7+ Wi-Fi Connected Robot Vacuum with Automatic Dirt Disposal - Charcoal
M611020 iRobot - Braava jet m6 Wi-Fi Connected Robot Mop - White
R960020 iRobot - Roomba 960 Wi-Fi Connected Robot Vacuum - Gray
DG3G ECOVACS Robotics - DEEBOT OZMO 930 Wi-Fi Connected Robot Vacuum & Mop - Black
DG3G ECOVACS Robotics - DEEBOT OZMO 930 Wi-Fi Connected Robot Vacuum & Mop - Black
945-0270 Neato Robotics - Botvac D7 Wi-Fi Connected Robot Vacuum - Black Gray
N79SE ECOVACS Robotics - DEEBOT N79SE Wi-Fi Connected Robot Vacuum - Espresso
4632813 iRobot - Braava jet Hard Floor Cleaning Solution
RV1001AE Shark - IQ R100AE Wi-Fi Connected Robot Vacuum - Black
T5 ECOVACS Robotics - DEEBOT OZMO T5 Wi-Fi Connected Robot Vacuum & Mop with Advanced Navigation - Black
4657840 Wet Mopping Pads for iRobot Braava jet 240 (10-Pack)