# Computing product ease of assembly with NLP

In [58]:
# Import cell
import numpy as np
import pandas as pd
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer 
# Import plotly and cufflinks, then configure cufflinks for offline plotting.
# cufflinks is what provides the `.iplot(...)` calls used on pandas objects
# later in this notebook.
import cufflinks as cf
import plotly.offline
cf.go_offline()
# NOTE(review): this passes offline=False immediately after go_offline();
# confirm which mode is actually intended before relying on this setting.
cf.set_config_file(offline=False, world_readable=True)

In [2]:
# Product names (words joined by '+') for which review data has been scraped.
# Each entry is used verbatim to build the name of the CSV file to load.
product_set = (
    'office+chair',
    'office+desk',
    'kitchen+table',
    'living+room+couch',
    'end+table',
    'bed+frame',
    'crib',
    'bookcase',
    'coffee+table',
)

In [17]:
# Read one review CSV per product and stack them into a single DataFrame.
product_dfs = []
for product_name in product_set:
    csv_path = './review_csvs/' + product_name + '_review_data.csv'
    product_reviews = pd.read_csv(csv_path)
    # Tag every row with the product it belongs to, as the leading column
    product_reviews.insert(loc=0, column='Type', value=product_name)
    product_dfs.append(product_reviews)
# Row labels are kept as read from each file, so they repeat across products
products = pd.concat(product_dfs)
display(products)

Unnamed: 0,Type,title,content,date,variant,images,verified,author,rating,product,url
0,office+chair,Good if you only need a chair for a month,The fabric/pleather on this chair has torn aft...,13 May 2020,Color: Black,https://images-na.ssl-images-amazon.com/images...,Yes,Idk,1.0,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,/AmazonBasics-Puresoft-PU-Padded-Mid-Back-Comp...
1,office+chair,Unfortunate,Received chair and all of the parts for assemb...,05 Apr 2020,Color: Black,,Yes,Jen P,2.0,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,/AmazonBasics-Puresoft-PU-Padded-Mid-Back-Comp...
2,office+chair,Wrong parts and unusable,Came with incorrect mount for seat,21 Mar 2020,Color: Black,https://images-na.ssl-images-amazon.com/images...,Yes,Smelvin,1.0,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,/AmazonBasics-Puresoft-PU-Padded-Mid-Back-Comp...
3,office+chair,Great Office Chair for the money,After reading all of the reviews I decided to ...,26 Mar 2020,Color: Black,,Yes,AzIronman,5.0,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,/AmazonBasics-Puresoft-PU-Padded-Mid-Back-Comp...
4,office+chair,the chair back rest was hollow,"the back rest was hollow, when i received the ...",10 Aug 2020,Color: Black,,Yes,daniel lomas,1.0,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,/AmazonBasics-Puresoft-PU-Padded-Mid-Back-Comp...
...,...,...,...,...,...,...,...,...,...,...,...
55,coffee+table,Coffee Table is Supreme Quality,I like everything about this coffee table. I l...,05 Aug 2020,,https://images-na.ssl-images-amazon.com/images...,Yes,sorisa77,5.0,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",/VASAGLE-INDESTIC-Cocktail-Industrial-ULCT64X/...
56,coffee+table,Solid Table,This was a perfect size for our living room. ...,01 Feb 2020,,,Yes,Thomas M. Smiley Jr.,4.0,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",/VASAGLE-INDESTIC-Cocktail-Industrial-ULCT64X/...
57,coffee+table,Better than expected!,Love this table! I also got the end tables and...,11 Jan 2020,,https://images-na.ssl-images-amazon.com/images...,Yes,Chris J.,5.0,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",/VASAGLE-INDESTIC-Cocktail-Industrial-ULCT64X/...
58,coffee+table,Excellent value !!,Coffee tables can be so expensive and vary in ...,14 Feb 2020,,https://images-na.ssl-images-amazon.com/images...,Yes,Sonya H.,5.0,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",/VASAGLE-INDESTIC-Cocktail-Industrial-ULCT64X/...


In [46]:
# Define functions that I'll use to pre-process the data

# Remove / and replace with a space instead
def remove_dash(text):
    """Replace every forward slash and backslash in ``text`` with a space.

    Despite the name, hyphens/dashes are left untouched; only the two slash
    characters are treated as word separators (e.g. "fabric/pleather"
    becomes "fabric pleather").

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        ``text`` with each "/" and each backslash replaced by a single space.
    """
    # The previous character loop reset the accumulated output when it met a
    # backslash, dropping all text before it; str.replace keeps everything.
    return text.replace("/", " ").replace("\\", " ")

# Punctuation adds no meaning to text, remove it
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    The remaining characters are joined back together in their original
    order; whitespace, letters and digits are unaffected.
    """
    return ''.join(char for char in text if char not in string.punctuation)

# Split sentences into individual strings
def tokenize(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        Text to split (typically lower-cased, punctuation-free review text).

    Returns
    -------
    list of str
        The tokens in order of appearance. Empty strings, which ``re.split``
        produces for empty input or when the text starts or ends with a
        separator, are dropped so they never reach the later pipeline steps.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return [token for token in re.split(r"\W+", text) if token]

# Remove stopwords
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the ``stopword`` list.

    ``text`` is an iterable of tokens; the result is a new list holding the
    surviving tokens in their original order.
    """
    kept_tokens = []
    for token in text:
        if token in stopword:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Get the root of each of the words (decrease number of distinct words)
def lemmatize_text(words):
    """Lemmatize each token in ``words`` as a verb.

    Every token is passed to ``WordNetLemmatizer.lemmatize`` with
    ``pos='v'``; the lemmas are returned as a new list in input order.
    """
    verb_lemmatizer = WordNetLemmatizer()
    return [verb_lemmatizer.lemmatize(token, pos='v') for token in words]

['fabric', 'tear', 'chair']


In [51]:
%%capture
# Build a new dataframe that holds only the columns needed for text analysis
cols = ['Type', 'product', 'title', 'content']

# Drop rows without review text and take an explicit copy, so the columns
# added below are written to an independent frame and never to a slice of
# `products` (assigning into a slice raises pandas' SettingWithCopyWarning).
reviews = products[cols].dropna(subset=["content"]).copy()

# Size features, measured on the raw review text
reviews['review_length'] = reviews['content'].astype(str).apply(len)
reviews['word_count'] = reviews['content'].apply(lambda x: len(str(x).split()))

# Cleaning pipeline: every step reads the column produced by the previous one,
# and each intermediate result is kept as its own column for inspection.
reviews['no_dashes'] = reviews['content'].apply(remove_dash)
reviews['lower_case'] = reviews['no_dashes'].str.lower()
reviews['no_punctuation'] = reviews['lower_case'].apply(remove_punctuation)
reviews['tokenized'] = reviews['no_punctuation'].apply(tokenize)
reviews['no_stops'] = reviews['tokenized'].apply(remove_stopwords)
reviews['lemmatized'] = reviews['no_stops'].apply(lemmatize_text)

In [52]:
display(reviews)

Unnamed: 0,Type,product,title,content,review_length,word_count,no_dashes,lower_case,no_punctuation,tokenized,no_stops,lemmatized
0,office+chair,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,Good if you only need a chair for a month,The fabric/pleather on this chair has torn aft...,405,75,The fabric pleather on this chair has torn aft...,the fabric pleather on this chair has torn aft...,the fabric pleather on this chair has torn aft...,"[the, fabric, pleather, on, this, chair, has, ...","[fabric, pleather, chair, torn, month, daily, ...","[fabric, pleather, chair, tear, month, daily, ..."
1,office+chair,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,Unfortunate,Received chair and all of the parts for assemb...,754,144,Received chair and all of the parts for assemb...,received chair and all of the parts for assemb...,received chair and all of the parts for assemb...,"[received, chair, and, all, of, the, parts, fo...","[received, chair, parts, assembly, zippered, b...","[receive, chair, part, assembly, zipper, back,..."
2,office+chair,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,Wrong parts and unusable,Came with incorrect mount for seat,34,6,Came with incorrect mount for seat,came with incorrect mount for seat,came with incorrect mount for seat,"[came, with, incorrect, mount, for, seat]","[came, incorrect, mount, seat]","[come, incorrect, mount, seat]"
3,office+chair,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,Great Office Chair for the money,After reading all of the reviews I decided to ...,759,145,After reading all of the reviews I decided to ...,after reading all of the reviews i decided to ...,after reading all of the reviews i decided to ...,"[after, reading, all, of, the, reviews, i, dec...","[reading, reviews, decided, purchase, amazonba...","[read, review, decide, purchase, amazonbasics,..."
4,office+chair,AmazonBasics Classic Puresoft PU-Padded Mid-Ba...,the chair back rest was hollow,"the back rest was hollow, when i received the ...",726,138,"the back rest was hollow, when i received the ...","the back rest was hollow, when i received the ...",the back rest was hollow when i received the p...,"[the, back, rest, was, hollow, when, i, receiv...","[back, rest, hollow, received, product, box, p...","[back, rest, hollow, receive, product, box, pa..."
...,...,...,...,...,...,...,...,...,...,...,...,...
55,coffee+table,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",Coffee Table is Supreme Quality,I like everything about this coffee table. I l...,390,69,I like everything about this coffee table. I l...,i like everything about this coffee table. i l...,i like everything about this coffee table i lo...,"[i, like, everything, about, this, coffee, tab...","[like, everything, coffee, table, looked, many...","[like, everything, coffee, table, look, many, ..."
56,coffee+table,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",Solid Table,This was a perfect size for our living room. ...,261,51,This was a perfect size for our living room. ...,this was a perfect size for our living room. ...,this was a perfect size for our living room w...,"[this, was, a, perfect, size, for, our, living...","[perfect, size, living, room, replaced, ottoma...","[perfect, size, live, room, replace, ottoman, ..."
57,coffee+table,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",Better than expected!,Love this table! I also got the end tables and...,285,54,Love this table! I also got the end tables and...,love this table! i also got the end tables and...,love this table i also got the end tables and ...,"[love, this, table, i, also, got, the, end, ta...","[love, table, also, got, end, tables, tv, cons...","[love, table, also, get, end, table, tv, conso..."
58,coffee+table,"VASAGLE INDESTIC Coffee Table, Living Room Tab...",Excellent value !!,Coffee tables can be so expensive and vary in ...,257,52,Coffee tables can be so expensive and vary in ...,coffee tables can be so expensive and vary in ...,coffee tables can be so expensive and vary in ...,"[coffee, tables, can, be, so, expensive, and, ...","[coffee, tables, expensive, vary, quality, too...","[coffee, table, expensive, vary, quality, take..."


In [59]:
# Histogram of the `review_length` column across all reviews
reviews['review_length'].iplot(
    kind='hist',
    bins=100,
    title='Review Text Length Distribution',
    xTitle='review length',
    yTitle='count',
    linecolor='black',
)

In [61]:
# Histogram of the `word_count` column across all reviews
reviews['word_count'].iplot(
    kind='hist',
    bins=100,
    title='Review Word Count Distribution',
    xTitle='Number of words',
    yTitle='Number of reviews',
    linecolor='black',
)