In [2]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [3]:
# Fetch the NLTK data packages this notebook relies on: 'punkt' (tokenizer
# models used by word_tokenize) and 'wordnet' (the lexical database read by
# WordNetLemmatizer). Packages that are already installed are not downloaded again.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Amazon Musical Instruments Reviews data

In [4]:
# Read the reviews CSV (expected next to the notebook) into a DataFrame
# and display it as the cell output.
reviews_csv = 'Musical_instruments_reviews.csv'
df = pd.read_csv(reviews_csv)
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"
...,...,...,...,...,...,...,...,...,...
10256,A14B2YH83ZXMPP,B00JBIVXGC,Lonnie M. Adams,"[0, 0]","Great, just as expected. Thank to all.",5.0,Five Stars,1405814400,"07 20, 2014"
10257,A1RPTVW5VEOSI,B00JBIVXGC,Michael J. Edelman,"[0, 0]",I've been thinking about trying the Nanoweb st...,5.0,"Long life, and for some players, a good econom...",1404259200,"07 2, 2014"
10258,AWCJ12KBO5VII,B00JBIVXGC,Michael L. Knapp,"[0, 0]",I have tried coated strings in the past ( incl...,4.0,Good for coated.,1405987200,"07 22, 2014"
10259,A2Z7S8B5U4PAKJ,B00JBIVXGC,"Rick Langdon ""Scriptor""","[0, 0]","Well, MADE by Elixir and DEVELOPED with Taylor...",4.0,Taylor Made,1404172800,"07 1, 2014"


### Extract Summary Column

In [15]:
# Take the review summaries and lower-case them so that later token
# comparisons are case-insensitive.
# The vectorized .str accessor replaces a per-element lambda whose parameter
# was named `str` (shadowing the builtin); it also propagates missing values
# instead of raising AttributeError on them.
sentences = df['summary'].str.lower()
sentences

0                                                     good
1                                                     jake
2                                     it does the job well
3                            good windscreen for the money
4                    no more pops when i record my vocals.
                               ...                        
10256                                           five stars
10257    long life, and for some players, a good econom...
10258                                     good for coated.
10259                                          taylor made
10260    these strings are really quite good, but i wou...
Name: summary, Length: 10261, dtype: object

### Perform text processing

**Using the nltk package for Tokenization, Stemming, and Lemmatization**

#### Tokenization

In [16]:
# Tokenize every lower-cased summary; each element of the resulting Series
# is the list of tokens (words and punctuation marks) for one review.
tokens = sentences.apply(word_tokenize)
tokens

0                                                   [good]
1                                                   [jake]
2                               [it, does, the, job, well]
3                      [good, windscreen, for, the, money]
4         [no, more, pops, when, i, record, my, vocals, .]
                               ...                        
10256                                        [five, stars]
10257    [long, life, ,, and, for, some, players, ,, a,...
10258                               [good, for, coated, .]
10259                                       [taylor, made]
10260    [these, strings, are, really, quite, good, ,, ...
Name: summary, Length: 10261, dtype: object

#### Stemming

In [17]:
# Reduce every token to its Porter stem, keeping one list per review so the
# result lines up element-for-element with `tokens`.
ps = PorterStemmer()

stemmed_tokens = tokens.apply(lambda words: list(map(ps.stem, words)))
stemmed_tokens

0                                                   [good]
1                                                   [jake]
2                                [it, doe, the, job, well]
3                      [good, windscreen, for, the, money]
4           [no, more, pop, when, i, record, my, vocal, .]
                               ...                        
10256                                         [five, star]
10257    [long, life, ,, and, for, some, player, ,, a, ...
10258                                 [good, for, coat, .]
10259                                       [taylor, made]
10260    [these, string, are, realli, quit, good, ,, bu...
Name: summary, Length: 10261, dtype: object

#### Lemmatization

In [18]:
# Lemmatize every token with WordNet, keeping one list per review so the
# result lines up element-for-element with `tokens`.
# NOTE: lemmatize() is called without a part-of-speech argument, so each
# token is handled with the lemmatizer's default (noun) reading; this is why
# the output below shows e.g. 'does' -> 'doe'.
wl = WordNetLemmatizer()

lemma_tokens = tokens.apply(lambda words: list(map(wl.lemmatize, words)))
lemma_tokens

0                                                   [good]
1                                                   [jake]
2                                [it, doe, the, job, well]
3                      [good, windscreen, for, the, money]
4           [no, more, pop, when, i, record, my, vocal, .]
                               ...                        
10256                                         [five, star]
10257    [long, life, ,, and, for, some, player, ,, a, ...
10258                               [good, for, coated, .]
10259                                       [taylor, made]
10260    [these, string, are, really, quite, good, ,, b...
Name: summary, Length: 10261, dtype: object

### Print out results

In [31]:
# Find reviews where lemmatization changed at least one token substantially.
# Assumption: if a token and its lemmatized form differ a lot, the stemmed
# form of that review is likely to show changes too, so these reviews make
# good examples for comparing the three token lists.
def _token_distance(original, changed):
    """Return a rough edit distance between two tokens.

    The score is the difference in length plus the number of positions
    (over the shared prefix length) at which the characters differ.
    """
    length_gap = abs(len(original) - len(changed))
    mismatches = sum(c1 != c2 for c1, c2 in zip(original, changed))
    return length_gap + mismatches


# Positional indices of the interesting reviews. `any` records each review at
# most once; appending once per changed token would duplicate indices and
# bias any random sample drawn from this list.
test_indices = [
    i
    for i, (tokens1, tokens2) in enumerate(zip(tokens, lemma_tokens))
    if any(_token_distance(t1, t2) > 1 for t1, t2 in zip(tokens1, tokens2))
]

In [33]:
# Print up to 50 distinct example reviews, showing the raw tokens, the
# stemmed tokens and the lemmatized tokens for each.
# - A seeded generator makes the sample reproducible across kernel restarts.
# - Candidates are de-duplicated and drawn without replacement so the same
#   review is never printed twice.
# - test_indices holds positions (from enumerate), so .iloc is used for lookup.
sample_rng = np.random.default_rng(42)
candidate_indices = np.unique(test_indices)
sample_size = min(50, len(candidate_indices))
random_indices = sample_rng.choice(candidate_indices, size=sample_size, replace=False)
for i in random_indices:
    print(i)
    print(tokens.iloc[i])
    print(stemmed_tokens.iloc[i])
    print(lemma_tokens.iloc[i])
    print()


6627
['your', 'cable', 'store', 'xlr', '3', 'pin', 'microphone', 'cable', '(', '6', 'feet', ')', 'your', 'cable', 'store', 'xlr', '3', 'pin', 'microphone', 'cable', '(', '6', 'feet', ')']
['your', 'cabl', 'store', 'xlr', '3', 'pin', 'microphon', 'cabl', '(', '6', 'feet', ')', 'your', 'cabl', 'store', 'xlr', '3', 'pin', 'microphon', 'cabl', '(', '6', 'feet', ')']
['your', 'cable', 'store', 'xlr', '3', 'pin', 'microphone', 'cable', '(', '6', 'foot', ')', 'your', 'cable', 'store', 'xlr', '3', 'pin', 'microphone', 'cable', '(', '6', 'foot', ')']

3139
['will', 'likely', 'not', 'get', 'two', 'or', 'three', 'uses', 'out', 'of', 'them']
['will', 'like', 'not', 'get', 'two', 'or', 'three', 'use', 'out', 'of', 'them']
['will', 'likely', 'not', 'get', 'two', 'or', 'three', 'us', 'out', 'of', 'them']

8404
['not', '20', 'feet', ';', 'about', '6.5', 'feet', 'instead', ';', 'maybe', 'that', 'matters']
['not', '20', 'feet', ';', 'about', '6.5', 'feet', 'instead', ';', 'mayb', 'that', 'matter']
['not