# Food.com Reviews data analysis

In [134]:
pip install scikit-learn --upgrade

Note: you may need to restart the kernel to use updated packages.


In [1]:
!pip install beautifulsoup4
!pip install tqdm
!pip install NLTK



In [1]:
#Importing Necessary packages
import sqlite3
import numpy as np
import pandas as pd
import re

#Removing HTML tags using Beautiful soap
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#Establishing Connecting to the SQLite Database and listing the table names

con = sqlite3.connect(r"C:\Users\Admin\Food\database.sqlite")
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('Reviews',)]


In [3]:
#Reading SQL Table content using Pandas Function.

df = pd.read_sql_query('select * from Reviews',con)

In [6]:
#Exploring the column details

df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [7]:
df['Text'][3]

'If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal.'

In [8]:
#Function to categorize score ranges 0-5 as Positive and Negative

def score_categorize(x):
    if x < 3:
        return 'Negative'
    else:
        return 'Positive'
    

In [9]:
#Renaming the column content to category

df['Score'] = df['Score'].map(score_categorize)

In [10]:
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,Positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,Negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,Positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,Negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,Positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,Positive,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,Negative,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,Positive,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,Positive,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


# Data Cleaning

In [12]:
#Filtering duplicate records using SQL query based on User ID and Time 

pd.read_sql_query('select UserId, Time, count(*) \
                  from Reviews \
                  group by UserId,Time \
                  having count(*) > 1 \
                  order by count(*) desc \
                  ',con)

Unnamed: 0,UserId,Time,count(*)
0,A3TVZM3ZIXG8YW,1291420800,199
1,A29JUMRL1US6YP,1278201600,125
2,AJD41FBJD9010,1233360000,73
3,ABDCYK04CL6O4,1323993600,68
4,A26NFIQ7KWI8Y7,1329696000,65
...,...,...,...
76852,AZZA4Q0JACD5U,1340582400,2
76853,AZZH3GGYQSBUC,1229299200,2
76854,AZZTH6DJ0KSIP,1304208000,2
76855,AZZU4D6TZ2L6J,1247875200,2


In [13]:
#Exploring Duplicate records based on Individual user id with review at same time for a product

df[(df['UserId'] == 'A3TVZM3ZIXG8YW') & (df['Time'] == 1291420800)]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
2941,2942,B0002TJAZK,A3TVZM3ZIXG8YW,christopher hayes,7,11,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
2947,2948,B0002TJAZK,A3TVZM3ZIXG8YW,christopher hayes,0,2,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
31782,31783,B00106TG9Y,A3TVZM3ZIXG8YW,christopher hayes,2,8,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
52496,52497,B003ANFMY8,A3TVZM3ZIXG8YW,christopher hayes,19,21,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
52501,52502,B003ANFMY8,A3TVZM3ZIXG8YW,christopher hayes,18,24,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
...,...,...,...,...,...,...,...,...,...,...
499916,499917,B009B87SAC,A3TVZM3ZIXG8YW,christopher hayes,6,14,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
499917,499918,B009B87SAC,A3TVZM3ZIXG8YW,christopher hayes,6,15,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
514140,514141,B003M5VM8O,A3TVZM3ZIXG8YW,christopher hayes,5,9,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."
514690,514691,B003MWGSKY,A3TVZM3ZIXG8YW,christopher hayes,3,8,Negative,1291420800,"Filler food is empty, leaves your cat always n...","This review will make me sound really stupid, ..."


In [12]:
#Arranging Dataframe in sorted manner based on Product ID

df.sort_values(by = 'ProductId',axis=0,ascending= True)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
150528,150529,0006641040,A25ACLV5KPB4W,"Matt Hetling ""Matt""",0,1,Positive,1108425600,"Nice cadence, catchy rhymes",In June<br />I saw a charming group<br />of ro...
150506,150507,0006641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,Positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...
150505,150506,0006641040,A2IW4PEEKO2R0U,Tracy,1,1,Positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc..."
150504,150505,0006641040,A2PTSM496CF40Z,"Jason A. Teeple ""Nobody made a greater mistak...",1,1,Positive,1210809600,A classic,Get the movie or sound track and sing along wi...
150503,150504,0006641040,AQEYF1AXARWJZ,"Les Sinclair ""book maven""",1,1,Positive,1212278400,Chicken Soup with Rice,A very entertaining rhyming story--cleaver and...
...,...,...,...,...,...,...,...,...,...,...
191720,191721,B009UOFTUI,AJVB004EB0MVK,D. Christofferson,0,0,Negative,1345852800,weak coffee not good for a premium product and...,"This coffee supposedly is premium, it tastes w..."
1477,1478,B009UOFU20,AJVB004EB0MVK,D. Christofferson,0,0,Negative,1345852800,weak coffee not good for a premium product and...,"This coffee supposedly is premium, it tastes w..."
328481,328482,B009UUS05I,ARL20DSHGVM1Y,Jamie,0,0,Positive,1331856000,Perfect,The basket was the perfect sympathy gift when ...
5702,5703,B009WSNWC4,AMP7K1O84DH1T,ESTY,0,0,Positive,1351209600,DELICIOUS,Purchased this product at a local store in NY ...


In [14]:
#Dropping duplicate entries from Dataframe based on User ID , Text and Time 

df_cor = df.drop_duplicates(subset= ['UserId','Time','Text'],inplace= False)

In [15]:
df_cor.shape

(393892, 10)

In [16]:
df_cor

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,Positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,Negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,Positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,Negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,Positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,Positive,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,Negative,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,Positive,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,Positive,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [17]:
#Percentage of Data after Duplicate removed
df_cor['Id'].size / df['Id'].size * 100

69.29179845686723

In [18]:
df_cor['Score'].value_counts()

Score
Positive    336789
Negative     57103
Name: count, dtype: int64

In [20]:
df_cor['Text'].values[0:5]

array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
    

# Text Processing on Reviews given by Users

In [21]:
df_cor['Text'][21]

'I bought these for my husband who is currently overseas. He loves these, and apparently his staff likes them also.<br />There are generous amounts of Twizzlers in each 16-ounce bag, and this was well worth the price. <a href="http://www.amazon.com/gp/product/B001GVISJM">Twizzlers, Strawberry, 16-Ounce Bags (Pack of 6)</a>'

In [22]:
for x in df_cor['Text'][0:25]:
    print(x, '\n')

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. 

Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". 

This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch. 

If you are looking for the se

In [23]:
#Removing HTML tags using Beautiful soap
from bs4 import BeautifulSoup

x = BeautifulSoup(df_cor['Text'][21],'lxml')
x.get_text()

'I bought these for my husband who is currently overseas. He loves these, and apparently his staff likes them also.There are generous amounts of Twizzlers in each 16-ounce bag, and this was well worth the price. Twizzlers, Strawberry, 16-Ounce Bags (Pack of 6)'

In [24]:
x = BeautifulSoup(df_cor['Text'][21])
x.get_text()

'I bought these for my husband who is currently overseas. He loves these, and apparently his staff likes them also.There are generous amounts of Twizzlers in each 16-ounce bag, and this was well worth the price. Twizzlers, Strawberry, 16-Ounce Bags (Pack of 6)'

In [25]:
for x in df_cor['Text'][0:10]:
    print(x, '\n')

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. 

Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". 

This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch. 

If you are looking for the se

In [27]:
import re
def rephrasetext(rephrase):
    rephrase = re.sub(r"don\'t","do not",rephrase)
    rephrase = re.sub(r"didn\'t","did not",rephrase)
    rephrase = re.sub(r"couldn\'t","could not",rephrase)
    rephrase = re.sub(r"let\'s"," us",rephrase)
    rephrase = re.sub(r"that\'s"," that is",rephrase)
    rephrase = re.sub(r"won\'t"," will not",rephrase)
    rephrase = re.sub(r"it\'s"," it is",rephrase)
    rephrase = re.sub(r"can\'t"," can not",rephrase)
    rephrase = re.sub(r"hasn\'t"," has not",rephrase)
    
    rephrase = re.sub(r"\'m", " am",rephrase)
    rephrase = re.sub(r"\'d", " would",rephrase)
    rephrase = re.sub(r"\'ve"," have",rephrase)
    rephrase = re.sub(r"\'ll"," will",rephrase)
    rephrase = re.sub(r"\'re"," are",rephrase)
    rephrase = re.sub(r"let\'s"," us",rephrase)
    
    return rephrase

In [35]:
from tqdm import tqdm

final_processed_review_text = []

for x in tqdm(df_cor['Text'][0:10000]):
    #Calling rephrasetext() method to customize the words on text column during iteration.
    x = rephrasetext(x)
    
    #BeautifulSoap to remove HTML & XML tags in text
    x = BeautifulSoup(x,'lxml').get_text()
    final_processed_review_text.append(x.lower())
    print(final_processed_review_text, '\n')

  x = BeautifulSoup(x,'lxml').get_text()
  3%|██                                                                          | 279/10000 [00:00<00:03, 2789.37it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  6%|████▎                                                                        | 558/10000 [00:00<00:11, 803.72it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  7%|█████▍                                                      

 19%|██████████████▊                                                             | 1945/10000 [00:10<01:05, 123.34it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 20%|███████████████▎                                                             | 1984/10000 [00:11<01:21, 98.52it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 20%|███████████████▊                                                             | 2047/10000 [00:11<01:2

 27%|█████████████████████                                                        | 2736/10000 [00:21<01:34, 77.10it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 28%|█████████████████████▎                                                       | 2762/10000 [00:22<01:43, 70.05it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 28%|█████████████████████▌                                                       | 2808/10000 [00:23<01:5

 34%|█████████████████████████▊                                                   | 3350/10000 [00:33<01:44, 63.66it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 34%|█████████████████████████▉                                                   | 3370/10000 [00:33<02:08, 51.76it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 34%|██████████████████████████▎                                                  | 3424/10000 [00:34<01:3

 39%|██████████████████████████████▎                                              | 3944/10000 [00:45<01:43, 58.68it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 40%|██████████████████████████████▌                                              | 3965/10000 [00:46<02:11, 45.97it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 40%|██████████████████████████████▊                                              | 3996/10000 [00:47<01:4

 44%|█████████████████████████████████▊                                           | 4391/10000 [00:57<02:07, 43.98it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 44%|██████████████████████████████████                                           | 4418/10000 [00:57<01:55, 48.27it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 44%|██████████████████████████████████▎                                          | 4449/10000 [00:58<02:1

 48%|█████████████████████████████████████                                        | 4813/10000 [01:08<01:45, 49.09it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 48%|█████████████████████████████████████▏                                       | 4829/10000 [01:09<02:20, 36.74it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 49%|█████████████████████████████████████▎                                       | 4853/10000 [01:09<02:1

 52%|████████████████████████████████████████                                     | 5200/10000 [01:20<01:54, 41.93it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 52%|████████████████████████████████████████▏                                    | 5215/10000 [01:20<02:10, 36.62it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 52%|████████████████████████████████████████▎                                    | 5228/10000 [01:21<02:3

 55%|██████████████████████████████████████████▋                                  | 5548/10000 [01:31<02:19, 31.83it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 56%|██████████████████████████████████████████▉                                  | 5572/10000 [01:32<02:14, 32.80it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 56%|███████████████████████████████████████████                                  | 5585/10000 [01:33<02:2

 59%|█████████████████████████████████████████████▎                               | 5882/10000 [01:43<02:12, 31.01it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 59%|█████████████████████████████████████████████▍                               | 5901/10000 [01:44<02:04, 32.96it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 59%|█████████████████████████████████████████████▌                               | 5920/10000 [01:44<01:4

 62%|███████████████████████████████████████████████▌                             | 6185/10000 [01:54<02:06, 30.10it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 62%|███████████████████████████████████████████████▋                             | 6197/10000 [01:55<02:12, 28.76it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 62%|███████████████████████████████████████████████▉                             | 6219/10000 [01:56<02:0

 65%|█████████████████████████████████████████████████▉                           | 6479/10000 [02:06<02:39, 22.08it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 65%|██████████████████████████████████████████████████                           | 6498/10000 [02:06<01:53, 30.77it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 65%|██████████████████████████████████████████████████                           | 6509/10000 [02:07<02:0

 67%|███████████████████████████████████████████████████▉                         | 6742/10000 [02:17<02:05, 26.04it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 68%|███████████████████████████████████████████████████▉                         | 6753/10000 [02:17<02:08, 25.18it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 68%|████████████████████████████████████████████████████                         | 6767/10000 [02:18<02:0

 70%|█████████████████████████████████████████████████████▋                       | 6976/10000 [02:27<01:49, 27.59it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 70%|█████████████████████████████████████████████████████▊                       | 6986/10000 [02:28<01:55, 26.07it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 70%|█████████████████████████████████████████████████████▉                       | 6999/10000 [02:28<01:5

 72%|███████████████████████████████████████████████████████▍                     | 7199/10000 [02:37<01:52, 24.90it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 72%|███████████████████████████████████████████████████████▌                     | 7212/10000 [02:38<01:50, 25.19it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 72%|███████████████████████████████████████████████████████▌                     | 7220/10000 [02:38<02:1

 74%|█████████████████████████████████████████████████████████▏                   | 7420/10000 [02:48<01:59, 21.67it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 74%|█████████████████████████████████████████████████████████▎                   | 7436/10000 [02:49<01:51, 22.97it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 74%|█████████████████████████████████████████████████████████▎                   | 7446/10000 [02:49<01:4

 76%|██████████████████████████████████████████████████████████▊                  | 7646/10000 [02:59<01:50, 21.31it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 77%|██████████████████████████████████████████████████████████▉                  | 7656/10000 [03:00<02:02, 19.18it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 77%|███████████████████████████████████████████████████████████                  | 7670/10000 [03:00<01:4

 79%|████████████████████████████████████████████████████████████▍                | 7856/10000 [03:10<01:32, 23.08it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 79%|████████████████████████████████████████████████████████████▌                | 7865/10000 [03:10<01:38, 21.62it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 79%|████████████████████████████████████████████████████████████▋                | 7874/10000 [03:11<01:4

 81%|██████████████████████████████████████████████████████████████▏              | 8071/10000 [03:21<01:27, 22.13it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 81%|██████████████████████████████████████████████████████████████▎              | 8085/10000 [03:22<01:34, 20.25it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 81%|██████████████████████████████████████████████████████████████▎              | 8093/10000 [03:22<01:3

 83%|███████████████████████████████████████████████████████████████▋             | 8269/10000 [03:32<01:25, 20.17it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 83%|███████████████████████████████████████████████████████████████▋             | 8278/10000 [03:33<01:22, 20.76it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 83%|███████████████████████████████████████████████████████████████▊             | 8286/10000 [03:33<01:2

 85%|█████████████████████████████████████████████████████████████████            | 8457/10000 [03:43<01:22, 18.63it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 85%|█████████████████████████████████████████████████████████████████▏           | 8466/10000 [03:43<01:28, 17.24it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 85%|█████████████████████████████████████████████████████████████████▎           | 8480/10000 [03:44<01:1

 86%|██████████████████████████████████████████████████████████████████▌          | 8644/10000 [03:53<01:11, 18.85it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 87%|██████████████████████████████████████████████████████████████████▋          | 8656/10000 [03:54<01:07, 19.90it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 87%|██████████████████████████████████████████████████████████████████▋          | 8663/10000 [03:54<01:1

 88%|███████████████████████████████████████████████████████████████████▉         | 8828/10000 [04:04<00:54, 21.60it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 88%|████████████████████████████████████████████████████████████████████         | 8835/10000 [04:05<01:00, 19.23it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 88%|████████████████████████████████████████████████████████████████████▏        | 8849/10000 [04:05<01:0

 90%|█████████████████████████████████████████████████████████████████████▍       | 9010/10000 [04:15<00:51, 19.12it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 90%|█████████████████████████████████████████████████████████████████████▍       | 9024/10000 [04:15<00:48, 20.02it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 90%|█████████████████████████████████████████████████████████████████████▌       | 9036/10000 [04:16<00:4

 92%|██████████████████████████████████████████████████████████████████████▉      | 9209/10000 [04:26<00:39, 19.99it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 92%|██████████████████████████████████████████████████████████████████████▉      | 9217/10000 [04:27<00:40, 19.15it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 92%|███████████████████████████████████████████████████████████████████████      | 9231/10000 [04:27<00:4

 94%|████████████████████████████████████████████████████████████████████████▍    | 9407/10000 [04:38<00:41, 14.44it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 94%|████████████████████████████████████████████████████████████████████████▌    | 9420/10000 [04:38<00:29, 19.87it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 94%|████████████████████████████████████████████████████████████████████████▌    | 9427/10000 [04:39<00:3

 96%|█████████████████████████████████████████████████████████████████████████▊   | 9593/10000 [04:49<00:22, 17.76it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 96%|█████████████████████████████████████████████████████████████████████████▉   | 9600/10000 [04:49<00:22, 17.50it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 96%|█████████████████████████████████████████████████████████████████████████▉   | 9607/10000 [04:50<00:2

 98%|███████████████████████████████████████████████████████████████████████████▏ | 9764/10000 [04:59<00:13, 17.49it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 98%|███████████████████████████████████████████████████████████████████████████▏ | 9771/10000 [05:00<00:13, 17.01it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 98%|███████████████████████████████████████████████████████████████████████████▎ | 9785/10000 [05:01<00:1

 99%|████████████████████████████████████████████████████████████████████████████▌| 9938/10000 [05:10<00:03, 17.52it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 99%|████████████████████████████████████████████████████████████████████████████▌| 9945/10000 [05:11<00:03, 16.99it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|████████████████████████████████████████████████████████████████████████████▋| 9952/10000 [05:11<00:0

In [36]:
final_processed_review_text[0: 10000]


['i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than  most.',
 'product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as "jumbo".',
 'this is a confection that has been around a few centuries.  it is a light, pillowy citrus gelatin with nuts - in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar.  and it is a tiny mouthful of heaven.  not too chewy, and very flavorful.  i highly recommend this yummy treat.  if you are familiar with the story of c.s. lewis\' "the lion, the witch, and the wardrobe" - this is the treat that seduces edmund into selling out his brother and sisters to the witch.',
 'if you are looking f

In [37]:
df_cor['Text'][0:10].values

array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
    

In [38]:

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Bag of Words
Bag-of-words(BoW) is a statistical language model used to analyze text and documents based on word count. The model does not account for word order within a document. BoW can be implemented as a Python dictionary with each key set to a word and each value set to the number of times that word appears in a text.

In [91]:
#Initializing objects from SKlearn
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [146]:
bag_of_words_final = cv.fit_transform(final_processed_review_text)

feature_names = cv.get_feature_names_out()
print("Feature_Names",feature_names)
print('-' * 100)

#bag_of_words_final = cv.transform(final_processed_review_text)

print("Unique Word List: \n", feature_names.shape) #Total Unique words extracted from our list
print("Bag of Words", bag_of_words_final.shape)



Feature_Names ['00' '000' '0003' ... 'zupas' 'zuppa' 'ît']
----------------------------------------------------------------------------------------------------
Unique Word List: 
 ['00' '000' '0003' ... 'zupas' 'zuppa' 'ît']
Bag of Words (10000, 19777)


In [148]:
bag_of_words_final.shape

(10000, 19777)

In [78]:
final_processed_review_text[0:20]

['i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than  most.',
 'product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as "jumbo".',
 'this is a confection that has been around a few centuries.  it is a light, pillowy citrus gelatin with nuts - in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar.  and it is a tiny mouthful of heaven.  not too chewy, and very flavorful.  i highly recommend this yummy treat.  if you are familiar with the story of c.s. lewis\' "the lion, the witch, and the wardrobe" - this is the treat that seduces edmund into selling out his brother and sisters to the witch.',
 'if you are looking f