# Text Processing (Butter and Cheese Reviews)

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import re
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import nltk
nltk.download('omw-1.4')


import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kftsu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kftsu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
#importing dataset
train = pd.read_csv('butter_cheese_review_ie.csv')

In [3]:
train.head(5)

Unnamed: 0,Review,Brand,Category
0,"I didn’t get the flavor I was expecting, espec...",Irish,Butter
1,Kerrygold is not a dairy in Ireland. It is jus...,Irish,Butter
2,This is an excellent butter for eating but ter...,Irish,Butter
3,I purchased an 8 oz at the local Kroger for 3....,Irish,Butter
4,And I'm picky about the dairy I use. save your...,Irish,Butter


In [4]:
train.shape

(50, 3)

In [5]:
train['word_count'] = train['Review'].apply(lambda x: len(str(x).split(" ")))

In [6]:
# adding a column for the number of words of the review
train[['Review','word_count']].head()

Unnamed: 0,Review,word_count
0,"I didn’t get the flavor I was expecting, espec...",19
1,Kerrygold is not a dairy in Ireland. It is jus...,75
2,This is an excellent butter for eating but ter...,34
3,I purchased an 8 oz at the local Kroger for 3....,38
4,And I'm picky about the dairy I use. save your...,24


In [7]:
train['char_count'] = train['Review'].str.len() 

In [8]:
# Adding a column for the number of characters of the review. This also includes spaces

train[['Review','char_count']].head()

Unnamed: 0,Review,char_count
0,"I didn’t get the flavor I was expecting, espec...",97
1,Kerrygold is not a dairy in Ireland. It is jus...,419
2,This is an excellent butter for eating but ter...,181
3,I purchased an 8 oz at the local Kroger for 3....,199
4,And I'm picky about the dairy I use. save your...,115


In [9]:
train.head(5)

Unnamed: 0,Review,Brand,Category,word_count,char_count
0,"I didn’t get the flavor I was expecting, espec...",Irish,Butter,19,97
1,Kerrygold is not a dairy in Ireland. It is jus...,Irish,Butter,75,419
2,This is an excellent butter for eating but ter...,Irish,Butter,34,181
3,I purchased an 8 oz at the local Kroger for 3....,Irish,Butter,38,199
4,And I'm picky about the dairy I use. save your...,Irish,Butter,24,115


#### Average word

In [10]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))


In [11]:
train['avg_word'] = train['Review'].apply(lambda x: avg_word(x))

In [12]:
train[['Review','avg_word']].head()

Unnamed: 0,Review,avg_word
0,"I didn’t get the flavor I was expecting, espec...",4.157895
1,Kerrygold is not a dairy in Ireland. It is jus...,4.6
2,This is an excellent butter for eating but ter...,4.352941
3,I purchased an 8 oz at the local Kroger for 3....,4.263158
4,And I'm picky about the dairy I use. save your...,3.833333


#### Natural Language Processing

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kftsu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<font color = red> *This means there are stopwords in the reviews.
I wanted to look at the stopwords.

In [14]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [15]:
#Counting the number of stop words

train['stopwords'] = train['Review'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['Review','stopwords']].head()

Unnamed: 0,Review,stopwords
0,"I didn’t get the flavor I was expecting, espec...",7
1,Kerrygold is not a dairy in Ireland. It is jus...,30
2,This is an excellent butter for eating but ter...,14
3,I purchased an 8 oz at the local Kroger for 3....,12
4,And I'm picky about the dairy I use. save your...,12


In [16]:
#Counting number of hashtags present in the review

train['hastags'] = train['Review'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['Review','hastags']].head()

Unnamed: 0,Review,hastags
0,"I didn’t get the flavor I was expecting, espec...",0
1,Kerrygold is not a dairy in Ireland. It is jus...,0
2,This is an excellent butter for eating but ter...,0
3,I purchased an 8 oz at the local Kroger for 3....,0
4,And I'm picky about the dairy I use. save your...,0


In [17]:
# Counting the numerics in the review

train['numerics'] = train['Review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['Review','numerics']].head()

Unnamed: 0,Review,numerics
0,"I didn’t get the flavor I was expecting, espec...",0
1,Kerrygold is not a dairy in Ireland. It is jus...,0
2,This is an excellent butter for eating but ter...,0
3,I purchased an 8 oz at the local Kroger for 3....,1
4,And I'm picky about the dairy I use. save your...,0


In [18]:
# number of uppercases present in reviews

train['upper'] = train['Review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['Review','upper']].head()

Unnamed: 0,Review,upper
0,"I didn’t get the flavor I was expecting, espec...",3
1,Kerrygold is not a dairy in Ireland. It is jus...,1
2,This is an excellent butter for eating but ter...,1
3,I purchased an 8 oz at the local Kroger for 3....,2
4,And I'm picky about the dairy I use. save your...,1


I wanted to transform uppercases to lowercase to avoid having multiple copies of the same words.

In [19]:
#transforming uppercases to lowercase

train['Review'] = train['Review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['Review'].head()

0    i didn’t get the flavor i was expecting, espec...
1    kerrygold is not a dairy in ireland. it is jus...
2    this is an excellent butter for eating but ter...
3    i purchased an 8 oz at the local kroger for 3....
4    and i'm picky about the dairy i use. save your...
Name: Review, dtype: object

I wanted to remove special characters in the reviews.

In [20]:
#removing special characters

train['Review'] = train['Review'].str.replace('[^\w\s]','')
train['Review'].head()

  train['Review'] = train['Review'].str.replace('[^\w\s]','')


0    i didnt get the flavor i was expecting especia...
1    kerrygold is not a dairy in ireland it is just...
2    this is an excellent butter for eating but ter...
3    i purchased an 8 oz at the local kroger for 39...
4    and im picky about the dairy i use save your m...
Name: Review, dtype: object

I wanted to make a list of stopwords so I can remove them.

In [21]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['Review'] = train['Review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['Review'].head()

0    didnt get flavor expecting especially price wa...
1    kerrygold dairy ireland umbrella marketing bra...
2    excellent butter eating terrible baking makes ...
3    purchased 8 oz local kroger 399 also purchased...
4    im picky dairy use save money good cheaper eve...
Name: Review, dtype: object

I wanted to remove the common words

In [22]:
#count of common words
freq = pd.Series(' '.join(train['Review']).split()).value_counts()[:10]
freq

butter       53
great        15
taste        15
good         10
like          9
use           8
well          8
love          7
kerrygold     7
brand         7
dtype: int64

In [23]:
#Removing the most common words

freq = list(freq.index)
train['Review'] = train['Review'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['Review'].head()

0    didnt get flavor expecting especially price wa...
1    dairy ireland umbrella marketing brandname man...
2    excellent eating terrible baking makes cookies...
3    purchased 8 oz local kroger 399 also purchased...
4        im picky dairy save money cheaper even better
Name: Review, dtype: object

We can see that the common words are removed as their presence will not of any use in classification of my text data.

Next, I wanted to remove the rare words. This is because the association between these rare words and other words is dominated by noise.

In [24]:
freq = pd.Series(' '.join(train['Review']).split()).value_counts()[-10:]
freq

reading     1
tried       1
today       1
lol         1
problem     1
italian     1
cheeseby    1
anymore     1
probobly    1
result      1
dtype: int64

In [25]:
#Removing the most rare words

freq = list(freq.index)
train['Review'] = train['Review'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['Review'].head()

0    didnt get flavor expecting especially price wa...
1    dairy ireland umbrella marketing brandname man...
2    excellent eating terrible baking makes cookies...
3    purchased 8 oz local kroger 399 also purchased...
4        im picky dairy save money cheaper even better
Name: Review, dtype: object

I wanted to correct spelling of the reviews.

In [26]:
#correcting spelling

from textblob import TextBlob
train['Review'][:5].apply(lambda x: str(TextBlob(x).correct()))


0    didn get flavor expecting especially price wan...
1    dairy ireland umbrella marketing brandname man...
2    excellent eating terrible baking makes colonie...
3    purchased 8 oz local roger 399 also purchased ...
4         in pick dairy save money cheaper even better
Name: Review, dtype: object

#### Tokenization
-dividing the text into a sequence of words or sentences. (From David's lecture notes)

In [27]:
#Tokenization

nltk.download('punkt')
TextBlob(train['Review'][1]).words

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kftsu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


WordList(['dairy', 'ireland', 'umbrella', 'marketing', 'brandname', 'many', 'different', 'dairies', 'europe', 'considered', 'premium', 'ordinary', 'yellower', 'color', 'american', 'butters', 'ill', 'give', 'disappointing', 'many', 'common', 'brands', 'even', 'kind', 'served', 'cafeterias', 'much', 'better', 'wont', 'purchasing'])

#### Stemming
-removal of suffices, like “ing”, “ly”, “s”, etc. by a simple rule-based 
approach. For this purpose, I will use PorterStemmer from the NLTK library. (From David's Lecture notes)

In [28]:
#Stemming

from nltk.stem import PorterStemmer
st = PorterStemmer()
train['Review'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0       didnt get flavor expect especi price want much
1    dairi ireland umbrella market brandnam mani di...
2    excel eat terribl bake make cooki fall apart l...
3    purchas 8 oz local kroger 399 also purchas pou...
4        im picki dairi save money cheaper even better
Name: Review, dtype: object

#### Lemmatization
-converts the word into its root word, rather than just stripping the suffices. It makes use of the vocabulary and does a morphological analysis to obtain the root word (From David's Lecture notes)

In [29]:
# Lemmatization

nltk.download('wordnet')
from textblob import Word
train['Review'] = train['Review'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['Review'].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kftsu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    didnt get flavor expecting especially price wa...
1    dairy ireland umbrella marketing brandname man...
2    excellent eating terrible baking make cooky fa...
3    purchased 8 oz local kroger 399 also purchased...
4        im picky dairy save money cheaper even better
Name: Review, dtype: object

I wanted to extract bigrams from the reviews using the ngrams function of the textblob. I wanted to capture the language structure, like what letter or word is likely to follow the given one. I wanted to work with bigrams to capture general knowlege. (From David's lecture notes)

In [30]:
TextBlob(train['Review'][0]).ngrams(2)

[WordList(['didnt', 'get']),
 WordList(['get', 'flavor']),
 WordList(['flavor', 'expecting']),
 WordList(['expecting', 'especially']),
 WordList(['especially', 'price']),
 WordList(['price', 'wanted']),
 WordList(['wanted', 'much'])]

#### Term frequency
-the ratio of the count of a word present in a sentence, to the length of the sentence. (From David's lecture notes)

In [31]:
#term frequency

tf1 = (train['Review'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,dairy,2
1,many,2
2,ill,1
3,wont,1
4,better,1
5,much,1
6,cafeteria,1
7,served,1
8,kind,1
9,even,1


#### Inverse Document Frequency (IDF)
-the log of the ratio of the total number of rows to the 
number of rows in which that word is present. The more the value of IDF, the more unique is the word. (From David's Lecture notes)


In [32]:
#getting IDF

import numpy as np

for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['Review'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,dairy,2,2.813411
1,many,2,3.912023
2,ill,1,2.525729
3,wont,1,3.218876
4,better,1,2.525729
5,much,1,2.302585
6,cafeteria,1,3.912023
7,served,1,2.813411
8,kind,1,3.912023
9,even,1,2.525729


#### TF-IDF
-the multiplication of the TF and IDF calculated above.The importance of a term is inversely related to its frequency across documents (Capitalone.com)

In [33]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,dairy,2,2.813411,5.626821
1,many,2,3.912023,7.824046
2,ill,1,2.525729,2.525729
3,wont,1,3.218876,3.218876
4,better,1,2.525729,2.525729
5,much,1,2.302585,2.302585
6,cafeteria,1,3.912023,3.912023
7,served,1,2.813411,2.813411
8,kind,1,3.912023,3.912023
9,even,1,2.525729,2.525729


#### Bag of Words (BoW)
-representation of text which describes the presence of words within the text data. The intuition behind this is that two similar text fields will contain similar kind of words, and will therefore have a similar bag of words. (From David's Lecture notes)

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['Review'])

train_vect

<50x340 sparse matrix of type '<class 'numpy.float64'>'
	with 482 stored elements in Compressed Sparse Row format>

In [35]:
print(train_vect)

  (0, 322)	0.43557855527610295
  (0, 232)	0.43557855527610295
  (0, 104)	0.43557855527610295
  (0, 110)	0.3939118169288144
  (0, 127)	0.2931190359191502
  (0, 85)	0.43557855527610295
  (1, 239)	0.20286506163374038
  (1, 332)	0.18345931876482977
  (1, 29)	0.15901097411710918
  (1, 45)	0.20286506163374038
  (1, 271)	0.20286506163374038
  (1, 167)	0.20286506163374038
  (1, 35)	0.20286506163374038
  (1, 59)	0.20286506163374038
  (1, 89)	0.20286506163374038
  (1, 154)	0.20286506163374038
  (1, 40)	0.18345931876482977
  (1, 16)	0.18345931876482977
  (1, 57)	0.15901097411710918
  (1, 337)	0.20286506163374038
  (1, 213)	0.20286506163374038
  (1, 230)	0.20286506163374038
  (1, 64)	0.20286506163374038
  (1, 106)	0.20286506163374038
  (1, 86)	0.18345931876482977
  :	:
  (43, 310)	0.26957497045151596
  (43, 236)	0.22702608343277972
  (44, 231)	0.5773502691896257
  (44, 7)	0.5773502691896257
  (44, 186)	0.5773502691896257
  (45, 275)	0.7556627889402567
  (45, 29)	0.6549608762445532
  (46, 103)	0.42

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['Review'])
train_bow

<50x383 sparse matrix of type '<class 'numpy.int64'>'
	with 561 stored elements in Compressed Sparse Row format>

In [37]:
print(train_bow)

  (0, 94)	1
  (0, 158)	1
  (0, 144)	1
  (0, 125)	1
  (0, 115)	1
  (0, 264)	1
  (0, 361)	1
  (0, 228)	1
  (1, 228)	1
  (1, 86)	2
  (1, 184)	1
  (1, 355)	1
  (1, 220)	1
  (1, 42)	1
  (1, 218)	2
  (1, 95)	1
  (1, 117)	1
  (1, 72)	1
  (1, 262)	1
  (1, 245)	1
  (1, 379)	1
  (1, 65)	1
  (1, 18)	1
  (1, 46)	1
  (1, 178)	1
  :	:
  (44, 263)	1
  (45, 35)	1
  (45, 312)	1
  (46, 76)	1
  (46, 106)	1
  (46, 217)	1
  (46, 267)	1
  (46, 11)	1
  (46, 113)	1
  (48, 39)	1
  (48, 143)	1
  (48, 348)	1
  (48, 49)	1
  (48, 54)	1
  (49, 65)	1
  (49, 15)	1
  (49, 84)	1
  (49, 316)	1
  (49, 232)	1
  (49, 378)	1
  (49, 273)	1
  (49, 374)	1
  (49, 282)	1
  (49, 28)	1
  (49, 368)	1


### Sentiment Analysis

In [38]:
train['Review'][:5].apply(lambda x: TextBlob(x).sentiment)

0                                     (0.1, 0.6)
1     (0.045000000000000005, 0.5700000000000001)
2      (0.09999999999999999, 0.7666666666666666)
3    (-0.03714285714285713, 0.22000000000000003)
4                                     (0.5, 0.5)
Name: Review, dtype: object

In [39]:
train['sentiment'] = train['Review'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['Review','sentiment']].head()

Unnamed: 0,Review,sentiment
0,didnt get flavor expecting especially price wa...,0.1
1,dairy ireland umbrella marketing brandname man...,0.045
2,excellent eating terrible baking make cooky fa...,0.1
3,purchased 8 oz local kroger 399 also purchased...,-0.037143
4,im picky dairy save money cheaper even better,0.5


In [40]:
train.head(5)

Unnamed: 0,Review,Brand,Category,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,sentiment
0,didnt get flavor expecting especially price wa...,Irish,Butter,19,97,4.157895,7,0,0,3,0.1
1,dairy ireland umbrella marketing brandname man...,Irish,Butter,75,419,4.6,30,0,0,1,0.045
2,excellent eating terrible baking make cooky fa...,Irish,Butter,34,181,4.352941,14,0,0,1,0.1
3,purchased 8 oz local kroger 399 also purchased...,Irish,Butter,38,199,4.263158,12,0,1,2,-0.037143
4,im picky dairy save money cheaper even better,Irish,Butter,24,115,3.833333,12,0,0,1,0.5


In [41]:
train.tail(5)

Unnamed: 0,Review,Brand,Category,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,sentiment
45,simply better,Dutch,Butter,3,19,5.666667,0,0,0,0,0.5
46,making cooky produced addition end dutch,Dutch,Butter,14,80,4.785714,5,0,0,0,0.0
47,,Dutch,Butter,2,7,3.0,1,0,0,0,0.0
48,first time bought canned buy,Dutch,Butter,19,97,4.157895,6,0,0,2,0.25
49,smooth work recipe also put back winter nice c...,Dutch,Butter,30,158,4.3,11,0,0,0,0.25


In [42]:
train

Unnamed: 0,Review,Brand,Category,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,sentiment
0,didnt get flavor expecting especially price wa...,Irish,Butter,19,97,4.157895,7,0,0,3,0.1
1,dairy ireland umbrella marketing brandname man...,Irish,Butter,75,419,4.6,30,0,0,1,0.045
2,excellent eating terrible baking make cooky fa...,Irish,Butter,34,181,4.352941,14,0,0,1,0.1
3,purchased 8 oz local kroger 399 also purchased...,Irish,Butter,38,199,4.263158,12,0,1,2,-0.037143
4,im picky dairy save money cheaper even better,Irish,Butter,24,115,3.833333,12,0,0,1,0.5
5,drastically priced,Irish,Butter,3,23,7.0,1,0,0,0,0.0
6,dont know pricey,Irish,Butter,11,54,4.0,6,0,0,1,0.0
7,ive read 90 grass fed finished soy corn woman ...,Irish,Butter,38,218,4.763158,13,0,0,0,-0.3125
8,creamy smooth texture nice,Irish,Butter,6,38,5.5,0,0,0,0,0.5
9,taste creamer get buying dealer found costco s...,Irish,Butter,24,130,4.458333,11,0,0,0,0.0


In [43]:
#adding polarity rating

train['Polarity_Rating'] = train['sentiment'].apply(lambda x: 'Positive' if x > 0 else('Neutral' if x == 0 else 'Negative'))

In [44]:
train

Unnamed: 0,Review,Brand,Category,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,sentiment,Polarity_Rating
0,didnt get flavor expecting especially price wa...,Irish,Butter,19,97,4.157895,7,0,0,3,0.1,Positive
1,dairy ireland umbrella marketing brandname man...,Irish,Butter,75,419,4.6,30,0,0,1,0.045,Positive
2,excellent eating terrible baking make cooky fa...,Irish,Butter,34,181,4.352941,14,0,0,1,0.1,Positive
3,purchased 8 oz local kroger 399 also purchased...,Irish,Butter,38,199,4.263158,12,0,1,2,-0.037143,Negative
4,im picky dairy save money cheaper even better,Irish,Butter,24,115,3.833333,12,0,0,1,0.5,Positive
5,drastically priced,Irish,Butter,3,23,7.0,1,0,0,0,0.0,Neutral
6,dont know pricey,Irish,Butter,11,54,4.0,6,0,0,1,0.0,Neutral
7,ive read 90 grass fed finished soy corn woman ...,Irish,Butter,38,218,4.763158,13,0,0,0,-0.3125,Negative
8,creamy smooth texture nice,Irish,Butter,6,38,5.5,0,0,0,0,0.5,Positive
9,taste creamer get buying dealer found costco s...,Irish,Butter,24,130,4.458333,11,0,0,0,0.0,Neutral


In [45]:
train= train[['Review', 'Polarity_Rating']]
train.head(10)

Unnamed: 0,Review,Polarity_Rating
0,didnt get flavor expecting especially price wa...,Positive
1,dairy ireland umbrella marketing brandname man...,Positive
2,excellent eating terrible baking make cooky fa...,Positive
3,purchased 8 oz local kroger 399 also purchased...,Negative
4,im picky dairy save money cheaper even better,Positive
5,drastically priced,Neutral
6,dont know pricey,Neutral
7,ive read 90 grass fed finished soy corn woman ...,Negative
8,creamy smooth texture nice,Positive
9,taste creamer get buying dealer found costco s...,Neutral


In [46]:
train.tail(10)

Unnamed: 0,Review,Polarity_Rating
40,rich cant stop putting cooky,Positive
41,tasting bit expensive side amount surely make ...,Positive
42,mom buy dutch came round yellow cow design fro...,Positive
43,product come clear using toast omelet dish fla...,Positive
44,loved ability keep preserved,Positive
45,simply better,Positive
46,making cooky produced addition end dutch,Neutral
47,,Neutral
48,first time bought canned buy,Positive
49,smooth work recipe also put back winter nice c...,Positive


In [47]:
train.shape

(50, 2)

I wanted to apply One hot encoding on negative, neutral, and positive

In [48]:
one_hot = pd.get_dummies(train["Polarity_Rating"])
train.drop(['Polarity_Rating'],axis=1,inplace=True)
train = pd.concat([train,one_hot],axis=1)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['Polarity_Rating'],axis=1,inplace=True)


Unnamed: 0,Review,Negative,Neutral,Positive
0,didnt get flavor expecting especially price wa...,0,0,1
1,dairy ireland umbrella marketing brandname man...,0,0,1
2,excellent eating terrible baking make cooky fa...,0,0,1
3,purchased 8 oz local kroger 399 also purchased...,1,0,0
4,im picky dairy save money cheaper even better,0,0,1


Applying Train Test Split

In [49]:
X = train['Review'].values
y = train.drop('Review', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

Applying Vectorization

In [50]:
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

Applying frequency, inverse document frequency

In [51]:
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
X_train = X_train.toarray()
X_test = X_test.toarray()

Adding different layers

In [52]:
model = Sequential()

model.add(Dense(units=12673,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=4000,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=500,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=3, activation='softmax'))

opt=tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

Fitting the Model

In [53]:
# converting to float type
X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)

In [54]:
model.fit(x=X_train, y=y_train, batch_size=256, epochs=100, validation_data=(X_test, y_test), verbose=1, callbacks=early_stop)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: early stopping


<keras.callbacks.History at 0x1b40a8c5400>

Evaluating of Model

In [55]:
model_score = model.evaluate(X_test, y_test, batch_size=64, verbose=1)
print('Test accuracy:', model_score[1])

Test accuracy: 0.800000011920929


Prediction

In [56]:
preds = model.predict(X_test)
preds



array([[2.4836038e-01, 4.6408600e-01, 2.8755361e-01],
       [1.4174782e-01, 5.5470020e-01, 3.0355197e-01],
       [2.4836038e-01, 4.6408600e-01, 2.8755361e-01],
       [8.5389741e-02, 6.2101152e-02, 8.5250914e-01],
       [2.7765608e-03, 8.5645297e-04, 9.9636698e-01],
       [1.6613049e-02, 9.3001910e-03, 9.7408676e-01],
       [7.1813978e-02, 9.8141640e-02, 8.3004439e-01],
       [1.1641463e-02, 6.2373206e-03, 9.8212123e-01],
       [1.6613049e-02, 9.3001910e-03, 9.7408676e-01],
       [3.7436872e-03, 1.0016406e-03, 9.9525476e-01],
       [5.0906550e-02, 3.5795685e-02, 9.1329777e-01],
       [2.8177768e-02, 1.5605560e-02, 9.5621669e-01],
       [2.0263985e-02, 8.6981859e-03, 9.7103781e-01],
       [7.2528585e-03, 3.1044241e-03, 9.8964280e-01],
       [3.9283834e-02, 2.4325820e-02, 9.3639034e-01]], dtype=float32)

In [57]:
preds.shape

(15, 3)