<div style="line-height:0.5">
<h1 style="color:#DFD903 ">  Embeddings with Keras </h1>
</div>
<div style="margin-bottom: -2px; line-height:0.3">
<h4> Kaggle tutorial. Gensim Text classification. <br>
</h4>
</div>
<div style="margin-bottom: -33px; line-height:1.8">
<span style="display: inline-block;">
    <h3 style="color: lightblue; display: inline;">Keywords:</h3> ignore CUDA warnings with environ + Natural Language Toolkit (nltk) + BeautifulSoup + Word2Vec + KeyedVectors + mlxtend

</span>
</div>
<br>
<div style="line-height:1.4">
<span style="display: inline-block;">
    <h3 style="color: red; display: ">Notes:</h3> 
    <div style="margin-top: -20px;">
    1) The "word2vec-google-news-300" dataset was not uploaded because of its size; it can be downloaded from "https://huggingface.co/fse/word2vec-google-news-300". <br>
    2) The 'Reviews.csv' dataset was not uploaded either, since it is larger than 280 MB. <br>
    3) Prior to TensorFlow 2.0, it was possible to use Keras as a standalone library without TensorFlow, as in this example. <br>
&emsp;&emsp;&emsp;However, in recent versions of TensorFlow, Keras has been tightly integrated into TensorFlow, <br> &emsp;&emsp;&emsp;and the standalone Keras library is no longer actively developed.
</div>
</span>
</div>

In [1]:
import os
import warnings

# Hide TensorFlow's native log output (this is what silences the CUDA warnings).
# It is set here, ahead of the TensorFlow/Keras import further down, so the
# value is already in the environment when that library is loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Silence Python warnings for the whole notebook.
# The former filterwarnings('always') call was dead code: the 'ignore' filter
# registered after it is consulted first and matches every warning.
warnings.filterwarnings('ignore')

In [2]:
%%script echo Skipping since requirement already satisfied
!pip install wget
import wget

Skipping since requirement already satisfied


In [14]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

from google.colab import files

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk import ne_chunk

from bs4 import BeautifulSoup

from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

import keras
from keras.preprocessing.text import one_hot, Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
#from keras.utils.data_utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Dense, Flatten, Embedding, Input, LSTM

import gensim
import gensim.downloader as api
from gensim.downloader import base_dir
from gensim.models import Word2Vec, KeyedVectors

from mlxtend.plotting import plot_confusion_matrix

<h2 style="color:#DFD903 "> <u> Example 1 </u></h2>

<h3 style="color:#DFD903"> Note: </h3>    
<div style="margin-top: -20px;">
Without a GPU (or with an old CPU), the kernel will probably crash while executing the code! <br>
Message => "Canceled future for execute_request message before replies were done." <br>
</div>

In [4]:
%%script echo Skipping since it does not work anymore!
#wv = KeyedVectors.load_word2vec_format('./word2vec-google-news-300/word2vec-google-news-300.gz', binary=False, encoding='latin-1', limit=1000)

Skipping since it does not work anymore!


In [5]:
#  load a pre-trained Word2Vec word embedding model (N.B. Slow process)
wv = api.load('word2vec-google-news-300')



In [6]:
# When word2vec-google-news-300 already present in the system
def load_data(limit=None):
    """Load the locally stored "word2vec-google-news-300" vectors.

    The archive is read from the "word2vec-google-news-300" folder inside
    gensim's download directory (``base_dir``); nothing is downloaded here,
    so the file must already be present on disk.

    Parameters
    ----------
    limit : int, optional
        Maximum number of word vectors to read from the file. ``None``
        (the default) reads every vector, which is the previous behaviour.
        A smaller value reads fewer vectors, which may help on machines
        where the full load exhausts memory.

    Returns
    -------
    The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    path = os.path.join(base_dir, 'word2vec-google-news-300', "word2vec-google-news-300.gz")
    # The file is parsed as binary word2vec data (binary=True).
    return KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
wv = load_data()

In [7]:
# similar words
wv.most_similar(positive=['friend'], topn=5)

[('pal', 0.7476359009742737),
 ('friends', 0.7098034024238586),
 ('buddy', 0.6972493529319763),
 ('dear_friend', 0.6960037350654602),
 ('acquaintance', 0.6843010783195496)]

In [8]:
wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)

[('queen', 0.7118193507194519)]

In [9]:
# Word2Vec vector dimension.
# Only the last expression of a cell is displayed, so the length has to be
# printed explicitly; otherwise it is computed and silently discarded.
print(len(wv['friend']))
wv['friend']

array([ 0.07080078, -0.21386719,  0.15332031,  0.09423828, -0.03442383,
        0.43359375, -0.16503906, -0.05786133,  0.17578125, -0.08203125,
        0.24511719, -0.19335938, -0.0255127 , -0.09619141, -0.125     ,
        0.02575684,  0.16796875, -0.03759766,  0.09472656, -0.04760742,
        0.20605469,  0.31835938,  0.15917969, -0.17089844,  0.09033203,
       -0.1640625 , -0.15234375,  0.3125    ,  0.06298828, -0.24902344,
        0.15625   , -0.04516602, -0.12890625, -0.00686646, -0.02160645,
        0.14453125,  0.2734375 ,  0.12695312,  0.10742188,  0.11376953,
        0.14355469, -0.00173187,  0.22851562, -0.03515625,  0.17089844,
        0.04516602, -0.07958984, -0.08886719, -0.01342773, -0.09667969,
       -0.12597656,  0.10595703,  0.15332031, -0.03808594,  0.02246094,
        0.01428223, -0.03295898,  0.20703125, -0.03417969,  0.02233887,
        0.00244141,  0.13476562, -0.01403809,  0.13378906,  0.0201416 ,
        0.14746094,  0.00759888, -0.18652344,  0.16113281,  0.10

<h2 style="color:#DFD903 "> <u> Example 2 </u></h2>

Natural Language Toolkit (nltk) provides interfaces to over 50 corpora and lexical resources, such as WordNet. This example is based on the Amazon food reviews data.

In [12]:
#%%script echo Skipping since package already downloaded
nltk.download('stopwords')
stop_words=set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
uploaded_csv = files.upload()

Saving Reviews.csv to Reviews.csv


In [16]:
%%script echo Skipping when Colab is used
rev_frame = pd.read_csv(r'./Reviews.csv')
df = rev_frame.copy()
df.head()

Skipping when Colab is used


In [17]:
# Access the uploaded CSV file's content.
# `uploaded_csv` is the value returned by files.upload() in the previous cell
# (presumably a mapping keyed by file name -- TODO confirm).
if "Reviews.csv" in uploaded_csv:
    # The file is read from the working directory, not from `uploaded_csv` itself.
    df = pd.read_csv("Reviews.csv")
    print(df.head())
else:
    # Nothing is loaded in this branch: this cell does not assign `df`.
    print("CSV file not uploaded.")

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [19]:
df=df[['Text','Score']]


In [20]:
# Rename the two kept columns in one non-mutating step.
# The previous version added new columns to the frame produced by the column
# selection above and then dropped the old ones in place, which is the pattern
# that triggers SettingWithCopyWarning and fails with a KeyError when the cell
# is run a second time. rename() returns a new frame with the same two columns
# in the same order: 'review', 'rating'.
df = df.rename(columns={'Text': 'review', 'Score': 'rating'})

In [21]:
print(df.shape)
df.head()

(568454, 2)


Unnamed: 0,review,rating
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [23]:
# Check for null values
df['review'].isnull().sum()

0

In [24]:
df['rating'].isnull().sum()

0

In [25]:
# Remove duplicates: among rows sharing the same rating and review text,
# keep only the first occurrence. The frame is modified in place.
df.drop_duplicates(subset=['rating','review'],keep='first',inplace=True)

In [26]:
# Check the shape again: a smaller row count than before shows that the data did contain duplicate rows.
print(df.shape)
df.head()

(393675, 2)


Unnamed: 0,review,rating
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [27]:
# printing some reviews to see insights.
for review in df['review'][:5]:
    print(review+'\n'+'\n')

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.


Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".


This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.


If you are looking for the se

In [28]:
def mark_sentiment(rating):
    """Turn a rating into a binary sentiment label.

    Returns 0 when ``rating <= 3`` holds and 1 in every other case.
    """
    return 0 if rating <= 3 else 1

In [29]:
df['sentiment']=df['rating'].apply(mark_sentiment)


In [30]:
df.drop(['rating'],axis=1,inplace=True)


In [31]:
df.head()


Unnamed: 0,review,sentiment
0,I have bought several of the Vitality canned d...,1
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,1
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,1


In [32]:
df['sentiment'].value_counts()

1    306819
0     86856
Name: sentiment, dtype: int64

### => Pre-processing

In [33]:
# function to clean and pre-process the text.
# Lemmatizer and stop-word set shared by every clean_reviews() call.
# Left empty here and filled on first use, so defining this cell needs no NLTK data.
_CLEANING_RESOURCES = {}


def _cleaning_resources():
    """Return the (lemmatizer, stop-word set) pair, building it only once."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _CLEANING_RESOURCES['stop_words'] = frozenset(stopwords.words("english"))
    return _CLEANING_RESOURCES['lemmatizer'], _CLEANING_RESOURCES['stop_words']


def clean_reviews(review):
    """Clean and normalise one piece of review text.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case and split on whitespace, drop English stop words and
    lemmatize the remaining tokens.

    Parameters
    ----------
    review : str
        Raw text (a whole review or a single sentence).

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # 1. Removing html tags
    review_text = BeautifulSoup(review, "lxml").get_text()

    # 2. Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Converting to lower case and splitting
    word_tokens = review_text.lower().split()

    # 4. Remove stopwords, then lemmatize what is left.
    #    The lemmatizer and stop-word set are reused across calls instead of
    #    being rebuilt for every sentence.
    lemmatizer, stop_words = _cleaning_resources()
    return " ".join(lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words)

In [34]:
# Take the first 50000 reviews of each sentiment class.
is_positive = df['sentiment'] == 1
is_negative = df['sentiment'] == 0
pos_df = df[is_positive].iloc[:50000]
neg_df = df[is_negative].iloc[:50000]

In [35]:
pos_df.head()


Unnamed: 0,review,sentiment
0,I have bought several of the Vitality canned d...,1
2,This is a confection that has been around a fe...,1
4,Great taffy at a great price. There was a wid...,1
5,I got a wild hair for taffy and ordered this f...,1
6,This saltwater taffy had great flavors and was...,1


In [36]:
neg_df.head()

Unnamed: 0,review,sentiment
1,Product arrived labeled as Jumbo Salted Peanut...,0
3,If you are looking for the secret ingredient i...,0
12,My cats have been happily eating Felidae Plati...,0
16,I love eating them and they are good for watch...,0
26,"The candy is just red , No flavor . Just plan...",0


In [37]:
#combining
df=pd.concat([pos_df,neg_df],ignore_index=True)

In [38]:
print(df.shape)
df.head()

(100000, 2)


Unnamed: 0,review,sentiment
0,I have bought several of the Vitality canned d...,1
1,This is a confection that has been around a fe...,1
2,Great taffy at a great price. There was a wid...,1
3,I got a wild hair for taffy and ordered this f...,1
4,This saltwater taffy had great flavors and was...,1


In [39]:
# Shuffle the rows: the frame was built by concatenating all positive reviews
# followed by all negative ones. A fixed random_state keeps the shuffled order
# (and everything derived from it) identical across "Restart & Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.shape)
df.head()

(100000, 2)


Unnamed: 0,review,sentiment
0,Item was yummy. Didn't realize it had MSG hah...,1
1,Never liked regular breakfast bars when I trie...,0
2,My sister in law made biscuits and I couldn't ...,1
3,Some of the Catch was good but about 4 tasted ...,0
4,I took a chance with this product (even though...,0


<h2 style="color:#DFD903 "> <u> Example 3 </u></h2>
Word2Vec embeddings in Gensim

In [40]:
#re_w2v_model = gensim.models.KeyedVectors.load_word2vec_format(r'drive/Colab Notebooks/amazon food reviews/GoogleNews-vectors-negative300.bin', binary=True)

In [42]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [44]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [45]:
# Split every review into sentences, clean each sentence and store it as a
# list of tokens (the input format expected by Word2Vec below).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = []
# Running sentence count. Deliberately not called `sum`: that name would
# shadow the builtin for the rest of the kernel session.
n_sentences = 0
for review in df['review']:
    sents = tokenizer.tokenize(review.strip())
    n_sentences += len(sents)
    # word_tokenize could be used here instead of str.split
    sentences.extend(clean_reviews(sent).split() for sent in sents)
print(n_sentences)
print(len(sentences))

513004
513004


In [46]:
for te in sentences[:5]:
    print(te,"\n")

['item', 'yummy'] 

['realize', 'msg', 'haha', 'kidding', 'product', 'delicious'] 

['mix', 'spoonful', 'special', 'sauce', 'cut', 'firm', 'tofu', 'green', 'onion', 'voila'] 

['never', 'liked', 'regular', 'breakfast', 'bar', 'tried', 'filling', 'sweet'] 

['bar', 'follow', 'footstep', 'regular', 'one'] 



In [49]:
w2v_model=gensim.models.Word2Vec(sentences=sentences, vector_size=300,window=10,min_count=1) # use vector_size, not size!

In [50]:
# NOTE(review): the Word2Vec constructor in the previous cell was already given
# `sentences`; per the gensim documentation that trains the model at
# construction time, so this call presumably adds 10 more epochs on top --
# confirm the extra training is intended.
w2v_model.train(sentences, epochs=10, total_examples=len(sentences))



(38406373, 41193730)

In [51]:
# Embedding of a particular word
w2v_model.wv.get_vector('like')

array([-1.4706005e+00, -1.0472218e+00,  8.2729656e-01,  8.7438500e-01,
        5.0807458e-01, -8.0036056e-01, -2.9483337e-02, -4.9286079e-01,
       -1.1390744e+00, -2.6397204e-01,  1.2314105e+00,  1.0519105e+00,
       -1.2154913e+00,  1.3712431e-01, -6.6436899e-01,  7.7510673e-01,
       -6.6657238e-02,  3.1136429e-01, -5.0796348e-01, -2.6370192e-01,
        1.0823556e-01,  3.5764554e-01,  7.8891683e-01, -9.8900098e-01,
        4.1356605e-01, -1.5508811e+00,  3.3803195e-01,  1.9457597e+00,
        3.3083513e-01,  1.8420376e-01, -1.2674533e+00,  3.1043568e-01,
        4.5664787e-01, -6.1123115e-01,  6.6222721e-01,  1.0805759e+00,
        6.5416545e-01, -9.1645442e-02,  3.3101660e-01, -7.4395925e-01,
       -1.4419241e+00,  1.1908704e+00, -1.4697505e+00, -6.4727849e-01,
        7.3777068e-01,  3.7917390e-01, -3.3586857e-01,  1.0871673e+00,
        8.8693821e-01,  2.3547669e-01,  4.9208358e-01, -2.4307825e-01,
       -4.0062991e-01,  4.0873805e-01,  1.0839374e+00, -2.7673629e-01,
      

In [53]:
""" Store words in a list
N.B.
Do not use "#vocab=w2v_model.wv.vocab"...will lead to AttributeError
The vocab attribute was removed from KeyedVector!
"""
vocab_words = w2v_model.wv.index_to_key
print("The total number of words are : ",len(vocab_words))

The total number of words are :  56379


In [56]:
vocab_words[:15]

['like',
 'taste',
 'product',
 'good',
 'one',
 'flavor',
 'coffee',
 'would',
 'tea',
 'food',
 'great',
 'love',
 'get',
 'dog',
 'really']

In [57]:
# Given a word, get the most similar words
w2v_model.wv.most_similar('like')

[('weird', 0.49776890873908997),
 ('alright', 0.4739651381969452),
 ('strange', 0.47383472323417664),
 ('reminded', 0.47271808981895447),
 ('reminds', 0.4640081524848938),
 ('reminiscent', 0.4527140259742737),
 ('akin', 0.4422779381275177),
 ('odd', 0.4411437511444092),
 ('funny', 0.43418043851852417),
 ('funky', 0.42493459582328796)]

In [58]:
# Similarity between two words
w2v_model.wv.similarity('good','like')

0.3923982

In [65]:
print("The no of words is :",len(vocab_words))
# print(vocab)

The no of words is : 56379


In [62]:
# Map each vocabulary word to its trained embedding vector.
word_vec_dict = {token: w2v_model.wv.get_vector(token) for token in vocab_words}
print("The no of key-value pairs : ",len(word_vec_dict))

The no of key-value pairs :  56379


In [64]:
for word in vocab_words[:5]:
    print(word_vec_dict[word])

[-1.4706005e+00 -1.0472218e+00  8.2729656e-01  8.7438500e-01
  5.0807458e-01 -8.0036056e-01 -2.9483337e-02 -4.9286079e-01
 -1.1390744e+00 -2.6397204e-01  1.2314105e+00  1.0519105e+00
 -1.2154913e+00  1.3712431e-01 -6.6436899e-01  7.7510673e-01
 -6.6657238e-02  3.1136429e-01 -5.0796348e-01 -2.6370192e-01
  1.0823556e-01  3.5764554e-01  7.8891683e-01 -9.8900098e-01
  4.1356605e-01 -1.5508811e+00  3.3803195e-01  1.9457597e+00
  3.3083513e-01  1.8420376e-01 -1.2674533e+00  3.1043568e-01
  4.5664787e-01 -6.1123115e-01  6.6222721e-01  1.0805759e+00
  6.5416545e-01 -9.1645442e-02  3.3101660e-01 -7.4395925e-01
 -1.4419241e+00  1.1908704e+00 -1.4697505e+00 -6.4727849e-01
  7.3777068e-01  3.7917390e-01 -3.3586857e-01  1.0871673e+00
  8.8693821e-01  2.3547669e-01  4.9208358e-01 -2.4307825e-01
 -4.0062991e-01  4.0873805e-01  1.0839374e+00 -2.7673629e-01
 -1.1208154e+00  3.1387535e-01 -2.5282919e-01  1.5104481e+00
  1.5364866e-01  7.5015396e-01 -3.3564457e-01 -6.4486623e-01
  4.7650823e-01 -1.57150

<h3 style="color:#DFD903 "> Preparing data for embedding layer </h3>

In [66]:
# Cleaning reviews.
df['clean_review'] = df['review'].apply(clean_reviews)

In [67]:
""" Find the maximum lenght of any document. Useful for padding
N.B => number of unique words = 56379.
"""
# Token count of the longest cleaned review; -1 if there are no reviews at all.
review_lengths = (len(text.split()) for text in df['clean_review'])
maxi = max(review_lengths, default=-1)
print(maxi)

1564


In [69]:
tok = Tokenizer()
tok.fit_on_texts(df['clean_review'])
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(df['clean_review'])

In [70]:
# Set the max length of a review from the longest cleaned review measured
# earlier (`maxi`), rather than a hard-coded 1565 that silently goes stale
# when the data changes.
# NOTE(review): the +1 reproduces the previous literal (1564 + 1 = 1565);
# confirm the extra slot is intended.
max_rev_len = maxi + 1
vocab_size = len(tok.word_index) + 1
# Choose the embedding's dimension. The one chosen in word2vec constructor
embed_dim = 300

In [71]:
""" Perform Padding to have a maximum length of 1565.
N.B.1
All 100K reviews are padded.
N.B.2
Each review is padded to a length of 1565 tokens.
"""
pad_rev = pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
pad_rev.shape

(100000, 1565)

### => Embedding Matrix

In [72]:
"""Matrix of embeddings.
If a word is in the word2vec vocabulary, its row is filled with the vector learned by the w2v model.
If a word is not found, the row corresponding to that word remains zero.
"""
# One row per tokenizer index, one column per embedding dimension.
embed_matrix = np.zeros((vocab_size, embed_dim))
for token, row_idx in tok.word_index.items():
    vector = word_vec_dict.get(token)
    if vector is None:
        # No word2vec entry for this token: its row stays all zeros.
        continue
    embed_matrix[row_idx] = vector

In [74]:
embed_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.47060049, -1.04722178,  0.82729656, ...,  0.19299293,
        -0.06685455,  0.16921341],
       [-1.05914783, -0.14229192, -1.33568847, ...,  0.96023971,
        -1.100788  , -0.90617663],
       ...,
       [-0.03994604,  0.01014509, -0.00707868, ...,  0.05418423,
         0.04217557,  0.02987739],
       [ 0.01746368,  0.00753622, -0.02344328, ...,  0.06282504,
         0.06530711, -0.02824233],
       [ 0.01330917, -0.02691734, -0.02911944, ..., -0.00500411,
        -0.05216781,  0.10374828]])

In [76]:
subset_embed = [row[:10] for row in embed_matrix[:10]]

for row in subset_embed:
    print(" ".join(map(str, row)))

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-1.4706004858016968 -1.0472217798233032 0.8272965550422668 0.8743849992752075 0.5080745816230774 -0.8003605604171753 -0.029483336955308914 -0.4928607940673828 -1.139074444770813 -0.26397204399108887
-1.059147834777832 -0.14229191839694977 -1.3356884717941284 0.17647606134414673 1.796803593635559 -0.8737126588821411 -0.1881643533706665 0.5457282066345215 -0.23364737629890442 -1.0235844850540161
-0.45311927795410156 -0.6348533630371094 1.0980161428451538 -0.05857057496905327 -0.7094829082489014 -0.5552213191986084 0.679408848285675 -0.5254878401756287 0.4672967493534088 1.0894806385040283
-0.5960692763328552 -1.647149682044983 -0.3672001361846924 0.08059047162532806 -0.21876230835914612 0.24482324719429016 0.04018625244498253 -1.1865520477294922 -1.2335060834884644 -0.9128680229187012
-1.7097810506820679 -0.4381442666053772 0.1419142335653305 -1.008131742477417 0.7726464867591858 0.12066451460123062 -0.14475436508655548 -0.8164974451065063 -0.42607