[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/infinite-Joy/natural_language_processing_for_professionals/blob/master/notebooks/chapter_5_word_embeddings_in_natural_language_processing.ipynb)

## install the required modules

In [None]:
%%capture
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook right after the cell runs.
# NOTE(review): the version is unpinned; pin it once a known-good release is confirmed.
%pip install fasttext

## download the data

In [None]:
# -nc (no-clobber): skip the ~500 MB download when Video_Games.json.gz is
# already present, so re-running the notebook does not fetch a second copy.
# Delete a partially downloaded file by hand to force a fresh download.
!wget -nc https://github.com/infinite-Joy/natural_language_processing_for_professionals/raw/main/data/Video_Games.json.gz

--2023-05-10 09:36:19--  https://github.com/infinite-Joy/natural_language_processing_for_professionals/raw/main/data/Video_Games.json.gz
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/infinite-Joy/natural_language_processing_for_professionals/main/data/Video_Games.json.gz [following]
--2023-05-10 09:36:19--  https://media.githubusercontent.com/media/infinite-Joy/natural_language_processing_for_professionals/main/data/Video_Games.json.gz
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 522823613 (499M) [application/octet-stream]
Saving to: ‘Video_Games.json.gz’


2023-05-10 09:3

In [None]:
import gzip
import json
import pandas as pd
import string
import imblearn

# translation table for str.translate: every character of string.punctuation
# is mapped to a single space
translator = {ord(symbol): ord(' ') for symbol in string.punctuation}

def text_preprocessing(text):
    """
    Normalise a single review string.

    Leading and trailing whitespace is removed, the text is lower-cased and
    every remaining newline is replaced by a full stop.
    """
    return text.strip().lower().replace('\n', '.')

def parse(path):
    """
    Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: location of the gzip file holding one JSON document per line.

    Yields:
        The object produced by ``json.loads`` for each non-blank line.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager closes the file handle as soon as the generator is
    # exhausted or closed, instead of leaving it open until garbage collection.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # Whitespace-only lines (e.g. a trailing empty line) carry no record.
            if l.strip():
                yield json.loads(l)

def getDF(path):
    """
    Read a gzipped JSON-lines file into a DataFrame, one row per record.

    Rows are labelled 0, 1, 2, ... in the order the records are yielded by
    ``parse``; the columns come from the keys of the records.
    """
    records_by_row = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_row, orient='index')

# Build the cleaned review frame in one chain: keep the text and the rating,
# drop rows without text, normalise the text, then remove any remaining
# missing values and exact duplicate rows.
df = (
    getDF('./Video_Games.json.gz')
    .loc[:, ['reviewText', 'overall']]
    .dropna(subset=['reviewText'])
    .assign(reviewText=lambda frame: frame['reviewText'].apply(text_preprocessing))
    .dropna()
    .drop_duplicates()
)
print(df.shape)

(2191356, 2)


In [None]:
# Show ten 1-star reviews whose text contains the substring 'go'.
# The fixed random_state makes the displayed sample repeatable across runs.
df[(df.overall == 1) & (df.reviewText.str.contains('go'))].sample(10, random_state=42)

Unnamed: 0,reviewText,overall
762519,"this product does do as advertised, but the ui...",1.0
188618,"first off garbage is stuff you throw away, and...",1.0
1313145,i got these for my ps4. i plugged them in with...,1.0
2087296,just plugged both controllers into my pc but i...,1.0
2467300,the item is dark yellow (mustard color) in no ...,1.0
1885457,"good game, crappy dlc and updates from massive...",1.0
251610,requires more fiddling/bumping/tweaking of the...,1.0
808708,i got this game because the story line interes...,1.0
18931,ah...the wwii u-boat; it instantly conjures up...,1.0
2244609,some like this game because they know games an...,1.0


## cosine similarity of a scalar

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so each scalar is written as a
# matrix with one row and one column.
A = np.array([[0.5]])
B = np.array([[0.5]])

print(cosine_similarity(A, B))

[[1.]]


## one hot encoding implementation

In [None]:
## define input string
data = 'the quick brown fox jumped over the lazy dog'
consecutive_words = data.split()

## construct the dictionary
# dict.fromkeys removes duplicates while keeping first-appearance order.
# A plain set() of strings can iterate in a different order on every
# interpreter run, which made the word indices (and the printed vectors)
# change from run to run.
all_words = list(dict.fromkeys(consecutive_words))

## define a mapping of word to integers
word_to_int = {w: i for i, w in enumerate(all_words)}
int_to_word = dict(enumerate(all_words))

## integer encode input data
integer_encoded = [word_to_int[w] for w in consecutive_words]

## one hot encode
# one row per input word: a vocabulary-sized list of zeros with a single 1
# at the index of that word
onehot_encoded = [
    [1 if position == value else 0 for position in range(len(all_words))]
    for value in integer_encoded
]

def argmax(vector):
    """
    Return the index of the largest element of ``vector``.

    For a one-hot list this is the position of the single 1. When several
    elements share the maximum, the first such index is returned.

    Raises:
        ValueError: if ``vector`` is empty.
    """
    # Comparing by value rather than searching for a literal 1 keeps the
    # result unchanged for one-hot input and also covers other score vectors.
    return max(range(len(vector)), key=vector.__getitem__)

# Decode every one-hot vector back to its word and print the pair.
row_template = 'word={word},\t vec={vec}'
for vec in onehot_encoded:
    print(row_template.format(word=int_to_word[argmax(vec)], vec=vec))

word=the,	 vec=[0, 0, 0, 0, 1, 0, 0, 0]
word=quick,	 vec=[0, 0, 0, 1, 0, 0, 0, 0]
word=brown,	 vec=[0, 0, 1, 0, 0, 0, 0, 0]
word=fox,	 vec=[0, 1, 0, 0, 0, 0, 0, 0]
word=jumped,	 vec=[1, 0, 0, 0, 0, 0, 0, 0]
word=over,	 vec=[0, 0, 0, 0, 0, 0, 0, 1]
word=the,	 vec=[0, 0, 0, 0, 1, 0, 0, 0]
word=lazy,	 vec=[0, 0, 0, 0, 0, 1, 0, 0]
word=dog,	 vec=[0, 0, 0, 0, 0, 0, 1, 0]


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Wrap the first two one-hot vectors as single-row matrices, the 2-D shape
# cosine_similarity works on.
A = np.array([onehot_encoded[0]])
B = np.array([onehot_encoded[1]])

print(cosine_similarity(A, B))

[[0.]]


## fastText Vectors

fastText website: https://fasttext.cc/

In [None]:
# -nc (no-clobber): do not download the ~9.6 GB archive again when
# wiki.en.zip already exists.
!wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
# -n: never overwrite files that are already extracted, so a re-run does not
# stop at unzip's interactive "replace?" prompt.
!unzip -n wiki.en.zip

--2023-05-10 09:38:24--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.82, 13.35.7.128, 13.35.7.50, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.82|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10356881291 (9.6G) [application/zip]
Saving to: ‘wiki.en.zip’


2023-05-10 09:43:30 (32.4 MB/s) - ‘wiki.en.zip’ saved [10356881291/10356881291]

Archive:  wiki.en.zip
  inflating: wiki.en.vec             
  inflating: wiki.en.bin             


In [None]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## load the pretrained model from the local binary file
ft = fasttext.load_model('wiki.en.bin')

## look up both words and shape each embedding as a single-row matrix
vector, matrix = (
    ft.get_word_vector(word).reshape(1, -1) for word in ('vector', 'matrix')
)

## compute and report the similarity
print('similarity:', cosine_similarity(vector, matrix))



similarity: [[0.56451917]]


## GloVe Embeddings

GloVe website: https://nlp.stanford.edu/projects/glove/

In [None]:
# -nc (no-clobber): do not download the ~2 GB archive again when
# glove.840B.300d.zip already exists.
!wget -nc http://nlp.stanford.edu/data/glove.840B.300d.zip
# -n: never overwrite files that are already extracted, so a re-run does not
# stop at unzip's interactive "replace?" prompt.
!unzip -n glove.840B.300d.zip

--2023-05-11 05:58:34--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2023-05-11 05:58:34--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2023-05-11 05:58:34--  https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/

In [None]:
import numpy as np
from tqdm import tqdm

class GloveModel:
    """
    In-memory lookup table for pretrained GloVe word vectors.

    The vectors are read from a GloVe text file (one token followed by its
    vector components on each line) into the dict ``self.wv``. One extra
    entry, stored under the ``oov`` key, holds the mean of all loaded vectors
    and is returned for words that are not in the table.
    """

    def __init__(self, filename, oov='__oov__', dim=300):
        """
        Load ``filename`` and register the out-of-vocabulary vector.

        Args:
            filename: path of the GloVe text file.
            oov: key under which the out-of-vocabulary vector is stored.
            dim: number of components expected per vector.
        """
        self.oov = oov
        self.dim = dim
        self.wv = self.load_glove_model(filename, self.oov, dim)

    def load_glove_model(self, File, oov, dim=300):
        """
        Read a GloVe text file into a ``{token: float32 vector}`` dict.

        Args:
            File: path of the GloVe text file (read as UTF-8).
            oov: key for the out-of-vocabulary vector; must not be a token
                of the file.
            dim: number of components expected per vector.

        Returns:
            The dict of loaded vectors plus one entry under ``oov`` holding
            their mean (a zero vector when nothing could be loaded).

        Raises:
            AssertionError: if ``oov`` is itself a token of the file.
        """
        print("Loading Glove Model")
        glove_model = {}
        skipped = 0

        # First pass only counts the lines so the progress bar has a total.
        with open(File, 'r', encoding='utf-8') as f:
            num_lines = sum(1 for _ in f)

        with open(File, 'r', encoding='utf-8') as f:
            for line in tqdm(f, total=num_lines):
                # Split from the right: the last `dim` fields are the numbers
                # and whatever is left is the token, even when the token
                # itself contains whitespace (e.g. ". . .").
                fields = line.strip().rsplit(None, dim)
                if len(fields) != dim + 1:
                    # blank line or wrong number of components
                    skipped += 1
                    continue
                try:
                    embedding = np.array(fields[1:], dtype=np.float32)
                except ValueError:
                    # a component is not a number
                    skipped += 1
                    continue
                glove_model[fields[0]] = embedding
        print(f"{len(glove_model)} words loaded!")
        if skipped:
            print(f"{skipped} malformed lines skipped")

        # add out of vocabulary to the dict
        # taking the average as per here
        # https://stackoverflow.com/questions/49239941/what-is-unk-in-the-pretrained-glove-vector-files-e-g-glove-6b-50d-txt
        assert oov not in glove_model
        if glove_model:
            vecs = np.array(list(glove_model.values()))
            glove_model[oov] = np.mean(vecs, axis=0)
        else:
            # No vectors to average; fall back to zeros instead of NaN.
            glove_model[oov] = np.zeros(dim, dtype=np.float32)

        return glove_model

    def get_vector(self, word):
        """Return the vector stored for ``word``, or the oov vector if it is unknown."""
        return self.wv.get(word, self.wv[self.oov])

# Build the lookup table used by the cells below; the path is relative, so
# glove.840B.300d.txt must be in the current working directory.
glove_model = GloveModel('glove.840B.300d.txt')

Loading Glove Model


100%|██████████| 2196017/2196017 [03:05<00:00, 11836.07it/s]


2195875 words loaded!


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## look up both words and shape each GloVe embedding as a single-row matrix
vector, matrix = (
    glove_model.get_vector(word).reshape(1, -1) for word in ('vector', 'matrix')
)

## compute and report the similarities.
print('similarity:', cosine_similarity(vector, matrix))

similarity: [[0.47342833]]
