## install the required modules

In [1]:
%%capture
!pip install fasttext

## download the data

In [2]:
import urllib.request as req
from urllib.parse import urlparse
import os
import progressbar
import zipfile
import gzip
import shutil
import json
import pandas as pd
import re
import string
import imblearn

pbar = None


def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

def wget(url):
    a = urlparse(url)
    filename = os.path.basename(a.path)
    if not os.path.isfile(filename):
        req.urlretrieve(url, filename, show_progress)
        print(f'downloaded to {filename}')
    else:
        print(f'file {filename} has already been downloaded')
    return filename

def unzip(filename, directory_to_extract_to=os.getcwd()):
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(directory_to_extract_to)
        print(f'extraction done {zip_ref.namelist()}')

def gunzip(gzfile, fout):
    with gzip.open(gzfile, 'rb') as f_in:
        with open(fout, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f'{gzfile} extracted to {fout}')


def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    df = {}
    i = 0
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')


# map punctuation to space
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 

def text_preprocessing(text):
    """
    Preprocess the text for better understanding
    
    """
    text = text.strip()
    text = text.lower()
    text = text.replace('\n', '.')
    return text


Video_Games_5 = wget('http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz')
df = pd.read_json("./Video_Games_5.json.gz", lines=True, compression='gzip')
df = df[['reviewText', 'overall']]
df = df[df['reviewText'].notnull()]
df['reviewText'] = df['reviewText'].apply(text_preprocessing)
df = df.dropna()
df = df.drop_duplicates()
print(df.shape)

100% (154050105 of 154050105) |##########| Elapsed Time: 0:00:04 Time:  0:00:04


downloaded to Video_Games_5.json.gz
(400985, 2)


In [3]:
df[(df.overall==1) & (df.reviewText.str.contains('go'))].sample(10)

Unnamed: 0,reviewText,overall
137776,i am asking myself if the positive-reviewers h...,1
285985,waste of money. the core of the game is basic...,1
217326,this was a waste of money imo. i got this for ...,1
243977,i wish i can give this game zero star. i gave ...,1
270259,this game doesn't even come close to replicati...,1
463555,it's not the storyline or the puzzles that i h...,1
138955,i played diablo 3 for about a month. i hated i...,1
100686,sounds like a lot of other people have had goo...,1
210223,not going to write a whole paragraph on this p...,1
303344,before anyone jumps to the conclusion that i a...,1


## cosine similarity of a scalar

In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

A = np.array([0.5]).reshape(1, -1)
B = np.array([0.5]).reshape(1, -1)

print(cosine_similarity(A, B))

[[1.]]


## one hot encoding implementation

In [5]:
## define input string
data = 'the quick brown fox jumped over the lazy dog'
consecutive_words = data.split()

## construct the dictionary
all_words = list(set(consecutive_words))

## define a mapping of word to integers
word_to_int = dict((w, i) for i, w in enumerate(all_words))
int_to_word = dict((i, w) for i, w in enumerate(all_words))

## integer encode input data
integer_encoded = [word_to_int[w] for w in consecutive_words]

## one hot encode
onehot_encoded = list()
for value in integer_encoded:
  letter = [0 for _ in range(len(all_words))]
  letter[value] = 1
  onehot_encoded.append(letter)

def argmax(vector):
  # since vector is actually a list and its one hot encoding hence the
  # maximum value is always 1
  return vector.index(1)

for vec in onehot_encoded:
    print('word={word},\t vec={vec}'.format(word=int_to_word[argmax(vec)], vec=vec))

word=the,	 vec=[0, 0, 0, 0, 0, 1, 0, 0]
word=quick,	 vec=[0, 0, 0, 1, 0, 0, 0, 0]
word=brown,	 vec=[1, 0, 0, 0, 0, 0, 0, 0]
word=fox,	 vec=[0, 0, 0, 0, 1, 0, 0, 0]
word=jumped,	 vec=[0, 0, 0, 0, 0, 0, 1, 0]
word=over,	 vec=[0, 1, 0, 0, 0, 0, 0, 0]
word=the,	 vec=[0, 0, 0, 0, 0, 1, 0, 0]
word=lazy,	 vec=[0, 0, 0, 0, 0, 0, 0, 1]
word=dog,	 vec=[0, 0, 1, 0, 0, 0, 0, 0]


In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

A = np.array(onehot_encoded[0]).reshape(1, -1)
B = np.array(onehot_encoded[1]).reshape(1, -1)

print(cosine_similarity(A, B))

[[0.]]


## Fasttext Vectors

fasttext website: https://fasttext.cc/

In [7]:
wiki_en = wget('https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip')
unzip(wiki_en)

100% (10356881291 of 10356881291) |######| Elapsed Time: 0:04:01 Time:  0:04:01


downloaded to wiki.en.zip
extraction done ['wiki.en.vec', 'wiki.en.bin']


In [None]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## load the model
ft = fasttext.load_model('wiki.en.bin')

## get the word vectors
vector = ft.get_word_vector('vector').reshape(1, -1)
matrix = ft.get_word_vector('matrix').reshape(1, -1)

## compute and report the similarity
print('similarity:', cosine_similarity(vector, matrix))



similarity: [[0.5645191]]


## Glove Embeddings

GloVe website: https://nlp.stanford.edu/projects/glove/

In [10]:
glove = wget('http://nlp.stanford.edu/data/glove.840B.300d.zip')
unzip(glove)

100% (2176768927 of 2176768927) |########| Elapsed Time: 0:06:50 Time:  0:06:50


downloaded to glove.840B.300d.zip
extraction done ['glove.840B.300d.txt']


In [3]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

glove2word2vec(
    glove_input_file="glove.840B.300d.txt",
    word2vec_output_file="gensim_glove_vectors.txt")

glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## get the glove vector
vector = glove_model.wv.get_vector('vector').reshape(1, -1)
matrix = glove_model.wv.get_vector('matrix').reshape(1, -1)

## compute and report the similarities.
print('similarity:', cosine_similarity(vector, matrix))

similarity: [[0.47342822]]


  """
  
