## Load the data and run the preprocessing

In [None]:
import pandas as pd
import re
import string
import imblearn


# Translation table for str.translate: every ASCII punctuation character
# (by code point) is mapped to the code point of a single space.
translator = {ord(mark): ord(' ') for mark in string.punctuation}

def text_preprocessing(text):
    """
    Normalize a raw review string.

    Leading/trailing whitespace is removed, the text is lower-cased, and
    every remaining newline is replaced by a period so that line breaks
    read as sentence boundaries.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Order matters: stripping first means trailing newlines are dropped
    # rather than turned into trailing periods.
    return text.strip().lower().replace('\n', '.')


# Load the reviews (one JSON object per line), keep only the review text and
# the star rating, clean the text, and drop incomplete or duplicated rows.
# Built as a single method chain so that no column is ever assigned on a
# filtered slice (which is what triggers pandas' SettingWithCopyWarning).
df = (
    pd.read_json("/app/Video_Games_5.json", lines=True)
    .loc[:, ['reviewText', 'overall']]
    # text_preprocessing calls str methods, so missing reviews must go first
    .dropna(subset=['reviewText'])
    .assign(reviewText=lambda frame: frame['reviewText'].apply(text_preprocessing))
    .dropna()
    .drop_duplicates()
)
print(df.shape)

In [None]:
# Show up to 10 one-star reviews whose text contains the substring 'go'.
# - regex=False: 'go' is meant as a literal substring, not a pattern.
# - n is capped by the number of matches so the cell cannot raise when
#   fewer than 10 rows qualify.
# - random_state pins the sample so a re-run shows the same rows.
one_star_go = df[(df['overall'] == 1) & df['reviewText'].str.contains('go', regex=False)]
one_star_go.sample(n=min(10, len(one_star_go)), random_state=0)

## cosine similarity of a scalar

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Two 1x1 matrices: each holds one "vector" with a single component.
# With only one component, cosine similarity can only reflect the sign,
# so two positive scalars always give 1.0 regardless of magnitude.
A = np.array([[0.5]])
B = np.array([[0.5]])

similarity = cosine_similarity(A, B)
print(similarity)

## One-hot encoding implementation

In [None]:
## define input string
data = 'the quick brown fox jumped over the lazy dog'
consecutive_words = data.split()

## construct the dictionary
## sorted() makes the vocabulary order (and therefore every index below)
## deterministic; a bare set iterates strings in a hash-dependent order that
## can change from one kernel start to the next.
all_words = sorted(set(consecutive_words))

## define a mapping of word to integers (and the inverse)
word_to_int = {word: index for index, word in enumerate(all_words)}
int_to_word = {index: word for index, word in enumerate(all_words)}

## integer encode input data
integer_encoded = [word_to_int[word] for word in consecutive_words]

## one hot encode: one row per input word, with a single 1 at that word's index
vocab_size = len(all_words)
onehot_encoded = [
    [1 if position == index else 0 for position in range(vocab_size)]
    for index in integer_encoded
]

def argmax(vector):
    """
    Return the index of the largest element of ``vector``.

    For a one-hot vector this is the position of the single 1. Unlike a
    plain ``vector.index(1)`` lookup, it also gives the right answer for
    vectors that are not one-hot (e.g. scores or probabilities).

    Parameters
    ----------
    vector : sequence
        An indexable, non-empty sequence of comparable values.

    Returns
    -------
    int
        Index of the maximum value; on ties, the first such index.

    Raises
    ------
    ValueError
        If ``vector`` is empty.
    """
    # max() returns the first maximal item, so ties resolve to the lowest index.
    return max(range(len(vector)), key=vector.__getitem__)

# Decode each one-hot row back to its word and print the pair.
for vec in onehot_encoded:
    decoded = int_to_word[argmax(vec)]
    print('word={word},\t vec={vec}'.format(vec=vec, word=decoded))

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Compare the one-hot rows of the first two input words.
# cosine_similarity expects 2-D input, so each row becomes a 1 x vocab matrix.
first_row, second_row = onehot_encoded[0], onehot_encoded[1]
A = np.reshape(first_row, (1, -1))
B = np.reshape(second_row, (1, -1))

print(cosine_similarity(A, B))

## Fasttext Vectors

fasttext website: https://fasttext.cc/

The vectors were downloaded from the website ahead of time and are stored in the image. If you are running this code locally, you can download them from https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip

In [None]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## load the pretrained fastText model from disk
ft = fasttext.load_model('/app/wiki.en.bin')

## look up each word's vector and reshape it to a single row,
## because cosine_similarity expects 2-D input
vector = np.reshape(ft.get_word_vector('vector'), (1, -1))
matrix = np.reshape(ft.get_word_vector('matrix'), (1, -1))

## compute and report the similarity
similarity = cosine_similarity(vector, matrix)
print('similarity:', similarity)

## Glove Embeddings

GloVe website: https://nlp.stanford.edu/projects/glove/

The vectors were downloaded from the website ahead of time and are stored in the image. If you are running this code locally, you can download them from http://nlp.stanford.edu/data/glove.840B.300d.zip

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

# Input: the raw GloVe text file. Output: a word2vec-format copy written to
# the current working directory.
glove_path = "/app/glove.840B.300d.txt"
word2vec_path = "gensim_glove_vectors.txt"

# Convert the GloVe file, then load the converted text file as KeyedVectors.
glove2word2vec(glove_input_file=glove_path, word2vec_output_file=word2vec_path)

glove_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## get the glove vectors
## glove_model is already a KeyedVectors object, so the lookup is called on it
## directly: `.wv` on KeyedVectors was only a deprecated self-alias in
## gensim 3.x and no longer exists in gensim 4 (AttributeError).
## Each vector is reshaped to one row because cosine_similarity expects 2-D input.
vector = glove_model.get_vector('vector').reshape(1, -1)
matrix = glove_model.get_vector('matrix').reshape(1, -1)

## compute and report the similarities.
print('similarity:', cosine_similarity(vector, matrix))