In [1]:
import os
for dirname, _, filenames in os.walk('../data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


../data/train_gr/game_overview.csv
../data/train_gr/train.csv
../data/test_gr/test.csv


In [2]:
import pandas as pd

train_set = pd.read_csv("../data/train_gr/train.csv")
test_set = pd.read_csv('../data/test_gr/test.csv')
game_ov = pd.read_csv('../data/train_gr/game_overview.csv')

In [3]:
# Dropping unrelevant features
x = train_set["user_review"]
y = train_set["user_suggestion"]

In [133]:
import re

def cleanTexts(texts):
    cleaned = []
    pattern = "[^a-zA-Z0-9]"
    for text in texts:
        clrd = re.sub(pattern," ",text).lower().strip()
        cleaned.append(clrd)
    return cleaned

In [134]:
x_cleaned = cleanTexts(x)
x_cleaned[0]

'i m scared and hearing creepy voices   so i ll pause for a moment and write a review while i wait for my heart beat to return to atleast somewhat calmer times   this game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood  but more bubble and  clean     hello 1990 s what charactes there are  that isnot trying to kill me  were likable and a bit odd   i did do a few noob things though  such as oh look a class room full of ghosts from dead children  lets shine my flashlight on them and stand there staring at them  or  hmm creepy music  i ll turn around and see if i can see what s chasing me never before in a game have i been this afraid of finding a locked door'

In [135]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.compat.v1.keras.layers import GRU


In [136]:
# Tokenizer 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_cleaned)
x_tokens = tokenizer.texts_to_sequences(x_cleaned)

In [137]:
print(x_tokens[0])

[7, 127, 2012, 3, 3558, 3152, 3257, 29, 7, 151, 5595, 14, 4, 819, 3, 1082, 4, 182, 120, 7, 313, 14, 36, 1472, 1021, 2, 1765, 2, 1534, 1030, 13859, 213, 11, 5, 10, 3559, 3, 3152, 27, 36, 922, 610, 184, 15, 16, 1, 171, 16827, 9, 36, 3685, 15, 38, 10871, 3, 2579, 2923, 12072, 19, 63, 23043, 37, 17, 13, 23044, 267, 2, 244, 56, 179, 13860, 3, 4, 206, 2090, 7, 228, 57, 4, 137, 1766, 125, 189, 239, 26, 339, 232, 4, 299, 624, 281, 9, 5147, 44, 781, 2266, 846, 3616, 36, 3856, 20, 88, 3, 1045, 37, 5293, 39, 88, 33, 5596, 3152, 1001, 7, 151, 378, 175, 3, 119, 24, 7, 22, 119, 63, 19, 3506, 56, 173, 148, 12, 4, 5, 18, 7, 108, 11, 2612, 9, 1441, 4, 1138, 1875]


In [138]:
print(x_cleaned[0])

i m scared and hearing creepy voices   so i ll pause for a moment and write a review while i wait for my heart beat to return to atleast somewhat calmer times   this game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood  but more bubble and  clean     hello 1990 s what charactes there are  that isnot trying to kill me  were likable and a bit odd   i did do a few noob things though  such as oh look a class room full of ghosts from dead children  lets shine my flashlight on them and stand there staring at them  or  hmm creepy music  i ll turn around and see if i can see what s chasing me never before in a game have i been this afraid of finding a locked door


In [139]:
import numpy as np

len_arr = [len(s) for s in x_tokens]
MAX_LEN = int(np.percentile(len_arr,.75))

In [140]:
import json
with open("maxlen.json",mode="w") as F:
    json.dump({"maxlen":MAX_LEN},F)

In [141]:
x_tokens_pad = pad_sequences(x_tokens,maxlen=MAX_LEN)
x_tokens_pad.shape

(17494, 21)

In [142]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_tokens_pad, np.asarray(y), test_size=0.2, random_state=42)

In [14]:
word2vec = {} # Trained glove model 
with open("../embeddings/glove.6B.50d.txt", encoding="UTF-8") as f:
    for line in f:
        values = line.split() 
        word = values[0]
        vec = np.asarray(values[1:],dtype="float32")
        word2vec[word] = vec

In [18]:
# initializing as uniform
VOCAB_SIZE = len(tokenizer.word_index) + 1
embedding_matrix = np.random.uniform(-1,1,(VOCAB_SIZE,50))

for word,i in tokenizer.word_index.items():
    if i < VOCAB_SIZE: 
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [86]:
# Each world will be 100D vector.
VECTOR_SIZE = 50

def buildModel(MAX_LEN,embedding_weights=None):
    
    model = keras.Sequential()
    if embedding_weights is not None:
        model.add(layers.Embedding(input_dim=VOCAB_SIZE,
                                   output_dim=VECTOR_SIZE,
                                   input_length=MAX_LEN,
                                   weights=[embedding_weights],
                                   trainable=True
                              ))
        
    else:
        model.add(layers.Embedding(input_dim=VOCAB_SIZE,
                                   output_dim=VECTOR_SIZE,
                                   input_length=MAX_LEN
                                  ))
    
    model.add(GRU(10,return_sequences=False))
    model.add(layers.Dense(1,activation="sigmoid"))
    
    model.compile(optimizer='Adam',
              loss='mse',
              metrics=[keras.metrics.BinaryAccuracy(name="binary_accuracy", dtype=None, threshold=0.5)])
    return model

In [87]:
model = buildModel(MAX_LEN,embedding_matrix)
model.summary()


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 21, 50)            2596450   
_________________________________________________________________
gru_18 (GRU)                 (None, 10)                1830      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 2,598,291
Trainable params: 2,598,291
Non-trainable params: 0
_________________________________________________________________


In [89]:
model.fit(x_train, y_train, epochs=10, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f86d9cb6700>

In [90]:
from sklearn.metrics import confusion_matrix,accuracy_score

_, acc = model.evaluate(x_test, y_test)
print(acc)

0.7199199795722961


In [91]:
model.save('embedding_sentiments')

2021-10-17 16:49:51.029541: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: embedding_sentiments/assets


In [145]:
feature_extractor = keras.Model(
    inputs=model.inputs,
    outputs=[layer.output for layer in model.layers],
)
print(x[0])
print(x_cleaned[0]) 
print(x_tokens[0])
print(pad_sequences([x_tokens[0]],maxlen=MAX_LEN))
feature_extractor(pad_sequences([x_tokens[0]],maxlen=MAX_LEN).reshape(1,21))[1] # requested embbedding

I'm scared and hearing creepy voices.  So I'll pause for a moment and write a review while I wait for my heart beat to return to atleast somewhat calmer times.  This game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood (but more bubble and 'clean').  Hello 1990's.What charactes there are (that isnot trying to kill me) were likable and a bit odd.  I did do a few noob things though, such as:Oh look a class room full of ghosts from dead children, lets shine my flashlight on them and stand there staring at them..Or, hmm creepy music, I'll turn around and see if I can see what's chasing me.Never before in a game have I been this afraid of finding a locked door.
i m scared and hearing creepy voices   so i ll pause for a moment and write a review while i wait for my heart beat to return to atleast somewhat calmer times   this game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood  but more bubble and  cl

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[-0.3324464 , -0.966098  ,  0.93294835,  0.3458986 ,  0.1472227 ,
        -0.5522215 ,  0.6037281 , -0.8544234 , -0.38403505,  0.3401982 ]],
      dtype=float32)>

In [146]:
import json

with open("../../gre-scraper/data/reviews.json", 'r') as f:
    reviews = json.loads(f.read())

In [147]:
scraped_reviews_cleaned = {}
for idx in reviews.keys():
    scraped_reviews_cleaned[idx] = []
    for review in reviews[idx]:
        scraped_reviews_cleaned[idx].append(cleanTexts([review["Review"]]))

In [148]:
x_tokens_scraped = tokenizer.texts_to_sequences(scraped_reviews_cleaned)

In [None]:
x_tokens_scraped

In [149]:
x_tokens_scraped

[[92],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [15500],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [10278],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [15717],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [2526],
 [3596],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [7323],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [10641],
 [],
 [],
 [],
 [],
 [35630],
 [],
 [],
 [1334],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [10048],
 [],
 [],
 [],
 [],
 [