In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
import numpy as np
import pandas as pd

In [8]:
dataset = pd.read_csv("D:\Data Science\Datasets\combined_data.csv")

In [9]:
sentences = dataset['text'].tolist()
labels = dataset['sentiment'].tolist()

In [10]:
training_size = int(len(sentences) *0.8)

train_sentences = sentences[0:training_size]
train_labels = labels[0:training_size]
test_sentences = sentences[training_size:]
test_labels = labels[training_size:]

train_labels_fin = np.array(train_labels)
test_labels_fin = np.array(test_labels)

In [14]:
vocab_size = 1000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [29]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(train_sentences)

padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type, 
                       truncating=trunc_type)

In [28]:
testing_sequences = tokenizer.texts_to_sequences(test_sentences)

testing_padded = pad_sequences(testing_sequences,maxlen=max_length, 
                               padding=padding_type, truncating=trunc_type)

In [33]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(padded[1]))
print(train_sentences[1])

good case excellent value ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
Good case Excellent value.


In [35]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation = 'relu'),
    tf.keras.layers.Dense(1)
])

In [36]:
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           16000     
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 6)                 9606      
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 25613 (100.05 KB)
Trainable params: 25613 (100.05 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
num_epochs = 10
model.fit(padded, train_labels_fin, epochs = num_epochs, validation_data = (testing_padded, test_labels_fin))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x29a35cb2b20>

In [40]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(1000, 16)


In [43]:
import io

out_v = io.open('vecs.tsv', 'w', encoding = 'utf-8')
out_m = io.open('meta.tsv', 'w', encoding = 'utf-8')
for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')
out_v.close()
out_m.close()

In [47]:
from IPython.display import FileLink
import os

vecs_file = 'vecs.tsv'
meta_file = 'meta.tsv'

if os.path.exists(vecs_file):
    display(FileLink(vecs_file))
else:
    print(f"{vecs_file} not found.")

if os.path.exists(meta_file):
    display(FileLink(meta_file))
else:
    print(f"{meta_file} not found.")

In [48]:
fake_reviews =  ['I love this phone', 'I hate spaghetti', 
                'Everything was cold',
                'Everything was hot exactly as I wanted', 
                'Everything was green', 
                'the host seated us immediately',
                'they gave us free chocolate cake', 
                'not sure about the wilted flowers on the table',
                'only works when I stand on tippy toes', 
                'does not work when I stand on my head']

print(fake_reviews)

['I love this phone', 'I hate spaghetti', 'Everything was cold', 'Everything was hot exactly as I wanted', 'Everything was green', 'the host seated us immediately', 'they gave us free chocolate cake', 'not sure about the wilted flowers on the table', 'only works when I stand on tippy toes', 'does not work when I stand on my head']


In [54]:
padding_type = 'post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fake_padded = pad_sequences(sample_sequences, padding = padding_type, maxlen = max_length)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')
print("Value close to 1 - Positive, \nValue close to 0 - negative, \n0.5 - ambiguous\n")

classes = model.predict(fake_padded)

for x in range(len(fake_reviews)):
    print(fake_reviews[x])
    print(classes[x])
    print('\n')


HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

Value close to 1 - Positive, 
Value close to 0 - negative, 
0.5 - ambiguous

I love this phone
[0.979014]


I hate spaghetti
[0.3179871]


Everything was cold
[0.5182383]


Everything was hot exactly as I wanted
[0.5766645]


Everything was green
[0.6091789]


the host seated us immediately
[0.5804747]


they gave us free chocolate cake
[0.69852597]


not sure about the wilted flowers on the table
[0.22057636]


only works when I stand on tippy toes
[0.6093327]


does not work when I stand on my head
[-0.0464014]




In [65]:
print("Adding not to a good review makes the model to predict its negative. Lets see!")
fakest_review = ['i did not expect that to happen. a cake in my 69th bday?!']

Adding not to a good review makes the model to predict its negative. Lets see!


In [66]:
sample = tokenizer.texts_to_sequences(fakest_review)
fake_pad = pad_sequences(sample, padding = 'post', maxlen = max_length)

In [67]:
classes = model.predict(fake_pad)



In [69]:
print(fakest_review[0])
print(classes[0])

if classes[0] < 0.5:
    print("thats bad")
else:
    print("its alright!")

i did not expect that to happen. a cake in my 69th bday?!
[0.13752355]
thats bad
