In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
amz_rev = pd.read_csv('amazon_reviews.csv')

In [3]:
from nltk.tokenize import RegexpTokenizer
def setup_data(corpus):
    print("Cleaning data ... ")
    cleaned_corpus = []
    count = 0
    for doc in corpus:
        tokenizer = RegexpTokenizer(r'\w+')
        cleaned_corpus.append(tokenizer.tokenize(doc.lower()))
        count = count + 1
    print(f"Done cleaning {count} data")
    return cleaned_corpus

In [4]:
amz_rev['reviewText'].fillna('NA', inplace=True)

In [5]:
X = setup_data(amz_rev['reviewText'])

Cleaning data ... 
Done cleaning 4915 data


In [6]:
def convert_binaries(corpus):
    y = []
    print("Converting to binaries ... ")
    for k in corpus:
        if k > 3:
            y.append(1)
        else:
            y.append(0)
    return y

In [7]:
y = convert_binaries(amz_rev['overall'].tolist())

Converting to binaries ... 


In [8]:
def get_vocab(corpus):
    vocab = []
    vocab = [x for line in corpus for x in line]
    vocab = list(set(vocab))
    vocab = sorted(vocab)
    return vocab


In [9]:
MAX_SEQUENCE_LENGTH = 100
VOCAB_SIZE = len(get_vocab(X))

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def get_sequences(corpus):
    tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    return padded_sequences

In [11]:
X_seq = get_sequences(X)

In [12]:
from sklearn.model_selection import train_test_split
def train_test_val_split(X,y, test_size=0.1, val_size = 0.1,random_state=16):
    total_test_size = test_size + val_size
    X_train, X_t, y_train, y_t = train_test_split(X, y, test_size=total_test_size, random_state=random_state)
    X_val, X_test, y_val, y_test = train_test_split(X_t, y_t, test_size=test_size/total_test_size, random_state=random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [13]:
X_train, X_val, X_test, y_train, y_val, y_test = train_test_val_split(X_seq,y)

In [14]:
y_test = np.array(y_test)
y_train = np.array(y_train)
y_val = np.array(y_val)

In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, MaxPooling1D, Conv1D
from tensorflow.keras.callbacks import EarlyStopping

In [16]:
model = Sequential()

model.add(Embedding(VOCAB_SIZE, output_dim = 128, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(128,return_sequences=True))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1090048   
                                                                 
 lstm (LSTM)                 (None, 100, 128)          131584    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1271105 (4.85 MB)
Trainable params: 1271105 (4.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
earlystopping = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3,min_delta = 0.008)

In [20]:
model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=100,batch_size=128,callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: early stopping


<keras.src.callbacks.History at 0x23e5df02790>

In [None]:
model.evaluate(X_test,y_test)



[0.30472633242607117, 0.9044715166091919]

In [None]:
import gensim.downloader as api
def load_embedding_model():
    wv_from_bin = api.load("glove-wiki-gigaword-200")
    print("Loaded vocab size %i" % len(list(wv_from_bin.index_to_key)))
    return wv_from_bin
wv_from_bin = load_embedding_model()

Loaded vocab size 400000


In [None]:
import random
def get_matrix_of_vectors(wv_from_bin, required_words):

    words = list(wv_from_bin.index_to_key)
    print("Shuffling words ...")
    random.seed(225)
    random.shuffle(words)
    words = words[:10000]
    print("Putting %i words into word2ind and matrix M..." % len(words))
    word2ind = {}
    M = []
    curInd = 0
    for w in words:
        try:
            M.append(wv_from_bin.get_vector(w))
            word2ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    for w in required_words:
        if w in words:
            continue
        try:
            M.append(wv_from_bin.get_vector(w))
            word2ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    M = np.stack(M)
    print("Done.")
    return M, word2ind

In [None]:
from sklearn.decomposition import TruncatedSVD
def reduce_to_k_dim(M,k=2):
    print(f"Reducing to {k} dimensions...")
    svd = TruncatedSVD(n_components=k, n_iter=10, random_state=16)
    M_reduced = svd.fit_transform(M)
    return M_reduced

In [None]:
M2,word2index2 = get_matrix_of_vectors(wv_from_bin, get_vocab(X))

Shuffling words ...
Putting 10000 words into word2ind and matrix M...
Done.


In [None]:
M_reduced = reduce_to_k_dim(M2, k=128)

Reducing to 128 dimensions...


In [None]:
embedding_dim = M_reduced.shape[1]
BATCH_SIZE = 20

In [None]:
model1 = Sequential()
model1.add(Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'))
model1.add(Embedding(input_dim = M_reduced.shape[0],output_dim = M_reduced.shape[1],weights = [M_reduced],trainable=False))
model1.add(Conv1D(128, 5, activation='relu'))
model1.add(MaxPooling1D(5))
model1.add(Dense(1))
model1.compile(loss = 'categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [None]:
model1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 128)          2157696   
                                                                 
 conv1d_2 (Conv1D)           (None, 96, 128)           82048     
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 19, 128)           0         
 g1D)                                                            
                                                                 
 dense_2 (Dense)             (None, 19, 1)             129       
                                                                 
Total params: 2239873 (8.54 MB)
Trainable params: 82177 (321.00 KB)
Non-trainable params: 2157696 (8.23 MB)
_________________________________________________________________


In [None]:
model1.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=BATCH_SIZE, callbacks=[earlystopping])

Epoch 1/10


  return dispatch_target(*args, **kwargs)


ValueError: in user code:

    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\engine\training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\engine\training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\kilan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\keras\src\backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None,) and (None, 19, 1) are incompatible


In [None]:
M_reduced.shape

(16857, 128)