In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder
import random
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# nltk.download('punkt')

# Sample sentiment dataset
positive_sentences = [
    "I love this movie",
    "This film is great",
    "Amazing experience",
    "I am very happy with the product",
    "The book is fantastic"
]

negative_sentences = [
    "I hate this movie",
    "This film is terrible",
    "Horrible experience",
    "I am very unhappy with the product",
    "The book is awful"
]

sentences = positive_sentences + negative_sentences
labels = [1] * len(positive_sentences) + [0] * len(negative_sentences)

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Build the vocabulary
vocab = set(word for sentence in tokenized_sentences for word in sentence)
vocab_size = len(vocab)

# Create word_to_index and index_to_word mappings
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Prepare training data for Word2Vec (CBOW)
def generate_training_data(tokenized_sentences, word_to_index, window_size=2):
    data = []
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):
            target = word_to_index[word]
            context = []
            for j in range(i - window_size, i + window_size + 1):
                if j != i and j >= 0 and j < len(sentence):
                    context.append(word_to_index[sentence[j]])
            data.append((context, target))
    return data

training_data = generate_training_data(tokenized_sentences, word_to_index)

# Word2Vec CBOW Model
class Word2VecCBOW:
    def __init__(self, vocab_size, embedding_dim):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.W1 = np.random.randn(vocab_size, embedding_dim)
        self.W2 = np.random.randn(embedding_dim, vocab_size)
        
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)
    
    def forward_cbow(self, context_indices):
        h = np.mean(self.W1[context_indices], axis=0)
        u = np.dot(h, self.W2)
        y_pred = self.softmax(u)
        return y_pred, h
    
    def backward_cbow(self, error, h, context_indices, learning_rate):
        dW2 = np.outer(h, error)
        dW1 = np.zeros_like(self.W1)
        for context_index in context_indices:
            dW1[context_index] += np.dot(self.W2, error)
        dW1 /= len(context_indices)
        
        self.W1[context_indices] -= learning_rate * dW1[context_indices]
        self.W2 -= learning_rate * dW2

    def train_cbow(self, training_data, epochs, learning_rate):
        for epoch in range(epochs):
            loss = 0
            for context, target in training_data:
                y_pred, h = self.forward_cbow(context)
                error = y_pred.copy()
                error[target] -= 1
                self.backward_cbow(error, h, context, learning_rate)
                loss += -np.log(y_pred[target])
            if epoch % 100 == 0:
                print(f'Epoch {epoch + 1}, Loss: {loss}')

# Initialize and train the Word2Vec CBOW model
embedding_dim = 10
word2vec_cbow = Word2VecCBOW(vocab_size, embedding_dim)
word2vec_cbow.train_cbow(training_data, epochs=1000, learning_rate=0.01)

# Get word embeddings
embeddings = word2vec_cbow.W1

# Prepare data for RNN
max_length = max(len(sentence) for sentence in tokenized_sentences)
encoded_sentences = [[word_to_index[word] for word in sentence] for sentence in tokenized_sentences]
padded_sentences = pad_sequences(encoded_sentences, maxlen=max_length, padding='post')
labels = np.array(labels)

# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embeddings], input_length=max_length, trainable=False))
model.add(SimpleRNN(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the RNN model
model.fit(padded_sentences, labels, epochs=100, verbose=1)

# Test the model
test_sentences = ["I love this book", "I hate this product"]
test_tokenized = [word_tokenize(sentence.lower()) for sentence in test_sentences]


Epoch 1, Loss: 173.7556082088693
Epoch 101, Loss: 35.851099001477486
Epoch 201, Loss: 22.087502682542492
Epoch 301, Loss: 18.81803924476258
Epoch 401, Loss: 17.69639723933672
Epoch 501, Loss: 17.166589821632204
Epoch 601, Loss: 16.86763031352376
Epoch 701, Loss: 16.67892094969198
Epoch 801, Loss: 16.550243387916986
Epoch 901, Loss: 16.45739124175441
Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.5000 - loss: 0.7731
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5000 - loss: 0.7114
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.6000 - loss: 0.6692
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.6000 - loss: 0.6451
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6000 - loss: 0.6348
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.6000 - loss: 0.6316
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.6000 - loss: 0.6289
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.6000 - loss: 0.6221
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

# Prediction 

In [7]:
# Test the model
test_sentences = ["I love this book", "I hate this product"]
test_tokenized = [word_tokenize(sentence.lower()) for sentence in test_sentences]
encoded_test_sentences = [[word_to_index.get(word, 0) for word in sentence] for sentence in test_tokenized]
padded_test_sentences = pad_sequences(encoded_test_sentences, maxlen=max_length, padding='post')
predictions = model.predict(padded_test_sentences)

# Print predictions
for i, sentence in enumerate(test_sentences):
    sentiment = 'positive' if predictions[i][0] >= 0.5 else 'negative'
    print(f"Sentence: '{sentence}' - Sentiment prediction: {predictions[i][0]:.4f} ({sentiment})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Sentence: 'I love this book' - Sentiment prediction: 0.9671 (positive)
Sentence: 'I hate this product' - Sentiment prediction: 0.0190 (negative)


In [12]:
tokenized_sentences

[['i', 'love', 'this', 'movie'],
 ['this', 'film', 'is', 'great'],
 ['amazing', 'experience'],
 ['i', 'am', 'very', 'happy', 'with', 'the', 'product'],
 ['the', 'book', 'is', 'fantastic'],
 ['i', 'hate', 'this', 'movie'],
 ['this', 'film', 'is', 'terrible'],
 ['horrible', 'experience'],
 ['i', 'am', 'very', 'unhappy', 'with', 'the', 'product'],
 ['the', 'book', 'is', 'awful']]

In [17]:
padded_sentences

array([[ 0, 18,  1, 11,  0,  0,  0],
       [ 1, 20, 21,  8,  0,  0,  0],
       [13, 12,  0,  0,  0,  0,  0],
       [ 0, 16,  5,  9, 14,  6,  2],
       [ 6, 10, 21, 15,  0,  0,  0],
       [ 0, 19,  1, 11,  0,  0,  0],
       [ 1, 20, 21, 17,  0,  0,  0],
       [ 7, 12,  0,  0,  0,  0,  0],
       [ 0, 16,  5,  3, 14,  6,  2],
       [ 6, 10, 21,  4,  0,  0,  0]])

In [20]:
vocab_size,embedding_dim

(22, 10)