<a href="https://colab.research.google.com/github/kmayutrisna/SA_IMDB/blob/main/Glove_CNNLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary library
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Bidirectional, concatenate, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# Later cells read the dataset and the GloVe file from 'drive/My Drive/data/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the IMDB reviews CSV from the mounted Drive into a DataFrame.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being /content (where Drive was mounted) — confirm.
data =pd.read_csv("drive/My Drive/data/IMDB Dataset.csv")

In [None]:
# Download the NLTK corpora needed by this notebook.
# 'stopwords' backs the stopword filter in preprocess_text.
# NOTE(review): no visible cell uses 'wordnet' (WordNetLemmatizer is imported but never called) — confirm whether this download is still needed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Text preprocessing
def preprocess_text(text, stop_words=None):
    """Clean one raw review and return it as a space-joined string of tokens.

    Steps, in order: strip HTML tags, replace every non-letter character with
    a space, lowercase, split on whitespace, then drop one-letter tokens and
    stopwords.

    Args:
        text: Raw review text (str).
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the NLTK English stopword list is used; it is built once
            and cached on the function instead of being rebuilt per review.

    Returns:
        The cleaned review as a single string (may be empty).
    """
    if stop_words is None:
        stop_words = getattr(preprocess_text, '_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            preprocess_text._stop_words = stop_words

    # Remove HTML markup first; otherwise tags such as "<br />" survive the
    # letter-only filter below as the junk token "br". The pattern requires a
    # letter after "<" so plain text like "<3" is not mistaken for a tag.
    text = re.sub(r'<\s*/?\s*[a-zA-Z][^>]*>', ' ', text)

    # Keep letters only; digits and punctuation become token separators.
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Lowercase, then tokenize on whitespace (this also collapses repeated spaces).
    tokens = text.lower().split()

    # Drop single-letter tokens and stopwords. Filtering on tokens (rather
    # than with a regex on the string) also catches single letters at the
    # start/end of the text and runs of adjacent single letters.
    kept = [token for token in tokens if len(token) > 1 and token not in stop_words]

    # Join the tokens back to a single string
    return ' '.join(kept)

In [None]:
# Clean every review with preprocess_text. This overwrites the 'review' column,
# so the raw text is no longer available in `data` after this cell runs.
data['review'] = data['review'].apply(preprocess_text)

In [None]:
# Convert sentiment to binary labels (positive -> 1, negative -> 0).
# Guarded so that re-running this cell is a no-op: mapping an already-numeric
# column a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data['sentiment']):
    sentiment_labels = data['sentiment'].map({'positive': 1, 'negative': 0})
    # Fail loudly on unexpected or missing labels instead of training on NaN targets.
    if sentiment_labels.isna().any():
        raise ValueError("Unexpected values in 'sentiment'; expected only 'positive' or 'negative'.")
    data['sentiment'] = sentiment_labels.astype(int)
#data = data.sample(frac=1).reset_index(drop=True)  # Shuffling the dataset

In [None]:
# Preview the first rows of `data` (cleaned reviews and numeric sentiment labels).
data.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1


In [None]:
# Extract the cleaned review texts (X) and their labels (y) as NumPy arrays.
X = data['review'].values
y = data['sentiment'].values

In [None]:
# Splitting the dataset into train / validation / test.
# First hold out 10% as the test set, then take 11.1% of the remaining 90% as
# the validation set (0.111 * 0.9 is roughly 10% of the full data).
# Both splits use random_state=42 so they are reproducible.

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.111, random_state=42)
# result: 40005 (presumably the number of training samples — confirm)

In [None]:
# Tokenizing the texts
vocab_size = 10000  # Maximum number of words to keep in the vocabulary
tokenizer = Tokenizer(num_words=vocab_size)
# Fit on the training split only, so validation/test text does not influence the vocabulary.
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert each split's texts to lists of integer word indices,
# using the tokenizer fitted on the training split.
train_seq = tokenizer.texts_to_sequences(X_train)
val_seq = tokenizer.texts_to_sequences(X_val)
test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Bring every sequence to the same fixed length so the splits can be fed to the model.
# pad_sequences is called with its default padding/truncation settings.
#max_sequence_length = max([len(seq) for seq in train_seq])
max_length = 100  # Maximum sequence length
train_data = pad_sequences(train_seq, maxlen=max_length)
val_data = pad_sequences(val_seq, maxlen=max_length)
test_data = pad_sequences(test_seq, maxlen=max_length)

In [None]:
# Location of the pre-trained GloVe vectors on the mounted Drive.
# The file name indicates 100-dimensional vectors; embedding_dim must match it.
glove_path = 'drive/My Drive/data/glove.6B.100d.txt'

In [None]:
import numpy as np
embedding_dim = 100  # Dimensionality of the word embeddings (must match the GloVe file)
embedding_index = {}  # Maps each word to its float32 coefficient vector

# Read the GloVe text file: each line is a word followed by its coefficients,
# separated by whitespace. The encoding is passed explicitly so parsing does
# not depend on the platform's default encoding.
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()  # [word, c1, c2, ..., cN]
        # Skip blank or malformed lines: an empty line would raise IndexError
        # below, and a wrong-length vector could not be copied into the
        # embedding matrix later.
        if len(values) != embedding_dim + 1:
            continue
        word = values[0]  # The first field is the word itself
        coefficients = np.asarray(values[1:], dtype='float32')  # Remaining fields are the vector
        embedding_index[word] = coefficients

# An empty index means the file is empty or embedding_dim does not match it.
if not embedding_index:
    raise ValueError(
        f"No {embedding_dim}-dimensional vectors were loaded from {glove_path!r}; "
        "check the file and embedding_dim."
    )

In [None]:
# Build the (num_words x embedding_dim) weight matrix for the Embedding layer.
# Row i receives the GloVe vector of the word whose tokenizer index is i;
# rows that are never assigned (no GloVe entry for the word) stay all-zero.
word_index = tokenizer.word_index
num_words = min(vocab_size, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if i >= vocab_size:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# CNN + LSTM classifier:
#   frozen GloVe embedding -> Conv1D(128 filters, kernel 5) -> MaxPooling1D(4) -> LSTM(128) -> sigmoid output.
# NOTE: the LSTM here is unidirectional; no Bidirectional wrapper is applied.
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, MaxPooling1D
from tensorflow.keras.models import Sequential
model = Sequential()
# trainable=False keeps the pre-trained GloVe weights fixed during training.
model.add(Embedding(num_words, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
# Single sigmoid unit: outputs a value in (0, 1) for the binary sentiment label.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked during training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model for a fixed 10 epochs, evaluating on the validation split after each epoch.
# NOTE(review): EarlyStopping is imported at the top of the notebook but no callback is passed here.
model.fit(train_data, y_train, batch_size=32, epochs=10, validation_data=(val_data, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7c3be62d38b0>

In [None]:
# Evaluating the model: predict on the held-out test split,
# then round each prediction to the nearest integer to obtain a 1-D array of class labels.
y_pred = model.predict(test_data)
y_pred_binary = np.round(y_pred).flatten()



In [None]:
# Score the rounded test predictions against the true labels with each metric.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_binary)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(label, value)

Accuracy: 0.8448
Precision: 0.8623700623700624
Recall: 0.8233425962683605
F1-score: 0.842404549147035
