<a href="https://colab.research.google.com/github/kmayutrisna/SA_IMDB/blob/main/5_IMDB_Robert_CNN_Base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from transformers import RobertaTokenizer

In [3]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Load the IMDB reviews CSV from the mounted Google Drive.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv("drive/My Drive/data/IMDB Dataset.csv")

# Fail fast if the columns indexed by the later cells are missing.
missing_columns = {"review", "sentiment"} - set(data.columns)
if missing_columns:
    raise ValueError(f"IMDB dataset is missing expected column(s): {sorted(missing_columns)}")

# Display the top of the data.
data.head()

In [5]:
# Hold out 20% of the rows, then cut that hold-out in half:
# the result is an 80/10/10 train/validation/test partition.
split_seed = 42  # same seed for both splits, as before
train_df, val_test_df = train_test_split(data, random_state=split_seed, test_size=0.2)
val_df, test_df = train_test_split(val_test_df, random_state=split_seed, test_size=0.5)

In [6]:
# Fetch the pretrained roberta-base tokenizer, then encode the reviews of each split.
# Every split is encoded with the same options: truncation and padding both enabled.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_df['review'].tolist(), **tokenizer_options)
val_encodings = tokenizer(val_df['review'].tolist(), **tokenizer_options)
test_encodings = tokenizer(test_df['review'].tolist(), **tokenizer_options)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [14]:
# Convert the sentiment strings into 0/1 integer arrays that Keras can train on.
import numpy as np

SENTIMENT_TO_ID = {'positive': 1, 'negative': 0}


def encode_sentiment(labels):
    """Map 'positive'/'negative' strings to a NumPy int64 array of 1/0.

    Args:
        labels: pandas Series of sentiment strings.

    Returns:
        1-D ``np.ndarray`` of dtype int64 with one entry per label.

    Raises:
        ValueError: if any value is not a key of ``SENTIMENT_TO_ID``
            (including missing values), instead of letting it reach
            ``model.fit`` as a non-numeric label.
    """
    encoded = labels.map(SENTIMENT_TO_ID)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unexpected sentiment label(s): {unexpected}")
    return encoded.to_numpy(dtype=np.int64)


train_labels = encode_sentiment(train_df['sentiment'])
val_labels = encode_sentiment(val_df['sentiment'])
test_labels = encode_sentiment(test_df['sentiment'])


In [15]:
# Bring every split to the same fixed length so the model sees one input shape.
max_length = 512

# pad_sequences defaults to padding/truncating at the FRONT with value 0.
# In the roberta-base vocabulary id 0 is the <s> token, not padding, and the
# tokenizer itself pads on the right with its own pad id. Pad and truncate at
# the end with the tokenizer's pad id so any extra padding added here matches
# the padding already present in the encodings.
pad_options = dict(
    maxlen=max_length,
    padding='post',
    truncating='post',
    value=tokenizer.pad_token_id,
)
train_sequences = pad_sequences(train_encodings['input_ids'], **pad_options)
val_sequences = pad_sequences(val_encodings['input_ids'], **pad_options)
test_sequences = pad_sequences(test_encodings['input_ids'], **pad_options)


In [16]:
# Assemble the CNN classifier layer by layer:
# token embedding -> 1-D convolution -> global max pool -> sigmoid output unit.
model = Sequential()
model.add(Embedding(len(tokenizer), 100, input_length=max_length))  # one 100-d vector per token id
model.add(Conv1D(128, 5, activation='relu'))  # 128 filters, kernel size 5
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))  # single probability output

In [17]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy reported per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [18]:
# Early stopping: halt once val_loss has not improved for 2 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when stopping triggers; without it the model keeps the weights from
# the last (already worse) epoch, and those are what the later cells evaluate.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

In [24]:
# Fit on the training split, scoring the validation split after every epoch.
# Runs for up to 10 epochs; the early-stopping callback may end training sooner.
history = model.fit(
    x=train_sequences,
    y=train_labels,
    batch_size=64,
    epochs=10,
    validation_data=(val_sequences, val_labels),
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: ignored

In [20]:
# Score the trained model on the validation and test splits.
# evaluate() returns the loss followed by the compiled metric (accuracy).
val_loss, val_accuracy = model.evaluate(x=val_sequences, y=val_labels)
test_loss, test_accuracy = model.evaluate(x=test_sequences, y=test_labels)



In [21]:
# Predict on the test set and threshold the sigmoid outputs at 0.5
# to obtain a plain list of 0/1 class labels.
test_predictions = (model.predict(test_sequences).reshape(-1) >= 0.5).astype(int).tolist()



In [22]:
# Compare the thresholded predictions with the true test labels.
# Note: this replaces the test_accuracy value returned earlier by model.evaluate().
test_accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)
test_f1_score = f1_score(y_true=test_labels, y_pred=test_predictions)

In [23]:
# Report the headline metrics, one per line.
for metric_label, metric_value in (
    ("Validation Accuracy:", val_accuracy),
    ("Test Accuracy:", test_accuracy),
    ("Test F1 Score:", test_f1_score),
):
    print(metric_label, metric_value)

Validation Accuracy: 0.9017999768257141
Test Accuracy: 0.9046
Test F1 Score: 0.9056379821958457


In [None]:
# Model Performance Charts

import matplotlib.pyplot as plt


def plot_history_metric(history_dict, metric, title, ylabel):
    """Plot the per-epoch training and validation curves of one metric.

    Args:
        history_dict: the ``history.history`` mapping returned by ``model.fit``.
        metric: key of the training curve; the validation curve is read
            from ``'val_' + metric``.
        title: figure title.
        ylabel: label of the y axis.
    """
    fig, ax = plt.subplots()
    ax.plot(history_dict[metric])
    ax.plot(history_dict['val_' + metric])
    ax.set(title=title, ylabel=ylabel, xlabel='epoch')
    # The second curve comes from validation_data, not from the test split.
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()


# The model is compiled with metrics=['accuracy'], so the history keys are
# 'accuracy' / 'val_accuracy'. Older Keras releases abbreviated them to
# 'acc' / 'val_acc', so fall back to that spelling when needed.
accuracy_key = 'accuracy' if 'accuracy' in history.history else 'acc'

plot_history_metric(history.history, accuracy_key, 'model accuracy', 'accuracy')
plot_history_metric(history.history, 'loss', 'model loss', 'loss')