## Setup

In [27]:
!pip install transformers



In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, GlobalMaxPooling1D, Bidirectional,Conv1D,MaxPooling1D,Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import re

In [29]:
df=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [30]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
y=le.fit_transform(df['sentiment'])

In [31]:
from sklearn.model_selection import train_test_split
df_train, df_test,y_train,y_test= train_test_split(df,y,test_size = 0.20, random_state = 0)

In [32]:
#Removes Punctuations
def remove_punctuations(data):
    punct_tag=re.compile(r'[^\w\s]')
    data=punct_tag.sub(r'',data)
    return data

#Removes HTML syntaxes
def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

#Removes URL data
def remove_url(data):
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#Removes Emojis
def remove_emoji(data):
    emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    data=emoji_clean.sub(r'',data)
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

df_train['review'] = df_train['review'].apply(lambda z: remove_punctuations(z))
df_train['review'] = df_train['review'].apply(lambda z: remove_html(z))
df_train['review'] = df_train['review'].apply(lambda z: remove_url(z))
df_train['review'] = df_train['review'].apply(lambda z: remove_emoji(z))

df_test['review'] = df_test['review'].apply(lambda z: remove_punctuations(z))
df_test['review'] = df_test['review'].apply(lambda z: remove_html(z))
df_test['review'] = df_test['review'].apply(lambda z: remove_url(z))
df_test['review'] = df_test['review'].apply(lambda z: remove_emoji(z))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [33]:
# Define BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## Encoding 

In [34]:
# Tokenize text data
MAX_LEN = 128
X_train_tokenized = tokenizer.batch_encode_plus(
    df_train['review'].tolist(),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False
)
X_test_tokenized = tokenizer.batch_encode_plus(
    df_test['review'].tolist(),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False
)

In [35]:
attention_masks = np.array(X_train_tokenized['attention_mask'])
X_train = np.array(X_train_tokenized['input_ids'])
X_test = np.array(X_test_tokenized['input_ids'])

In [36]:
# Define input layer for BERT model
input_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")

# Connect tokenizer output to BERT model
bert_output = bert_model(input_layer)[0]

net = Bidirectional(LSTM(128, return_sequences=True))(bert_output)
net = Conv1D(128, 7, activation='relu', padding='same')(net)
net = MaxPooling1D()(net)
net = Conv1D(256, 5, activation='relu', padding='same')(net)
net = MaxPooling1D()(net)
net = Conv1D(512, 3, activation='relu', padding='same')(net)
net = MaxPooling1D()(net)
net = Flatten()(net)
net = Dense(128, activation='relu')(net)
net = Dropout(0.5)(net)
outputs = Dense(1, activation='sigmoid')(net) 

# Define the model
model = Model(inputs=[input_layer], outputs=[outputs])  # Fix: use 'outputs' instead of 'output_layer'

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-5), metrics=['accuracy'])

In [37]:
history = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=16,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
batch_size = 16  # Set batch size to the same value as used in training
loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test accuracy:', accuracy)

Test accuracy: 0.8889999985694885
