<a href="https://colab.research.google.com/github/karolinakuligowska/TMSMM_codes/blob/main/Class_12_Neural_Nets_RNN__R_example3_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Enhance NN model for classifying sentiment of movie_review.csv
# by incorporating DistilBERT, a lightweight Transformer model
# Let's see if the modelimproves performance

In [2]:
# Steps:
#
# 1. Replace the text vectorization + embedding layers with a pretrained DistilBERT model from transformers.
# 2. Use the Hugging Face transformers library to tokenize and process text.
# 3. Use a simple classifier head with Dropout + Dense layers on top of DistilBERT.
# 4. Train the model using TensorFlow/Keras.

In [17]:
# Install transformers if not installed
!pip install transformers datasets
!pip install --upgrade fsspec==2024.10.0
!pip install --upgrade gcsfs==2024.10.0

import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

Collecting fsspec==2024.10.0
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.10.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.9.0
    Uninstalling fsspec-2024.9.0:
      Successfully uninstalled fsspec-2024.9.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.2.0 requires fsspec[http]<=2024.9.0,>=2023.1.0, but you have fsspec 2024.10.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2024.10.0


In [18]:
# 1. Load dataset ----
df = pd.read_csv('movie_review.csv', encoding='utf-8')
print(df.head())


   fold_id cv_tag  html_id  sent_id  \
0        0  cv000    29590        0   
1        0  cv000    29590        1   
2        0  cv000    29590        2   
3        0  cv000    29590        3   
4        0  cv000    29590        4   

                                                text  tag  
0  films adapted from comic books have had plenty...  pos  
1  for starters , it was created by alan moore ( ...  pos  
2  to say moore and campbell thoroughly researche...  pos  
3  the book ( or " graphic novel , " if you will ...  pos  
4  in other words , don't dismiss this film becau...  pos  


In [19]:
# 2. Explore the data ----
print(df['tag'].value_counts())
print(df['text'][0])

# Split dataset
training, testing = train_test_split(df, test_size=0.2, random_state=123)

tag
pos    32937
neg    31783
Name: count, dtype: int64
films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .


In [20]:
# 3. Prepare the tokenizer ----
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

def encode_texts(texts, tokenizer, max_length=128):
    """ Tokenizes text using DistilBERT tokenizer """
    return tokenizer(
        list(texts),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

In [21]:
# Tokenize training and testing data
train_encodings = encode_texts(training['text'], tokenizer)
test_encodings = encode_texts(testing['text'], tokenizer)

# Convert labels to binary format
train_labels = (training['tag'] == "pos").astype(int).values
test_labels = (testing['tag'] == "pos").astype(int).values

In [22]:
# 4. Build the Transformer-based model ----
input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Load DistilBERT base model
bert_model = TFDistilBertModel.from_pretrained(MODEL_NAME)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [23]:
# Extract CLS token representation
x = bert_model(input_ids, attention_mask=attention_mask)[0][:, 0, :]
x = Dropout(0.3)(x)  # Add dropout for regularization
x = Dense(128, activation="relu")(x)  # Fully connected layer
x = Dropout(0.3)(x)
output_layer = Dense(1, activation="sigmoid")(x)  # Output layer for binary classification

ValueError: Exception encountered when calling layer 'tf_distil_bert_model_3' (type TFDistilBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_distil_bert_model_3' (type TFDistilBertModel):
  • input_ids=<KerasTensor shape=(None, 128), dtype=int32, sparse=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 128), dtype=int32, sparse=False, name=attention_mask>
  • head_mask=None
  • inputs_embeds=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [24]:
# The ValueError occurs because bert_model(input_ids, attention_mask=attention_mask) is being called
# with Keras tensors inside a functional Keras model.
# Hugging Face's TFDistilBertModel expects TensorFlow tensors (from tf.Tensor),
# but KerasTensors from the Functional API cause issues.

In [25]:
# Solution: Use Lambda Layer to Wrap the Transformer Model
# We will wrap the DistilBERT model inside a Lambda layer,
# which ensures it works correctly inside a Keras Functional Model.

In [26]:
import tensorflow as tf
from transformers import TFDistilBertModel

# Define input layers
input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Load DistilBERT base model
bert_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [29]:
# Wrap BERT model inside a Lambda layer
def bert_layer(inputs):
    input_ids, attention_mask = inputs
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    return outputs.last_hidden_state[:, 0, :]  # Extract CLS token representation



In [37]:
# x = tf.keras.layers.Lambda(bert_layer)([input_ids, attention_mask]) #this line generated error

# To fix this, we need to explicitly specify the output_shape argument when defining the Lambda layer.
# The output shape of the bert_layer is (768,) as it returns the CLS token representation
# which has a dimensionality of 768 for distilbert-base-uncased

# fixed line:

x = tf.keras.layers.Lambda(bert_layer, output_shape=(768,))([input_ids, attention_mask])

In [38]:
# Fully connected layers
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.3)(x)

In [39]:
# Output layer for binary classification
output_layer = tf.keras.layers.Dense(1, activation="sigmoid")(x)

In [40]:
# Define the model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output_layer)

In [41]:
# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [42]:
# Check model structure (optional)
model.summary()

In [43]:
# Train the model
history = model.fit(
    x={'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
    y=train_labels,
    epochs=2,  # Adjust based on dataset size
    batch_size=16,  # Recommended batch size for transformers
    validation_split=0.2,
    verbose=2
)

Epoch 1/2


KeyboardInterrupt: 

In [None]:
# Evaluate the model
results = model.evaluate(
    x={'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
    y=test_labels,
    verbose=0
)

# Print final evaluation metrics
print(f"Test Loss: {results[0]:.2f}")
print(f"Test Accuracy: {results[1]:.2f}")