<a href="https://colab.research.google.com/github/itsmuditt/Bert_Finetuning/blob/main/Bert_Finetuning_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset loading (pre-split train and test sets)
---

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_from_disk

# Locations of the separately stored train and test dataset folders on Google Drive.
# Both paths sit under /content/drive, so Drive must already be mounted there.
train_dataset_path = '/content/drive/MyDrive/DLNLP_Project/Datasets/train'
test_dataset_path = '/content/drive/MyDrive/DLNLP_Project/Datasets/test'

# Read each folder back into its own dataset object with load_from_disk
train_dataset = load_from_disk(train_dataset_path)
test_dataset = load_from_disk(test_dataset_path)

In [None]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 2247
})

---

In [None]:
from datasets import DatasetDict, Dataset
import pandas as pd

In [None]:
# Keep only the 'text' and 'label' columns of each split, as plain column dictionaries
train_data = {column: train_dataset[column] for column in ('text', 'label')}
test_data = {column: test_dataset[column] for column in ('text', 'label')}

In [None]:
# Build one DataFrame per split; each dictionary key becomes a column
train_df = pd.DataFrame.from_dict(train_data)
test_df = pd.DataFrame.from_dict(test_data)

In [None]:
# Convert the DataFrames back to Hugging Face Dataset objects
train = Dataset.from_pandas(train_df)
test = Dataset.from_pandas(test_df)

# Bundle the two splits into a DatasetDict with "train" and "test" keys.
# No validation split is created here.
split_dataset = DatasetDict({
    "train": train,
    "test": test
})

In [None]:
len(split_dataset['train'])

2247

---

## Tokenizing the datasets and separating them into model inputs (input_ids) and labels

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TFTrainer, TFTrainingArguments
from transformers.training_args_tf import TFTrainingArguments

In [None]:
# The tokenizer and the 3-label classification model are loaded from the same pretrained checkpoint
MODEL_NAME = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch to a fixed length of 128 tokens.

    Args:
        batch: mapping with a "text" entry holding the texts to encode.

    Returns:
        The output of the module-level `tokenizer`, with every sequence
        truncated or padded to exactly 128 tokens.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
    return encoded

In [None]:
tokenized_dataset = split_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/2247 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2247
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 250
    })
})

In [None]:
# NOTE: from here on train_dataset / test_dataset refer to the *tokenized* splits;
# the names previously held the raw datasets loaded from disk, so re-running
# earlier cells after this one will pick up the tokenized versions instead.
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

In [None]:
import numpy as np

---

In [None]:
# Integer class id assigned to each string label
label_map = {'pos': 0, 'neu': 1, 'neg': 2}


def _encode_labels(labels):
    """Return a NumPy array with the label_map id of every string label."""
    return np.array([label_map[name] for name in labels])


# Map string labels to integers
train_labels = _encode_labels(train_dataset['label'])
test_labels = _encode_labels(test_dataset['label'])

In [None]:
# Feed the model the token ids together with their attention mask.
# Every text was padded to max_length during tokenization; if only input_ids
# are passed, the model has no way to tell padding from real tokens and attends
# to all positions. Passing the mask also matches the prediction cell, which
# hands the full tokenizer output to the model.
train_inputs = {
    'input_ids': np.array(train_dataset['input_ids']),
    'attention_mask': np.array(train_dataset['attention_mask']),
}
test_inputs = {
    'input_ids': np.array(test_dataset['input_ids']),
    'attention_mask': np.array(test_dataset['attention_mask']),
}

---

In [None]:
# Sanity check: report the type and, for NumPy arrays, the shape of the training inputs and labels
for array in (train_inputs, train_labels):
    print(type(array))
    print(array.shape if isinstance(array, np.ndarray) else "Not a NumPy array")


<class 'numpy.ndarray'>
(2247, 128)
<class 'numpy.ndarray'>
(2247,)


In [None]:
# Sanity check: report the type and, for NumPy arrays, the shape of the test inputs and labels
for array in (test_inputs, test_labels):
    print(type(array))
    print(array.shape if isinstance(array, np.ndarray) else "Not a NumPy array")

<class 'numpy.ndarray'>
(250, 128)
<class 'numpy.ndarray'>
(250,)


---

## Defining Model Parameters and training the Model

In [None]:
# Callback that saves the model to Google Drive during training, keeping only
# the version with the highest validation accuracy seen so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/drive/MyDrive/DLNLP_Project/Models/best_model',  # Where the best model is written
    monitor='val_accuracy',    # Quantity compared between epochs
    mode='max',                # Higher val_accuracy counts as better
    save_best_only=True,        # Do not overwrite a better model with a worse one
    # NOTE(review): save_format does not appear among ModelCheckpoint's documented
    # arguments — confirm it has any effect here.
    save_format = 'tf'
)

In [None]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  237556224 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 237558531 (906.21 MB)
Trainable params: 237558531 (906.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Take the first top-level layer of the model; model.summary() lists it as
# 'bert (TFBertMainLayer)'. The encoder layers are reached through it.
BERT_layers = model.layers[0]

In [None]:
# Access the encoder layers within the BERT model
encoder_layers = BERT_layers.encoder.layer

In [None]:
len(encoder_layers)

12

In [None]:
# Freeze the parameters of the first two and the last encoder layers.
# enumerate() counts from 0, so the last layer is at index num_layers - 1
# (num_layers - 2 would freeze the second-to-last layer instead).
num_layers = len(encoder_layers)
for i, layer in enumerate(encoder_layers):
    if i < 2 or i == num_layers - 1:
        layer.trainable = False

In [None]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  237556224 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 237558531 (906.21 MB)
Trainable params: 216294915 (825.10 MB)
Non-trainable params: 21263616 (81.11 MB)
_________________________________________________________________


In [None]:
# Optimizer, loss and metric used to compile the model.
# The loss is configured with from_logits=True, i.e. it is given raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

In [None]:
# Compile the model with the optimizer, loss and accuracy metric defined above.
# NOTE(review): run_eagerly=True is typically a debugging setting and tends to
# slow training down — confirm it is actually required here.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric], run_eagerly=True)

In [None]:
def train_model(train_inputs, train_labels, epochs=48, batch_size=32, validation_split=0.2):
    """Fine-tune the module-level `model` and return its training history.

    Args:
        train_inputs: model inputs accepted by `model.fit`.
        train_labels: integer class ids aligned with `train_inputs`.
        epochs: number of passes over the training data (default 48).
        batch_size: number of samples per gradient update (default 32).
        validation_split: fraction of the data held out for validation
            (default 0.2).

    Returns:
        The History object returned by `model.fit`, so that per-epoch loss
        and accuracy can be inspected or plotted after training.
    """
    # Keras takes the validation samples from the END of the arrays, before any
    # shuffling; shuffle=True only reorders the remaining training samples.
    return model.fit(
        train_inputs,
        train_labels,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],  # saves the best model by val_accuracy
    )

In [None]:
# Train the model
train_model(train_inputs, train_labels)

Epoch 1/48




Epoch 2/48
Epoch 3/48
Epoch 4/48
Epoch 5/48
Epoch 6/48
Epoch 7/48
Epoch 8/48
Epoch 9/48
Epoch 10/48
Epoch 11/48
Epoch 12/48
Epoch 13/48
Epoch 14/48
Epoch 15/48
Epoch 16/48
Epoch 17/48
Epoch 18/48
Epoch 19/48
Epoch 20/48
Epoch 21/48
Epoch 22/48
Epoch 23/48
Epoch 24/48
Epoch 25/48
Epoch 26/48
Epoch 27/48
Epoch 28/48
Epoch 29/48
Epoch 30/48
Epoch 31/48
Epoch 32/48
Epoch 33/48
Epoch 34/48
Epoch 35/48
Epoch 36/48
Epoch 37/48
Epoch 38/48
Epoch 39/48
Epoch 40/48
Epoch 41/48
Epoch 42/48
Epoch 43/48
Epoch 44/48
Epoch 45/48
Epoch 46/48
Epoch 47/48
Epoch 48/48


In [None]:
# Save the model as it stands after the last training epoch. This is separate
# from the 'best_model' copy written by the checkpoint callback.
model.save('/content/drive/MyDrive/DLNLP_Project/Models/full_trained_model', save_format='tf')

---

## Evaluating the Model

In [None]:
def evaluate_model(test_inputs, test_labels):
    """Evaluate the module-level `model` and print its test loss and accuracy.

    Args:
        test_inputs: model inputs accepted by `model.evaluate`.
        test_labels: integer class ids aligned with `test_inputs`.
    """
    scores = model.evaluate(test_inputs, test_labels)
    # The first two entries are read as the loss and the accuracy metric
    loss_value = scores[0]
    print("Test Loss:", loss_value)
    accuracy_value = scores[1]
    print("Test Accuracy:", accuracy_value)

In [None]:
# Evaluate the model on the test dataset
evaluate_model(test_inputs, test_labels)

Test Loss: 1.2507719993591309
Test Accuracy: 0.6320000290870667


## Predicting the label of a single sample sentence

In [None]:
text = 'इसमें अच्छा परफॉरमेंस और क्षमता होने के साथ इसकी बैटरी लाइफ अन्य टेबलेट्स की तुलना में बहुत अच्छी है।'

In [None]:
# Encode the sample sentence as TensorFlow tensors, then run it through the model
encoded_text = tokenizer(text=text, padding=True, truncation=True, max_length=128, return_tensors='tf')
output = model.predict(encoded_text)



In [None]:
output

TFSequenceClassifierOutput(loss=None, logits=array([[ 1.399113  ,  0.45861956, -1.8765097 ]], dtype=float32), hidden_states=None, attentions=None)

In [None]:
predicted_class = tf.argmax(output.logits, axis=1).numpy()[0]

In [None]:
# Invert label_map so that integer class ids point back to their string labels
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

# Look up the label string for the predicted class id
predicted_label = reverse_label_map[predicted_class]

# Show the input sentence followed by the predicted label
print(f"Input Text: {text}\n")
print("Predicted label:", predicted_label)

Input Text: इसमें अच्छा परफॉरमेंस और क्षमता होने के साथ इसकी बैटरी लाइफ अन्य टेबलेट्स की तुलना में बहुत अच्छी है।

Predicted label: pos


---
---

                                                                      The End!

---
---
