In [2]:
# Step 1: Import necessary libraries
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
import numpy as np
import matplotlib.pyplot as plt

# Step 2: Read the tweets CSV. It is parsed with header=None, so column names
# are attached by hand immediately afterwards.
df = pd.read_csv(
    r'C:\Users\ksvsu\OneDrive\Desktop\Minor Project\Tweets.csv',
    header=None,
    encoding='ISO-8859-1',
    on_bad_lines='skip',  # malformed rows are dropped instead of aborting the load
)
# The list below must contain exactly one name per column present in the file.
df.columns = ['target', 'id', 'date', 'text']

# Step 3: Binarise the labels: a raw value of 4 becomes 1, everything else 0.
is_positive = df['target'].eq(4)
df['target'] = is_positive.astype('int64')

# Step 4: Load the WordPiece tokenizer that matches the uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data using the BERT tokenizer
def tokenize_tweets(tweets, tokenizer, max_length=128):
    """Encode a batch of tweets with ``tokenizer`` at a fixed sequence length.

    Args:
        tweets: The texts to encode. Accepts a pandas Series or NumPy array
            (anything exposing ``tolist()``), any other iterable of strings
            (list, tuple, generator), or a single string, which is treated
            as a one-element batch.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=...,
            truncation=..., max_length=..., return_tensors=...)``.
        max_length: Sequence length requested for padding and truncation.

    Returns:
        Whatever ``tokenizer`` returns for the batch; TensorFlow tensors are
        requested via ``return_tensors="tf"``.
    """
    if isinstance(tweets, str):
        # A bare string is itself iterable; wrap it so it is not split into characters.
        texts = [tweets]
    elif hasattr(tweets, 'tolist'):
        texts = tweets.tolist()
    else:
        texts = list(tweets)

    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )

# Encode every tweet in a single tokenizer call.
tokens = tokenize_tweets(df['text'], tokenizer)

# Pull out the two tensors that are fed to the model.
input_ids, attention_masks = tokens['input_ids'], tokens['attention_mask']

# The split below works on NumPy arrays, so leave TensorFlow tensors behind here.
input_ids_np, attention_masks_np = input_ids.numpy(), attention_masks.numpy()
targets_np = df['target'].values

# Step 5: Split ids, masks and labels in ONE call so the three arrays stay
# row-aligned. The split is stratified on the label and seeded for repeatability.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    input_ids_np,
    attention_masks_np,
    targets_np,
    test_size=0.2,
    stratify=targets_np,
    random_state=42,
)

# Step 6: Define a function to create TensorFlow datasets
def create_tf_dataset(inputs, masks, labels, batch_size=32, shuffle=True, buffer_size=1000):
    """Build a batched, prefetched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        inputs: Token-id array, sliced along its first axis.
        masks: Attention-mask array, sliced in step with ``inputs``.
        labels: Label array, sliced in step with ``inputs``.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle examples before batching. Defaults to
            True, which matches the previous always-shuffle behaviour.
        buffer_size: Size of the shuffle buffer; only used when ``shuffle``
            is true.

    Returns:
        A dataset yielding ``({'input_ids': ..., 'attention_mask': ...}, label)``
        batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        ({'input_ids': inputs, 'attention_mask': masks}, labels)
    )
    if shuffle:
        dataset = dataset.shuffle(buffer_size)
    # tf.data.AUTOTUNE is the stable name for tf.data.experimental.AUTOTUNE.
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

batch_size = 32
train_dataset = create_tf_dataset(train_inputs, train_masks, train_labels, batch_size)
# Validation data is only evaluated, so shuffling it is wasted work; keeping
# the original order also keeps batches aligned with val_labels.
val_dataset = create_tf_dataset(val_inputs, val_masks, val_labels, batch_size, shuffle=False)

# Step 7: Create a custom BERT model with additional layers
class CustomBERTModel(tf.keras.Model):
    """Encoder followed by dropout and a softmax classification head.

    Args:
        base_model: Callable encoder. It is invoked as
            ``base_model(inputs, training=...)`` and its result must expose
            a ``pooler_output`` attribute.
        num_classes: Number of units in the final Dense layer (default 2).
        dropout_rate: Rate of the dropout applied to the pooled output
            (default 0.3).
    """

    def __init__(self, base_model, num_classes=2, dropout_rate=0.3):
        super().__init__()
        self.base_model = base_model
        self.dropout = Dropout(dropout_rate)
        # The head emits probabilities (softmax), so the loss used with this
        # model must be configured with from_logits=False.
        self.classifier = Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Return class probabilities for ``inputs``.

        Args:
            inputs: Passed through unchanged to ``base_model``.
            training: Whether the forward pass runs in training mode.
        """
        # Forward the flag explicitly so any training-dependent layers inside
        # the encoder (e.g. dropout) run in the same mode as the head.
        outputs = self.base_model(inputs, training=training)
        pooled_output = outputs.pooler_output  # pooled sequence representation
        x = self.dropout(pooled_output, training=training)
        return self.classifier(x)

# Load the pre-trained checkpoint and reuse only its `.bert` encoder as the
# backbone of the custom classifier.
base_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
model = CustomBERTModel(base_model=base_model.bert)

# Step 8: Loss and optimizer.
# from_logits=False because the classifier head already applies softmax.
loss_fn = SparseCategoricalCrossentropy(from_logits=False)
# Small learning rate, as is usual when fine-tuning BERT.
optimizer = Adam(learning_rate=3e-5)

# Step 9: Wire everything together and track accuracy during training.
model.compile(
    loss=loss_fn,
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Step 10: Watch val_loss with a patience of 3 epochs and ask Keras to restore
# the best weights it saw.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Step 11: Fine-tune for at most 5 epochs, validating after each one and
# letting the early-stopping callback cut training short.
training_options = {
    'epochs': 5,
    'validation_data': val_dataset,
    'callbacks': [early_stopping],
    'verbose': 1,
}
history = model.fit(train_dataset, **training_options)

# Step 12: Score the trained model on the validation set and report the result.
evaluation = model.evaluate(val_dataset)
val_loss, val_accuracy = evaluation
print(f"Final Validation Loss: {val_loss:.4f}")
print(f"Final Validation Accuracy: {val_accuracy:.4f}")

# Step 13: Draw accuracy and loss curves side by side, one point per epoch.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: accuracy on the training and validation sets.
accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy', marker='o')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy', marker='o')
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()
accuracy_ax.grid(True)

# Right panel: loss on the training and validation sets.
loss_ax.plot(history.history['loss'], label='Training Loss', marker='o')
loss_ax.plot(history.history['val_loss'], label='Validation Loss', marker='o')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)

fig.tight_layout()
plt.show()










All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5


In [1]:
# This cell relies on objects created by the setup cell. On a fresh kernel they
# do not exist, so fail fast with an actionable message instead of a bare
# "name 'model' is not defined".
_required_names = ('model', 'train_dataset', 'val_dataset', 'early_stopping')
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Run the setup cell (data loading, tokenization, model definition and "
        "compile) before this cell; undefined: " + ", ".join(_missing_names)
    )

# NOTE(review): this cell repeats the fit/evaluate steps of the setup cell. If
# both cells are run in sequence the model is trained a second time and
# `history` only describes the second run.

# Step 11: Train the model with early stopping
history = model.fit(
    train_dataset,
    epochs=5,
    validation_data=val_dataset,
    callbacks=[early_stopping],
    verbose=2  # one summary line per epoch, no per-batch progress bar (verbose=1 shows the bar)
)

# Step 12: Evaluate the model
val_loss, val_accuracy = model.evaluate(val_dataset)
print(f"Final Validation Loss: {val_loss:.4f}")
print(f"Final Validation Accuracy: {val_accuracy:.4f}")

# Reset Keras' global state. `model` and `history` are still referenced by
# this notebook after the call.
tf.keras.backend.clear_session()

# Print confirmation to check code progression
print("Model evaluation complete. Proceeding to plotting.")

# Step 13: Plot training vs. validation curves, one panel per metric.
# Each entry: (training key, validation key, training label, validation label,
#              panel title, y-axis label).
panels = (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Model Loss', 'Loss'),
)

plt.figure(figsize=(12, 5))

for position, (train_key, val_key, train_label, val_label, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label, marker='o')
    plt.plot(history.history[val_key], label=val_label, marker='o')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()


NameError: name 'model' is not defined