In [None]:
!pip install -q keras

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.utils import plot_model
from transformers import BertTokenizer, TFBertModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords')
nltk.download('wordnet')
# Locations of the three dataset splits (tab-separated files)
train_file_path = '/content/train.tsv'
valid_file_path = '/content/valid.tsv'
test_file_path = '/content/test.tsv'

# Read every split with identical parser settings: tab delimiter, Latin-1 decoding.
# The generator is consumed in order, so the files are read train -> valid -> test.
train_data, valid_data, test_data = (
    pd.read_csv(split_path, sep='\t', encoding="Latin-1")
    for split_path in (train_file_path, valid_file_path, test_file_path)
)

# Preprocessing
# Stop-word set and lemmatizer are identical for every row, so they are built
# once on first use instead of once per call.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the English stop-word set and the lemmatizer, creating both on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise one raw text value into a cleaned, space-joined token string.

    Steps: lower-case, strip URLs / @mentions / #hashtags, strip punctuation,
    tokenize, drop English stop words, lemmatize, and join with single spaces.

    Args:
        text: The raw text. Missing values (``None`` / NaN, as pandas produces
            for empty cells) yield an empty string; any other non-string value
            is converted with ``str`` before processing.

    Returns:
        The preprocessed text as a single string (possibly empty).
    """
    if not isinstance(text, str):
        # Empty TSV cells arrive as float NaN; calling .lower() on them would raise.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    # Remove URLs, @mentions and #hashtags
    text = re.sub(r'http\S+|www\S+|@[^\s]+|#\w+', '', text)
    # Remove everything that is not a word character or whitespace
    text = re.sub(r'[^\w\s\d]', '', text)

    stop_words, lemmatizer = _nlp_resources()
    lemmatized_tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(lemmatized_tokens)

# Clean the 'Text' column of every split, overwriting the raw text in place
for split_frame in (train_data, valid_data, test_data):
    split_frame['Text'] = split_frame['Text'].apply(preprocess_text)

# BERT tokenizer setup: `maxlen` is the sequence length handed to the tokenizer
maxlen = 100
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text):
    """Encode one text with the module-level BERT tokenizer.

    The text is truncated and padded to `maxlen` tokens and the result is
    requested as TensorFlow tensors.

    Returns:
        A pair ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoded = tokenizer.encode_plus(
        text,
        max_length=maxlen,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

# Encode every text of each split; each entry is an (input_ids, attention_mask) pair
train_tokens = list(map(tokenize_text, train_data['Text']))
val_tokens = list(map(tokenize_text, valid_data['Text']))
test_tokens = list(map(tokenize_text, test_data['Text']))

# Vertically stack the per-text tensors into one array per split and per field
X_train_input_ids = np.vstack([ids for ids, _mask in train_tokens])
X_train_attention_masks = np.vstack([mask for _ids, mask in train_tokens])

X_val_input_ids = np.vstack([ids for ids, _mask in val_tokens])
X_val_attention_masks = np.vstack([mask for _ids, mask in val_tokens])

X_test_input_ids = np.vstack([ids for ids, _mask in test_tokens])
X_test_attention_masks = np.vstack([mask for _ids, mask in test_tokens])

# Encode labels
# The encoder is fitted on the training split only; the validation and test
# splits reuse that fitted mapping through `transform`.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['Label'])
valid_labels = label_encoder.transform(valid_data['Label'])
# NOTE(review): this reads the 'Labels' column while the other two splits read
# 'Label' — confirm the test file really names its label column differently.
test_labels = label_encoder.transform(test_data['Labels'])

# Pretrained BERT encoder with its weights frozen (not updated during training)
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
bert_model.trainable = False

# Two int32 inputs of length `maxlen`: token ids and the matching attention mask
bert_input_ids = tf.keras.layers.Input(dtype=tf.int32, shape=(maxlen,))
bert_attention_masks = tf.keras.layers.Input(dtype=tf.int32, shape=(maxlen,))

# Run BERT and keep the first element of its output (the token embeddings)
bert_outputs = bert_model(bert_input_ids, attention_mask=bert_attention_masks)
embedding = bert_outputs[0]

# Convolution over the token axis followed by a global max over that axis
conv_layer = tf.keras.layers.Conv1D(
    filters=64,
    kernel_size=3,
    activation='relu',
)(embedding)
maxpool_layer = tf.keras.layers.GlobalMaxPooling1D()(conv_layer)

# Softmax classifier with one unit per class known to the label encoder
output = tf.keras.layers.Dense(
    len(label_encoder.classes_),
    activation='softmax',
)(maxpool_layer)

# Assemble the two-input, one-output model
model = tf.keras.models.Model(
    inputs=[bert_input_ids, bert_attention_masks],
    outputs=output,
)

# Integer class labels -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Save a diagram of the architecture to disk
plot_model(
    model,
    to_file='model_architecture.png',
    show_shapes=True,
    show_layer_names=True,
)

# Each split is fed to the model as [token ids, attention masks]
train_inputs = [X_train_input_ids, X_train_attention_masks]
val_inputs = [X_val_input_ids, X_val_attention_masks]
test_inputs = [X_test_input_ids, X_test_attention_masks]

# Fit on the training split, using the validation split as validation data
history = model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, valid_labels),
    epochs=10,
    batch_size=512,
)

# Report loss and accuracy on the test split
test_loss, test_accuracy = model.evaluate(test_inputs, test_labels)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# Report loss and accuracy on the validation split
val_loss, val_accuracy = model.evaluate(val_inputs, valid_labels)
print("Valid Loss:", val_loss)
print("Validation Accuracy:", val_accuracy)

# Write the architecture diagram to 'model_architecture.png'
plot_model(
    model,
    to_file='model_architecture.png',
    show_shapes=True,
    show_layer_names=True,
)

# One figure per metric: training curve first, validation curve second
curve_specs = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)
for train_key, val_key, figure_title, y_label in curve_specs:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cl

Epoch 1/10
 2/20 [==>...........................] - ETA: 1:12:29 - loss: 2.4667 - accuracy: 0.1758

In [None]:
import numpy as np

# Macro-averaged precision, recall and F1 of the classifier on the validation split.

# Ground truth is the integer-encoded validation labels (not the attention masks).
true_labels = np.asarray(valid_labels)

# The model was built with two inputs (token ids and attention masks), so both
# must be supplied to predict().
predicted_probabilities = model.predict([X_val_input_ids, X_val_attention_masks])

# The output layer is a softmax over the classes: the predicted class is the
# most probable one. Thresholding at 0.5 is only meaningful for a single
# sigmoid output and could leave a sample with no predicted class at all.
predicted_labels = np.argmax(predicted_probabilities, axis=1)

# One-vs-rest indicator matrices, one row per sample and one column per class
class_ids = np.arange(predicted_probabilities.shape[1])
is_true_class = true_labels[:, None] == class_ids
is_predicted_class = predicted_labels[:, None] == class_ids

# Per-class true positives, false positives, false negatives
tp = np.sum(is_true_class & is_predicted_class, axis=0)
fp = np.sum(~is_true_class & is_predicted_class, axis=0)
fn = np.sum(is_true_class & ~is_predicted_class, axis=0)

# Per-class scores. When a denominator is zero the numerator is zero as well,
# so substituting 1 yields a score of 0 instead of a NaN / division warning.
precision_per_class = tp / np.maximum(tp + fp, 1)
recall_per_class = tp / np.maximum(tp + fn, 1)
score_sum = precision_per_class + recall_per_class
f1_per_class = 2 * precision_per_class * recall_per_class / np.where(score_sum > 0, score_sum, 1)

# Macro average: every class contributes equally regardless of its frequency
precision = precision_per_class.mean()
recall = recall_per_class.mean()
f1 = f1_per_class.mean()

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)