In [None]:
import spacy
import re
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Lowering down the text
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Tokenization and lemmatization using spaCy
    doc = nlp(text)
    words = [token.lemma_ for token in doc]

    # Additional stopwords including 'mkr'
    custom_stopwords = set(['rt', '#mkr', "i'm", 'mkr'] + list(nlp.Defaults.stop_words))

    # Removing stopwords
    words = [word for word in words if word not in custom_stopwords]

    # Removing words starting with '@'
    words = [word for word in words if not word.startswith('@')]

    # Removing punctuation and special characters
    words = [word for word in words if word.isalnum()]

    # Join the words back into a string
    processed_text = ' '.join(words)

    return processed_text


In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

combined_df = pd.read_csv("/content/combined_tweet_data_and_preprocessed.csv")
# Assume 'preprocessed_text' for text and 'oh_label' for one-hot encoded labels
texts = combined_df['Preprocessed_data'].astype(str).tolist()
labels = pd.get_dummies(combined_df['oh_label']).values  # Convert categorical labels to one-hot

# Split the data into train, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.25, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="tf", max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="tf", max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="tf", max_length=128)

# Preparing the dataset
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))

batch_size = 32
train_dataset = train_dataset.shuffle(len(train_texts)).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

# Load the Pre-trained BERT Model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=labels.shape[1])

# Fine-Tuning
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Early Stopping Callback
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=20)

# Training the model
model.fit(train_dataset, epochs=100, validation_data=val_dataset, callbacks=[early_stopping_callback])

# Evaluation
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


In [None]:
import numpy as np
from sklearn.metrics import f1_score

# Make predictions on the test dataset
test_predictions = model.predict(test_dataset)

# Convert softmax predictions to class labels
predicted_labels = np.argmax(test_predictions.logits, axis=1)

# Convert one-hot encoded true labels to class labels
true_labels = np.argmax(test_labels, axis=1)  # Assuming test_labels is one-hot encoded

# Calculate F1 score
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"F1 Score on Test Data: {f1}")

F1 Score on Test Data: 0.9289504890580017


In [None]:
from sklearn.metrics import classification_report

# Make predictions on the test dataset
test_predictions = model.predict(test_dataset)

# Convert softmax predictions to class labels
predicted_labels = np.argmax(test_predictions.logits, axis=1)

# Convert one-hot encoded true labels to class labels
true_labels = np.argmax(test_labels, axis=1)  # Assuming test_labels is one-hot encoded

# Generate a classification report
report = classification_report(true_labels, predicted_labels)  # Replace class_names with your actual class names

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      5540
           1       0.86      0.81      0.83      1536

    accuracy                           0.93      7076
   macro avg       0.90      0.89      0.89      7076
weighted avg       0.93      0.93      0.93      7076



In [None]:
model.save_pretrained("bert_model_cyber.h5")

In [None]:
model.save_pretrained('/content/bert_model_cyber/')

In [None]:
from transformers import TFBertForSequenceClassification, BertTokenizer

model_path = '/content/bert_model_cyber/'
loaded_model = TFBertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Adjust the tokenizer name as needed

sentence = "RT @Newmanzaa: There's something wrong when a girl wins Wayne Rooney street striker #NotSexist"
input_ids = tokenizer.encode(sentence, truncation=True, padding=True, return_tensors="tf")

predictions = loaded_model.predict(input_ids)
# Example: Get the predicted class label
predicted_class_index = tf.argmax(predictions.logits, axis=1).numpy()[0]


Some layers from the model checkpoint at /content/bert_model_cyber/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/bert_model_cyber/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.




In [None]:
print(predicted_class_index)

1


In [None]:
!pip install -U huggingface-cli


Collecting huggingface-cli
  Downloading huggingface_cli-0.1-py3-none-any.whl (1.0 kB)
Installing collected packages: huggingface-cli
Successfully installed huggingface-cli-0.1


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) N
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
%cd /content/bert_model_cyber

/content/bert_model_cyber


In [None]:
!git config --global user.email "kumarrohitkspk25@gmail.com"
!git config --global user.name "k-Rohit"


In [None]:
!git init
!git add .
!git commit -m "Initial commit: Add model files"


Reinitialized existing Git repository in /content/bert_model_cyber/.git/
[master (root-commit) f40c808] Initial commit: Add model files
 2 files changed, 25 insertions(+)
 create mode 100644 config.json
 create mode 100644 tf_model.h5


In [None]:
!git remote add origin https://huggingface.com/Rohit1234/CYBER-BERT

error: remote origin already exists.


In [None]:
!git remote set-url origin git@huggingface.co:Rohit1234/CYBER-BERT

In [None]:
!git push -u origin master


ssh: connect to host huggingface.co port 22: Cannot assign requested address
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [1]:
from transformers import TFBertForSequenceClassification, BertTokenizer

# Replace 'your-username/your-repo-name' with your actual Hugging Face username and repository name
model_name = 'Rohit1234/CYBER-BERT'

# Load the model
loaded_model = TFBertForSequenceClassification.from_pretrained(model_name)

print(loaded_model)
# Now you can use the loaded_model and tokenizer for predictions


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/438M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at Rohit1234/CYBER-BERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


<transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x7d123bd46710>
