In [6]:
!pip  install transformers==4.22.1 -q

In [7]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [8]:
# Load the pretrained BERT encoder (TensorFlow variant); it is wrapped by the
# classification model defined further down.
bert_checkpoint = "bert-base-uncased"
model = TFAutoModel.from_pretrained(bert_checkpoint)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [9]:
# Load the tokenizer published under the same checkpoint name as the encoder.
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
# Read the labelled news articles into a DataFrame and preview the first rows.
# NOTE(review): absolute path — this only works where /content/labelled.csv exists.
labelled_csv_path = "/content/labelled.csv"
df = pd.read_csv(labelled_csv_path)
df.head()

Unnamed: 0,Heading,Body,Category,URL
0,free speech not hate speech madras high court ...,madras high court issue significant remark ami...,Judiciary,https://www.indiatoday.in/law/high-courts/stor...
1,comment take context say us cop mock indian st...,seattle police officer guild friday come defen...,Crime,https://www.indiatoday.in/world/story/indian-s...
2,first meeting one nation one election committe...,first official meeting one nation one election...,Politics,https://www.indiatoday.in/india/story/one-nati...
3,us airlines flight depressurize midair plummet...,united airlines jet head rome turn around less...,Crime,https://www.indiatoday.in/world/story/us-fligh...
4,terrorist kill security force foil infiltratio...,three terrorist kill infiltration bid foil sec...,Crime,https://www.indiatoday.in/india/story/one-terr...


In [11]:
# Count how many articles carry each category label.
category_count = df["Category"].value_counts()

# The index of the counts holds the distinct category names; display them.
categories = category_count.index
categories


Index(['Entertainment', 'Business', 'Politics', 'Judiciary', 'Crime',
       'Culture', 'Sports', 'Science', 'International', 'Technology'],
      dtype='object')

In [12]:
categories = {
    "Entertainment" :   0,
"Business"  :       1,
"Politics"   :      2,
"Judiciary"   :     3,
"Crime"     :       4,
"Culture"    :      5,
"Sports"      :      6,
"Science"      :     7,
"International" :    8,
"Technology"     :   9
}

print(categories)


{'Entertainment': 0, 'Business': 1, 'Politics': 2, 'Judiciary': 3, 'Crime': 4, 'Culture': 5, 'Sports': 6, 'Science': 7, 'International': 8, 'Technology': 9}


In [13]:
def map_to_encoding(text):
    """Return the integer class id for a category name.

    Args:
        text: A value from the ``Category`` column.

    Returns:
        The id stored under ``text`` in the module-level ``categories`` mapping,
        or -1 when ``text`` is not a key of that mapping.
    """
    return categories.get(text, -1)  # -1 is the "not found" sentinel


# Create the 'category_encoding' column
df['category_encoding'] = df['Category'].apply(map_to_encoding)

# Fail fast on unmapped categories: the -1 sentinel is not a valid class index
# for the sparse categorical cross-entropy loss used for training, so letting it
# through would corrupt or crash training much later and far from the cause.
unmapped_rows = df['category_encoding'] == -1
if unmapped_rows.any():
    unknown_values = df.loc[unmapped_rows, 'Category'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a Category missing from "
        f"`categories`: {unknown_values}"
    )

In [14]:
# Pull the model inputs (article bodies) and the integer targets out of the
# DataFrame as plain Python lists.
newsArticlesBody = df["Body"].tolist()
categoryLabels = df["category_encoding"].tolist()

In [15]:
# 80/10/10 split in two steps: first hold out 20% of the articles, then cut that
# hold-out in half to obtain the validation and test sets. Both steps stratify
# on the label and use the same fixed random_state.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    newsArticlesBody,
    categoryLabels,
    test_size=0.2,
    random_state=42,
    stratify=categoryLabels,
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size=0.5,
    random_state=42,
    stratify=holdout_labels,
)

# Report how many examples landed in each split.
print("Train set size:", len(train_texts))
print("Validation set size:", len(val_texts))
print("Test set size:", len(test_texts))

# Show one training example to confirm texts and labels are still paired.
first_text, first_label = train_texts[0], train_labels[0]
print("Sample train text:", first_text)
print("Sample train label:", first_label)


Train set size: 9266
Validation set size: 1158
Test set size: 1159
Sample train text: rashmika mandanna allu arjun highly anticipate film pushpa rule leave fan eager excite film release maker leave part cliffhanger people high expectation sequel among cast member malayalam superstar fahadh faasil cameo appearance ips bhanwar singh shekhawat receive major attention intensity menacing performance although make brief appearance prequel fahadh play significant role alongside allu arjun upcoming film meanwhile late report state actor charge staggering rs crore role villain sequel leave fan astonishedfahadh faasil substantial fee role make one highestpaid villain south indian film industry accord report deccan chronicle consider film grand scale budget producer willing meet demand fahadh faasil fee rs crore role pushpa portion demand pushpa despite low compensation menacing performance iconic dialogue make last impacta source close production say talented actor kind role effortlessly ask ess

In [16]:
# Tokenize each split in its own call, using identical truncation/padding settings.
tokenize_options = {"truncation": True, "padding": True}

train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)


In [17]:
class BERTForClassification(tf.keras.Model):
    """Keras model that stacks a softmax classification layer on a BERT encoder.

    Args:
        bert_model: The encoder to wrap. It is called on the inputs and element
            ``[1]`` of its output is fed to the dense layer.
        num_classes: Number of units in the softmax output layer.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(units=num_classes, activation='softmax')

    def call(self, inputs):
        """Run the encoder on ``inputs`` and return the softmax layer's output."""
        encoder_outputs = self.bert(inputs)
        # Element 1 of the encoder output is used as the per-example feature
        # vector (presumably BERT's pooled output — TODO confirm).
        features = encoder_outputs[1]
        return self.fc(features)

In [22]:
# Wrap the pretrained encoder with a 10-unit softmax head.
classifier = BERTForClassification(bert_model=model, num_classes=10)

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy()

classifier.compile(
    optimizer=adam_optimizer,
    loss=sparse_ce_loss,
    metrics=['accuracy'],
)

In [27]:
# Training hyperparameters: number of epochs, and examples per batch.
num_epochs, batch_size = 3, 8

In [28]:
# Convert the encodings and labels to TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings), train_labels))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings), val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings), test_labels))

# Shuffle and batch the datasets
train_dataset = train_dataset.shuffle(len(train_texts)).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.shuffle(len(test_texts)).batch(batch_size)
# Define the number of epochs and batch size

# Train the mod



In [29]:
# Display the batched training dataset to inspect its feature and label specs.
train_dataset

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [30]:
# Fine-tune the classifier, validating after every epoch.
# The datasets are already batched via `.batch(batch_size)`, so `batch_size` is
# deliberately NOT passed here: Keras documents that it must not be specified
# for dataset inputs, because the dataset itself yields the batches.
# The returned History object is kept so the per-epoch metrics can be inspected.
history = classifier.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7e61940fa710>

In [31]:
# Score the fine-tuned model on the held-out test set; the result is the loss
# followed by the accuracy metric configured at compile time.
classifier.evaluate(x=test_dataset)



[0.6796050667762756, 0.7989646196365356]

In [33]:
# Export the fine-tuned model in the "tf" save format to a directory named
# FineTunedBERT, relative to the current working directory.
export_dir = "FineTunedBERT"
classifier.save(export_dir, save_format="tf")




In [35]:
# Bundle the exported model directory into a single zip archive.
# NOTE(review): the model was saved to the relative path "FineTunedBERT", so this
# absolute path assumes the working directory is /content — confirm.
!zip -r /content/FineTuned.zip /content/FineTunedBERT


  adding: content/FineTunedBERT/ (stored 0%)
  adding: content/FineTunedBERT/keras_metadata.pb (deflated 96%)
  adding: content/FineTunedBERT/fingerprint.pb (stored 0%)
  adding: content/FineTunedBERT/assets/ (stored 0%)
  adding: content/FineTunedBERT/saved_model.pb (deflated 92%)
  adding: content/FineTunedBERT/variables/ (stored 0%)
  adding: content/FineTunedBERT/variables/variables.index (deflated 79%)
  adding: content/FineTunedBERT/variables/variables.data-00000-of-00001 (deflated 12%)


In [37]:
import pickle

# Additionally serialize the in-memory classifier object to 'Fine.pkl' with pickle.
# NOTE(review): the model is already exported with classifier.save; confirm this
# pickle is needed and that it can actually be loaded back.
with open('Fine.pkl', mode='wb') as pickle_out:
    pickle.dump(classifier, pickle_out)


