# In [None]:
import pandas as pd
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

# Load the JSON file
# Decode as UTF-8 explicitly instead of relying on the platform's default
# encoding, so non-ASCII article text is read identically everywhere.
with open('/kaggle/input/filtered-wccftech-dataset-of-articles/filtered_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract the data from the JSON file
# Every record is indexed by 'URL', 'Title', 'Text' and 'Tags'; a record
# lacking one of these keys raises KeyError.
urls = [article['URL'] for article in data]
titles = [article['Title'] for article in data]
texts = [article['Text'] for article in data]
tags = [article['Tags'] for article in data]

# Create a DataFrame from the extracted data
df = pd.DataFrame({'URL': urls, 'Title': titles, 'Text': texts, 'Tags': tags})

# Split the data into training and testing sets
# 80% train / 20% test; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Tokenize the text data
def tokenize_text(text):
    """Encode one text, or a collection of texts, as fixed-length BERT inputs.

    Args:
        text: Either a single string, or an iterable of strings (list,
            tuple, pandas Series, ...). An iterable is treated as a batch
            of separate texts, one output row per element.

    Returns:
        The tokenizer's encoding with 'input_ids' and 'attention_mask'
        TensorFlow tensors. Every row is truncated or padded to exactly
        512 tokens; a single string produces one row.
    """
    # `encode_plus` handles only one sequence and rejects a pandas Series,
    # so batches go through the tokenizer's `__call__`, which accepts both a
    # single string and a list of strings.
    batch = text if isinstance(text, str) else list(text)
    tokens = tokenizer(
        batch,
        max_length=512,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf'
    )
    return tokens

def _encode_texts(texts):
    """Tokenize every text and stack the rows into two aligned tensors.

    Each text is handed to ``tokenize_text`` as a single string, which
    returns one padded row per call; the rows are concatenated along axis 0
    so that row i of both tensors belongs to the i-th text.
    """
    rows = [tokenize_text(text) for text in texts]
    return {
        'input_ids': tf.concat([row['input_ids'] for row in rows], axis=0),
        'attention_mask': tf.concat([row['attention_mask'] for row in rows], axis=0),
    }


train_tokens = _encode_texts(train_df['Text'])
test_tokens = _encode_texts(test_df['Text'])

# Create input arrays
# Tuples, not lists: tf.data converts a Python list into ONE stacked tensor
# and would then slice across the two inputs instead of across samples.
train_input = (train_tokens['input_ids'], train_tokens['attention_mask'])
test_input = (test_tokens['input_ids'], test_tokens['attention_mask'])

# Encode the tags
# Assumes 'Tags' holds a ', '-separated string per article.
tags = df['Tags'].str.split(', ')
unique_tags = set(tag for sublist in tags for tag in sublist)
# Number the tags in sorted order: iteration order of a set of strings can
# change between interpreter runs, which would reshuffle the label columns.
tag2idx = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}


def _multi_hot(tag_lists):
    """Return a float32 (n_samples, n_tags) array with 1.0 for each tag present."""
    encoded = np.zeros((len(tag_lists), len(tag2idx)), dtype=np.float32)
    for row, sample_tags in enumerate(tag_lists):
        for tag in sample_tags:
            encoded[row, tag2idx[tag]] = 1.0
    return encoded


# One fixed-width vector per article (an article can carry several tags), so
# all labels stack into a single rectangular array.
train_tags = _multi_hot(train_df['Tags'].str.split(', '))
test_tags = _multi_hot(test_df['Tags'].str.split(', '))

# Convert the input arrays to TensorFlow datasets
# Each element is ((input_ids, attention_mask), multi_hot_tags).
train_dataset = tf.data.Dataset.from_tensor_slices((train_input, train_tags))
test_dataset = tf.data.Dataset.from_tensor_slices((test_input, test_tags))

# Define the BERT model
# TFBertModel is the bare encoder without a classification head, so a Dense
# layer with one unit per tag is added on top of the pooled [CLS] output.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
input_ids = tf.keras.Input(shape=(512,), dtype=tf.int32)
attention_mask = tf.keras.Input(shape=(512,), dtype=tf.int32)
# Index 1 of the encoder output is the pooled sentence representation
# (index 0 would be the per-token hidden states).
pooled_output = bert_model(input_ids, attention_mask=attention_mask)[1]
# Independent sigmoid per tag: the task is multi-label, not single-class.
output = tf.keras.layers.Dense(len(unique_tags), activation='sigmoid')(pooled_output)
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile and train the model
# Binary cross-entropy scores every tag as its own yes/no decision.
# NOTE: the reported accuracy is per-tag binary accuracy; with many rare tags
# it is dominated by correctly predicted absences.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

model.fit(train_dataset.batch(16), epochs=5)

# Evaluate the model
loss, accuracy = model.evaluate(test_dataset.batch(16))
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")
