# In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Define a custom dataset class
class ArticleDataset(Dataset):
    """Articles read from a JSON file, tokenized on access for tag prediction.

    The JSON file must contain a list of objects with the keys ``URL``,
    ``Title``, ``Text`` and ``Tags``.  ``Tags`` is one string in which the
    individual tags are separated by ``', '``.  Every distinct tag receives an
    integer index in order of first appearance; the mapping is exposed as
    ``tag2idx``.

    NOTE: with the default ``pad_tag_index=0`` the padding value equals the
    index of the first tag seen, so a padded slot cannot be told apart from
    that tag.  The default is kept for backward compatibility; pass a value
    that is not a valid tag index (e.g. ``-100``, the default
    ``ignore_index`` of ``torch.nn.CrossEntropyLoss``) to avoid the clash.

    Args:
        json_file: Path of the JSON file to load.
        tokenizer: Object with an ``encode_plus`` method that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_tag_length: Fixed number of tag slots returned per article.
        pad_tag_index: Value used to fill unused tag slots.
    """

    # Class-level default so instances created without __init__ still pad
    # with the historical value.
    pad_tag_index = 0

    def __init__(self, json_file, tokenizer, max_tag_length=10, pad_tag_index=0):
        self.data = []
        self.tokenizer = tokenizer
        self.tag2idx = {}
        self.max_tag_length = max_tag_length
        self.pad_tag_index = pad_tag_index

        # JSON is UTF-8 by specification; do not rely on the platform's
        # default encoding when decoding the file.
        with open(json_file, 'r', encoding='utf-8') as file:
            articles = json.load(file)

        for article in articles:
            url = article['URL']
            title = article['Title']
            text = article['Text']
            tags = article['Tags'].split(', ')
            self.data.append((url, title, text, tags))
            self.update_tags(tags)

    def update_tags(self, tags):
        """Assign the next free index to every tag not yet in ``tag2idx``."""
        for tag in tags:
            if tag not in self.tag2idx:
                self.tag2idx[tag] = len(self.tag2idx)

    def __len__(self):
        """Return the number of loaded articles."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize article ``index`` and return its tensors.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer output without its leading batch dimension) and
            ``'tags'``, an int64 tensor of ``max_tag_length`` tag indices.
        """
        _url, title, text, tags = self.data[index]
        encoded = self.tokenizer.encode_plus(
            title,
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a batch dimension of size 1; drop only
        # that dimension so no other size-1 axis is removed by accident.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        tag_indices = [self.tag2idx[tag] for tag in tags]
        padded_tags = self.pad_tags(tag_indices)
        # Explicit dtype: an empty list would otherwise yield a float tensor.
        tag_tensor = torch.tensor(padded_tags, dtype=torch.long)
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'tags': tag_tensor}

    def pad_tags(self, tag_indices):
        """Truncate or right-pad ``tag_indices`` to exactly ``max_tag_length``.

        Shorter lists are filled with ``pad_tag_index``; longer lists keep
        only their first ``max_tag_length`` entries.
        """
        if len(tag_indices) >= self.max_tag_length:
            return tag_indices[:self.max_tag_length]
        missing = self.max_tag_length - len(tag_indices)
        return tag_indices + [self.pad_tag_index] * missing

# Load the labeled dataset
dataset = ArticleDataset('/kaggle/input/filtered-wccftech-dataset-of-articles/filtered_data.json', tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'))

# Determine the number of tags
num_tags = len(dataset.tag2idx)

# Build the classifier and its optimizer.  An article can carry several tags,
# so the head is configured for multi-label classification: one logit per
# tag, trained against a multi-hot float target.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_tags,
    problem_type='multi_label_classification',
)
# torch's own AdamW; 2e-5 is a conventional BERT fine-tuning learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


def build_multi_hot_targets(tags, num_labels):
    """Convert padded tag indices of shape (batch, slots) to multi-hot rows.

    The dataset pads unused slots with 0, which is also a valid tag index.
    A slot is treated as a real tag when it is the first slot (every article
    has at least one tag) or when a non-zero slot follows it or it is itself
    non-zero.  A tag with index 0 sitting in the last used slot (other than
    slot 0) is indistinguishable from padding and is therefore not marked.
    """
    # True for every slot at or before the last non-zero slot of its row.
    is_real = (tags != 0).flip(1).cumsum(dim=1).flip(1) > 0
    is_real[:, 0] = True
    targets = torch.zeros(tags.size(0), num_labels, dtype=torch.float, device=tags.device)
    row_idx, col_idx = is_real.nonzero(as_tuple=True)
    targets[row_idx, tags[row_idx, col_idx]] = 1.0
    return targets


# Define the training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

num_epochs = 10
batch_size = 8
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tags = batch['tags'].to(device)

        # One target row per article, matching the (batch, num_tags) logits.
        target_labels = build_multi_hot_targets(tags, num_tags)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=target_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Guard against an empty dataset so the report never divides by zero.
    average_loss = total_loss / max(len(dataloader), 1)
    print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}')

# Save the trained model
model.save_pretrained('trained_model')
