In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        
        # Create a mapping of unique tags to integers
        self.tag2idx = {tag: idx for idx, tag in enumerate(set([article["Tags"] for article in self.data]))}
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data[index]
        text = article["Text"]
        label = article["Tags"]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()
        label = self.tag2idx[label]
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label
        }

# Load and preprocess your dataset from JSON file
def load_dataset_from_json(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

json_file = "/kaggle/input/filtered-wccftech-dataset-of-articles/filtered_data.json"
dataset = load_dataset_from_json(json_file)
custom_dataset = CustomDataset(dataset)

# Determine the number of unique tags in your dataset
num_classes = len(set([article["Tags"] for article in dataset]))

# Split your dataset into training and validation sets
train_size = int(0.8 * len(custom_dataset))
train_dataset, val_dataset = torch.utils.data.random_split(custom_dataset, [train_size, len(custom_dataset) - train_size])

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)

# Specify device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Define training loop
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            val_loss += loss.item()

    val_loss /= len(val_loader)

    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained("./")
tokenizer.save_pretrained("./")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch 1: Train Loss = 1.7497, Val Loss = 1.7292


('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json')

In [30]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from newspaper import Article

# Step 1: Load the saved model and tokenizer
model_path = "./"  # Path where the saved model and tokenizer are stored
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Step 2: Take input URL of the article
article_url = input("Enter the URL of the article: ")

# Step 3: Fetch the article text using newspaper3k
article = Article(article_url)
article.download()
article.parse()
article_text = article.text

# Step 4: Give the article text to the saved model to output predicted tags
inputs = tokenizer.encode_plus(article_text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Ensure the model is in evaluation mode
model.eval()

# Pass the input through the model
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Get the predicted label index
predicted_label_index = torch.argmax(outputs.logits, dim=1).item()

# Step 5: Print the output
print("Predicted Tag: ", predicted_label_index)


Enter the URL of the article:  https://wccftech.com/pixel-7a-refresh-rate-reduces-90hz-to-60hz-under-direct-sunlight/?dark=1


Predicted Tag:  85
