In [None]:
!pip install transformers



In [37]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import wandb


In [35]:
# Open a Weights & Biases run; the hyperparameters registered here are read
# back through `config` by the rest of the notebook.
wandb.init(
    project="news-subject-classification",
    name="bert-base-title-text",
    config=dict(
        model="bert-base-uncased",
        batch_size=8,
        epochs=3,
        lr=2e-5,
        max_len=512,
    ),
)
config = wandb.config

# Read the articles and prepend every title to its body so both are fed to the
# tokenizer as one string.
df = pd.read_csv("/content/True.csv")
df['text'] = df['title'] + " " + df['text']

# Turn the `subject` strings into integer class ids.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['subject'])

# Hold out 20% of the rows for evaluation (fixed random_state for a stable split).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].to_list(),
    df['label'].to_list(),
    test_size=0.2,
    random_state=42,
)

# Tokenizer that matches the checkpoint named in the run config.
tokenizer = AutoTokenizer.from_pretrained(config.model)


In [None]:
# Preview the first rows of the `text` column to sanity-check the loaded data.
df['text'].head()

Unnamed: 0,text
0,"As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...
3,FBI Russia probe helped by Australian diplomat...
4,Trump wants Postal Service to charge 'much mor...


In [None]:
# Count the rows per `subject` value, i.e. per class the labels are encoded from.
df['subject'].value_counts()

Unnamed: 0_level_0,count
subject,Unnamed: 1_level_1
politicsNews,11272
worldnews,10145


In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Sequence of input strings, indexable by integer position.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``.
            It is expected to return a mapping holding ``'input_ids'`` and
            ``'attention_mask'`` tensors with a leading batch axis of size 1.
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors together with its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (tokenizer output
            with the batch axis removed) and ``'labels'`` (0-d ``torch.long``
            tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # Remove only the batch axis (dim 0). A bare ``squeeze()`` drops every
            # size-1 axis and would also collapse the sequence axis when
            # max_len == 1, breaking batch collation.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split in a dataset that tokenizes to `config.max_len` on access.
train_dataset = NewsDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=config.max_len,
)
test_dataset = NewsDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_len=config.max_len,
)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=config.batch_size, shuffle=False)

In [None]:
class NewsClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The encoder checkpoint is taken from the notebook-level ``config.model``.

    Args:
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(config.model)
        hidden_dim = self.bert.config.hidden_size
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.drop(encoded.pooler_output)
        logits = self.fc(regularized)
        return logits

In [29]:
# Seed PyTorch so the classifier-head initialisation, dropout masks and batch
# shuffling are as repeatable as the backend allows; 42 mirrors the
# random_state used for the train/test split.
torch.manual_seed(42)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(label_encoder.classes_)
model = NewsClassifier(num_classes).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
criterion = nn.CrossEntropyLoss()

for epoch in range(config.epochs):
    print(f"\n🟢 Début Epoch {epoch + 1}")
    model.train()
    # Per-epoch accumulators: summed batch loss plus every prediction / label
    # seen, used for the epoch-level accuracy below.
    total_loss = 0
    all_preds = []
    all_labels = []

    for step, batch in enumerate(train_loader):
        if step % 10 == 0:
            print(f"Step {step}/{len(train_loader)}")
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    # Report the same quantity (mean loss per batch) to W&B and to stdout;
    # previously the console showed the summed loss under the same name.
    avg_loss = total_loss / len(train_loader)
    wandb.log({"train_loss": avg_loss, "train_accuracy": acc})
    print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Accuracy={acc:.4f}")






🟢 Début Epoch 1
Step 0/2142
Step 10/2142
Step 20/2142
Step 30/2142
Step 40/2142
Step 50/2142
Step 60/2142
Step 70/2142
Step 80/2142
Step 90/2142
Step 100/2142
Step 110/2142
Step 120/2142
Step 130/2142
Step 140/2142
Step 150/2142
Step 160/2142
Step 170/2142
Step 180/2142
Step 190/2142
Step 200/2142
Step 210/2142
Step 220/2142
Step 230/2142
Step 240/2142
Step 250/2142
Step 260/2142
Step 270/2142
Step 280/2142
Step 290/2142
Step 300/2142
Step 310/2142
Step 320/2142
Step 330/2142
Step 340/2142
Step 350/2142
Step 360/2142
Step 370/2142
Step 380/2142
Step 390/2142
Step 400/2142
Step 410/2142
Step 420/2142
Step 430/2142
Step 440/2142
Step 450/2142
Step 460/2142
Step 470/2142
Step 480/2142
Step 490/2142
Step 500/2142
Step 510/2142
Step 520/2142
Step 530/2142
Step 540/2142
Step 550/2142
Step 560/2142
Step 570/2142
Step 580/2142
Step 590/2142
Step 600/2142
Step 610/2142
Step 620/2142
Step 630/2142
Step 640/2142
Step 650/2142
Step 660/2142
Step 670/2142
Step 680/2142
Step 690/2142
Step 700/2142


In [32]:
# Put the model in evaluation mode and start from empty buffers so that only
# test-set predictions and labels are collected.
model.eval()
all_preds, all_labels = [], []

# No gradients are needed while scoring the held-out batches.
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        # The predicted class is the index of the largest logit in each row.
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

In [36]:
# Structured per-class metrics (dict form) for logging to W&B.
report = classification_report(
    all_labels,
    all_preds,
    target_names=label_encoder.classes_,
    output_dict=True,
)
eval_accuracy = accuracy_score(all_labels, all_preds)
wandb.log(dict(eval_accuracy=eval_accuracy))
wandb.log(dict(classification_report=report))

# Human-readable version of the same report for the notebook output.
print("\n📊 Rapport de classification :")
report_text = classification_report(
    all_labels,
    all_preds,
    target_names=label_encoder.classes_,
)
print(report_text)

# Close the W&B run now that all metrics have been logged.
wandb.finish()


📊 Rapport de classification :
              precision    recall  f1-score   support

politicsNews       1.00      0.99      1.00      2256
   worldnews       0.99      1.00      1.00      2028

    accuracy                           1.00      4284
   macro avg       1.00      1.00      1.00      4284
weighted avg       1.00      1.00      1.00      4284



0,1
eval_accuracy,▁

0,1
eval_accuracy,0.99627


In [None]:
import os

# Make sure the target directory exists, then write the model weights
# (state_dict only) to <output_dir>/model.pt.
output_dir = "saved_model_bert"
os.makedirs(output_dir, exist_ok=True)

checkpoint_path = os.path.join(output_dir, "model.pt")
torch.save(model.state_dict(), checkpoint_path)


