In [None]:
# ===================================================================================
# THE DEFINITIVE, SELF-CLEANING TRAINING SCRIPT FOR VS CODE (FINAL VERSION)
# This script installs a specific, stable set of libraries to guarantee compatibility.
# ===================================================================================

# --- Step 1: Force-Clean and Install Correct, Stable Libraries ---
# This is the most critical step to solve all previous errors.
import sys
import subprocess
try:
    print("--- Step 1: Creating a clean and stable environment... ---")

    # Remove previously installed copies first so the pinned versions installed
    # below do not end up mixed with other releases.
    print("Uninstalling old versions to prevent conflicts...")
    subprocess.check_call([
        sys.executable, "-m", "pip", "uninstall", "-y",
        "transformers", "accelerate", "sentence-transformers",
    ])

    # Invoke pip through the running interpreter (sys.executable -m pip) so the
    # packages are installed for the same Python that executes this code.
    print("\nInstalling correct, stable library versions...")
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "transformers[torch]==4.30.2", "datasets", "accelerate==0.21.0",
        "scikit-learn", "huggingface_hub",
    ])

    print("\nEnvironment setup complete! The script will now proceed with the correct libraries.")

except subprocess.CalledProcessError as e:
    # check_call raises CalledProcessError when pip exits with a non-zero status.
    print(f"CRITICAL ERROR during library setup: {e}")
    print("Please try restarting VS Code and running this cell again.")
    # Stop with a failure status. The bare exit() helper is meant for the
    # interactive shell and would report success (status 0) for this error.
    sys.exit(1)


# --- Step 2: Import Everything We Need ---
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import os

# Create the directory for the final model up front.
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs("./satya-checker-multilingual-final", exist_ok=True)

print("\nAll libraries imported successfully.")


# --- Step 3: Load Your Master Dataset ---
# We use a relative file path, so the file MUST be in the current working directory (normally the same folder as your notebook).
file_path = 'master_balanced_training_data.csv'
print(f"\nLoading the master dataset from: {file_path}")
try:
    # Only the read itself is guarded: it is the one call expected to raise
    # FileNotFoundError.
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"CRITICAL ERROR: The file '{file_path}' was not found.")
    print("Please run the '1_Data_Preparation.ipynb' notebook first.")
    # Stop with a failure status (bare exit() would report success).
    raise SystemExit(1)

# Drop rows that have no text or no label.
df = df.dropna(subset=['text', 'label'])
# Drop rows whose text is only whitespace. The explicit copy makes the result
# an independent frame, so the column assignment below does not trigger
# pandas' chained-assignment (SettingWithCopy) warning.
df = df[df['text'].str.strip() != ''].copy()
df['label'] = df['label'].astype(int)
# Filtering leaves gaps in the index; Dataset.from_pandas would store such a
# non-default index as an extra column, so restore a plain 0..n-1 index first.
df = df.reset_index(drop=True)
print(f"Data loaded and cleaned successfully with {len(df)} articles.")

hg_dataset = Dataset.from_pandas(df)


# --- Step 4: Tokenization ---
# Checkpoint name shared by the tokenizer here and the model loaded in Step 6.
model_name = "xlm-roberta-base"
print(f"\nLoading tokenizer for model: {model_name}")
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    Sequences are truncated to, and padded up to, a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(texts, **encoding_options)
print("Tokenizing the dataset...")
# batched=True passes tokenize_function groups of rows rather than single rows.
tokenized_dataset = hg_dataset.map(tokenize_function, batched=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_test_split = tokenized_dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']
print("Tokenization and splitting complete.")


# --- Step 5: Define Performance Metrics ---
def compute_metrics(pred):
    """Score a prediction object against its reference labels.

    Reads ``pred.label_ids`` and ``pred.predictions``; the predicted class is
    the arg-max over the last axis of the predictions. Precision, recall and F1
    use binary averaging.

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }
print("\nPerformance metrics function defined.")


# --- Step 6: The Final, Local-Safe Training Setup ---
print("\nPreparing training for local machine (CPU)...")
# Two output labels: the classification head is set up for a binary task.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    # Where intermediate checkpoints are written (at most two are kept).
    output_dir="./local-training-checkpoints",
    save_total_limit=2,
    push_to_hub=False,
    report_to="tensorboard",
    # Optimisation: batches of 1 with gradients accumulated over 8 steps,
    # i.e. 8 examples per optimizer update on each device.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=False,
    # Evaluation and checkpointing share the same 500-step cadence, and the
    # best checkpoint is reloaded once training finishes.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Stop early after 3 evaluations in a row without improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


# --- Step 7: Train the Model ---
# Announce the start and warn about the expected runtime before the long call.
print("\n--- STARTING DEEP LEARNING TRAINING ON YOUR COMPUTER ---")
print("!!! WARNING: THIS WILL BE EXTREMELY SLOW AND CAN TAKE MANY HOURS OR DAYS. !!!")
# Runs the training loop configured in Step 6; this call blocks until it ends.
trainer.train()
print("\n--- TRAINING COMPLETE ---")


# --- Step 8: Save the Final, Best Model ---
# Same directory that was created right after the imports in Step 2.
output_save_path = "./satya-checker-multilingual-final"
print(f"\nSaving the final best model and tokenizer to: '{output_save_path}'")
# Write the model and the tokenizer into the same folder.
trainer.save_model(output_save_path)
tokenizer.save_pretrained(output_save_path)
print("New Deep Learning model saved successfully!")


# --- Step 9: Final Evaluation Report Card ---
print("\n--- FINAL MODEL EVALUATION ---")
# Evaluate on the eval_dataset that was handed to the Trainer in Step 6.
final_evaluation = trainer.evaluate()
print("\n--- YOUR MODEL'S FINAL REPORT CARD ---")
# The keys are the names returned by compute_metrics with an "eval_" prefix.
print(f"  Accuracy:  {final_evaluation['eval_accuracy']:.4f}")
print(f"  F1 Score:  {final_evaluation['eval_f1']:.4f}")
print(f"  Precision: {final_evaluation['eval_precision']:.4f}")
print(f"  Recall:    {final_evaluation['eval_recall']:.4f}")
print("------------------------------------------")


--- Step 1: Creating a clean and stable environment... ---
Uninstalling old versions to prevent conflicts...

Installing correct, stable library versions...

Environment setup complete! The script will now proceed with the correct libraries.


  from .autonotebook import tqdm as notebook_tqdm
W0725 10:02:00.586000 10648 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.



All libraries imported successfully.

Loading the master dataset from: master_balanced_training_data.csv
Data loaded and cleaned successfully with 160090 articles.

Loading tokenizer for model: xlm-roberta-base




Tokenizing the dataset...


Map: 100%|██████████| 160090/160090 [01:48<00:00, 1482.08 examples/s]


Tokenization and splitting complete.

Performance metrics function defined.

Preparing training for local machine (CPU)...


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.d


--- STARTING DEEP LEARNING TRAINING ON YOUR COMPUTER ---


  0%|          | 7/16009 [06:10<228:56:56, 51.51s/it]