In [10]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load dataset
df = pd.read_csv("multilingual_dataset_test_final.csv")

# Remove any NaN values (if present)
df = df.dropna(subset=["text", "labels"])

# Strip extra spaces from text
df["text"] = df["text"].str.strip()

# Convert labels to integers
df["labels"] = df["labels"].astype(int)

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Load a smaller, faster model
model_name = "prajjwal1/bert-tiny"  # Tiny BERT (~4x faster than distilbert)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function (Reduced max_length to 32 for ultra-fast training)
def tokenize_function(example):
    return tokenizer(
        example["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=32  # Less computation
    )

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Ensure labels are correctly formatted
def preprocess_labels(example):
    example["labels"] = int(example["labels"])  # Convert to integer
    return example

# Apply label preprocessing
train_dataset = train_dataset.map(preprocess_labels)
test_dataset = test_dataset.map(preprocess_labels)

# Set dataset format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Ultra-fast training settings
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,  
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=32,  
    num_train_epochs=1,  
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    optim="adamw_torch",
    gradient_accumulation_steps=2,  
    bf16=False  # ❌ Removed to avoid compatibility issues
)

# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Convert logits to class predictions
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\KIIT/.cache\huggingface\hub\models--prajjwal1--bert-tiny\snapshots\6f75de8b60a9f8a2fdf7b69cbd86d9e64bcb3837\config.json
Model config BertConfig {
  "_name_or_path": "prajjwal1/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\KIIT/.cache\huggingface\hub\models--prajjwal1--bert-tiny\snapshots\6f75de8b60a9f8a2fdf7b69cbd86d9e64bc

Map:   0%|          | 0/18367 [00:00<?, ? examples/s]

Map:   0%|          | 0/4592 [00:00<?, ? examples/s]

Map:   0%|          | 0/18367 [00:00<?, ? examples/s]

Map:   0%|          | 0/4592 [00:00<?, ? examples/s]

loading configuration file config.json from cache at C:\Users\KIIT/.cache\huggingface\hub\models--prajjwal1--bert-tiny\snapshots\6f75de8b60a9f8a2fdf7b69cbd86d9e64bcb3837\config.json
Model config BertConfig {
  "_name_or_path": "prajjwal1/bert-tiny",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at C:\Users\KIIT/.cache\huggingface\hub\models--prajjwal1--bert-tiny\snapshots\6f75de8b60a9f8a2fdf7b69cbd86d9e64bcb3837\pytorch_model.bin
Some weights of the model checkpoint at prajjwal1/ber

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4748,0.435469,0.785279,0.800486,0.754962,0.851852


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, Unnamed: 0. If text, Unnamed: 0 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4592
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=287, training_loss=0.5263095898910682, metrics={'train_runtime': 1687.7732, 'train_samples_per_second': 10.882, 'train_steps_per_second': 0.17, 'total_flos': 1458439716480.0, 'train_loss': 0.5263095898910682, 'epoch': 1.0})

In [11]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


Configuration saved in ./fine_tuned_model\config.json
Model weights saved in ./fine_tuned_model\pytorch_model.bin
tokenizer config file saved in ./fine_tuned_model\tokenizer_config.json
Special tokens file saved in ./fine_tuned_model\special_tokens_map.json


('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.txt',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load fine-tuned model and tokenizer
model_path = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Prediction function
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # Fixed typo (was "logists")
        predicted_class = torch.argmax(logits, dim=1).item()  # Fixed dims -> dim

    labels = {0: "real", 1: "fake"}
    return labels[predicted_class]

# Example text
example_texts = [
    "Breaking news: Scientist discovers cure for cancer",  # Fake (English)
    "NASA confirms water on Mars, opening doors for future colonization.",  # Real (English)
    "El gobierno ha anunciado nuevas medidas económicas para combatir la inflación.",  # Real (Spanish)
    "La NASA ha encontrado vida en Marte, los extraterrestres existen!",  # Fake (Spanish)
    "政府宣布新疫苗可100%防止所有疾病",  # Fake (Chinese: "The government announced a new vaccine that 100% prevents all diseases")
    "法国研究人员成功开发了新的癌症治疗方法",  # Real (Chinese: "French researchers successfully developed a new cancer treatment")
    "L'IA peut désormais prédire l'avenir avec une précision de 99%.",  # Fake (French: "AI can now predict the future with 99% accuracy.")
    "Le président a signé un accord de paix historique avec le pays voisin.",  # Real (French: "The president signed a historic peace agreement with the neighboring country.")
    "पृथ्वी के अंदर एक गुप्त सभ्यता पाई गई।",  # Fake (Hindi: "A secret civilization was found inside the Earth.")
    "भारत ने चंद्रमा पर अपना दूसरा मिशन सफलतापूर्वक पूरा किया।"  # Real (Hindi: "India successfully completed its second mission to the Moon.")
]     
for text in example_texts:
    result = predict(text)
    print(f"Text: {text}\nPrediction: {result}\n")


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./fine_tuned_model\config.json
Model config BertConfig {
  "_name_or_path": "./fine_tuned_model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./fine_tuned_model\pytorch_model.bin
All model check

Text: Breaking news: Scientist discovers cure for cancer
Prediction: fake

Text: NASA confirms water on Mars, opening doors for future colonization.
Prediction: fake

Text: El gobierno ha anunciado nuevas medidas económicas para combatir la inflación.
Prediction: real

Text: La NASA ha encontrado vida en Marte, los extraterrestres existen!
Prediction: fake

Text: 政府宣布新疫苗可100%防止所有疾病
Prediction: fake

Text: 法国研究人员成功开发了新的癌症治疗方法
Prediction: real

Text: L'IA peut désormais prédire l'avenir avec une précision de 99%.
Prediction: real

Text: Le président a signé un accord de paix historique avec le pays voisin.
Prediction: fake

Text: पृथ्वी के अंदर एक गुप्त सभ्यता पाई गई।
Prediction: fake

Text: भारत ने चंद्रमा पर अपना दूसरा मिशन सफलतापूर्वक पूरा किया।
Prediction: fake

