In [None]:
# 🚀 Install / Upgrade dependencies
!pip install -U transformers datasets evaluate accelerate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m110.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the full labelled dataset
df = pd.read_csv("/content/anti_india_keywords_updated_v2.csv")   # replace with your file name

# Shuffle and hold out 20% of the rows as the test split (seeded with 42)
train_df, test_df = train_test_split(df, shuffle=True, random_state=42, test_size=0.2)

# Write each split to its own Excel workbook, without the index column
for split_df, split_path in ((train_df, "train.xlsx"), (test_df, "test.xlsx")):
    split_df.to_excel(split_path, index=False)

print("✅ Files created: train.xlsx and test.xlsx")


✅ Files created: train.xlsx and test.xlsx


In [None]:
# ✅ Imports
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# ======================
# 1. Load Your Dataset
# ======================
# Label columns (one column per label); their order fixes the order of the
# label vector built during tokenization.
label_cols = ["propaganda", "toxic", "neutral"]
num_labels = len(label_cols)

df_loaded = pd.read_excel("/content/train.xlsx")

# ✅ Clean labels: missing values become 0, then every label is cast to int
df_loaded[label_cols] = df_loaded[label_cols].fillna(0).astype(int)

# Convert pandas → Hugging Face Dataset
dataset = Dataset.from_pandas(df_loaded)

# ======================
# 2. Choose Model (XLM-R Base)
# ======================
model_ckpt = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenization
def encode_batch(batch):
    """Tokenize a batch of rows and attach one label vector per row.

    Args:
        batch: Mapping of column name -> list of values, as supplied by
            `dataset.map(..., batched=True)`. It must contain "text" and
            every column named in `label_cols`.

    Returns:
        The tokenizer output (truncated / padded to 128 tokens) with an extra
        "labels" entry: one list of floats per row, ordered like `label_cols`.
    """
    encodings = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    # Floats, not ints: the model below is created with
    # problem_type="multi_label_classification", whose loss expects
    # floating-point targets. The input batch itself is left unmodified.
    encodings["labels"] = [
        [float(value) for value in row]
        for row in zip(*(batch[col] for col in label_cols))
    ]
    return encodings

# NOTE: section 5 below redefines `encode_batch` and re-encodes `dataset`;
# that later result is the one handed to the Trainer.
encoded_dataset = dataset.map(encode_batch, batched=True)

# ======================
# 3. Create Model
# ======================
# Pretrained encoder plus a classification head with one output per label column.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    problem_type="multi_label_classification",
    num_labels=num_labels,
)

# ======================
# 4. Metrics
# ======================
# Load the "multilabel" configuration of each metric: it accepts one list of
# 0/1 indicators per sample, which is what compute_metrics passes. The default
# configuration expects a single integer class id per sample instead.
metric_f1 = evaluate.load("f1", "multilabel")
metric_precision = evaluate.load("precision", "multilabel")
metric_recall = evaluate.load("recall", "multilabel")

def compute_metrics(eval_pred):
    """Convert raw evaluation outputs into micro/macro scores.

    Args:
        eval_pred: A `(logits, labels)` pair. `logits` is anything
            `torch.tensor` accepts; `labels` must provide `.astype`.

    Returns:
        dict with the keys "f1_micro", "f1_macro", "precision_micro" and
        "recall_micro", each taken from the corresponding metric's result.
    """
    raw_logits, gold = eval_pred
    # Each logit is squashed independently; a label counts as predicted when
    # its probability is strictly above 0.5.
    probabilities = torch.sigmoid(torch.tensor(raw_logits))
    predictions = (probabilities > 0.5).int().numpy()
    gold = gold.astype(int)

    def _score(metric, key, average):
        # One metric call with the shared predictions/references.
        return metric.compute(predictions=predictions, references=gold, average=average)[key]

    return {
        "f1_micro": _score(metric_f1, "f1", "micro"),
        "f1_macro": _score(metric_f1, "f1", "macro"),
        "precision_micro": _score(metric_precision, "precision", "micro"),
        "recall_micro": _score(metric_recall, "recall", "micro"),
    }

# ======================
# 5. Training Setup
# ======================
from transformers import TrainingArguments

def encode_batch(batch):
    """Tokenize a batch of rows and attach a float label vector to each row.

    Args:
        batch: Mapping of column name -> list of values containing "text" and
            every column listed in `label_cols`.

    Returns:
        The tokenizer output (truncated / padded to 128 tokens) with a
        "labels" entry holding one list of floats per row, in `label_cols`
        order. The same list is also stored on `batch["labels"]`.
    """
    tokenized = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    row_count = len(batch["text"])
    label_rows = []
    for row_idx in range(row_count):
        label_rows.append([float(batch[name][row_idx]) for name in label_cols])
    batch["labels"] = label_rows
    tokenized["labels"] = batch["labels"]
    return tokenized

# Encode every row, then expose only the model inputs and labels as torch tensors
encoded_dataset = dataset.map(encode_batch, batched=True)
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Run modes
    do_train=True,
    do_eval=True,
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Step cadence for logging and checkpointing; at most 2 checkpoints are kept
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    # NOTE(review): no evaluation strategy is set here, so `eval_steps`
    # presumably has no effect (the training log only reports training loss)
    # — confirm against the TrainingArguments documentation.
    eval_steps=500,
)



# ======================
# 6. Trainer
# ======================
# NOTE(review): `eval_dataset` is the same data as `train_dataset`, so any
# metric computed on it measures fit to the training set, not generalisation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` argument in the
    # current transformers releases installed by the first cell.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# ======================
# 7. Train
# ======================
trainer.train()

# ======================
# 8. Save Model
# ======================
# Model and tokenizer are written to the same directory, which the evaluation
# cell later loads both from.
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact("./anti_india_model")

print("✅ Model training complete and saved!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/1431 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/1431 [00:00<?, ? examples/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mramya-kjwala21[0m ([33mramya-kjwala21-national-institute-of-electronics-and-inf[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.6331
100,0.6044
150,0.5283
200,0.3113
250,0.3566
300,0.2195
350,0.1931
400,0.1569
450,0.1519
500,0.1228


✅ Model training complete and saved!


In [None]:
import torch
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 🔹 Label names (same order as training)
label_cols = ["propaganda", "toxic", "neutral"]

# 🔹 Load the saved model and its tokenizer from the training output directory
model_path = "./anti_india_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 🔹 Prediction function
def predict(texts, threshold=0.5, batch_size=32):
    """Score texts with the loaded model and threshold the label probabilities.

    Args:
        texts: A single string or a sequence of strings to classify.
        threshold: A label is predicted (1) when its sigmoid probability is
            greater than or equal to this value.
        batch_size: Number of texts tokenized and run through the model per
            forward pass, so peak memory does not grow with the input size.

    Returns:
        (probs, preds): two NumPy arrays with one row per input text — the
        sigmoid probabilities and their 0/1 integer thresholding. For empty
        input both arrays have shape (0, model.config.num_labels).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    texts = [texts] if isinstance(texts, str) else list(texts)

    # Run the inputs on whatever device the model currently lives on
    device = model.device
    prob_chunks = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        prob_chunks.append(torch.sigmoid(logits).cpu())

    if prob_chunks:
        probs = torch.cat(prob_chunks).numpy()
    else:
        # Nothing to score: return correctly shaped empty arrays
        probs = torch.empty((0, model.config.num_labels)).numpy()
    preds = (probs >= threshold).astype(int)
    return probs, preds

# 🔹 Load dataset from Excel
df = pd.read_excel("/content/test.xlsx")   # <-- your .xlsx file
texts = df["text"].tolist()
# Clean the labels the same way the training cell does (missing -> 0, cast to
# int); a NaN label would otherwise make the metric functions fail.
y_true = df[label_cols].fillna(0).astype(int).values    # ground truth labels

# 🔹 Run predictions
all_probs, all_preds = predict(texts)

# 🔹 Calculate metrics
# With 2-D indicator arrays, accuracy_score counts a row as correct only when
# every label in that row matches.
print("Accuracy:", accuracy_score(y_true, all_preds))
print("\nClassification Report:\n")
print(classification_report(y_true, all_preds, target_names=label_cols, zero_division=0))

# 🔹 (Optional) Save results back to Excel
# One 0/1 prediction column and one probability column are added per label.
df_pred = df.copy()
for i, col in enumerate(label_cols):
    df_pred[f"pred_{col}"] = all_preds[:, i]
    df_pred[f"prob_{col}"] = all_probs[:, i]

df_pred.to_excel("results_with_predictions.xlsx", index=False)
print("✅ Predictions saved to results_with_predictions.xlsx")

Accuracy: 0.9720670391061452

Classification Report:

              precision    recall  f1-score   support

  propaganda       0.99      0.98      0.99       141
       toxic       0.94      1.00      0.97        93
     neutral       0.99      1.00      0.99       217

   micro avg       0.98      0.99      0.98       451
   macro avg       0.97      0.99      0.98       451
weighted avg       0.98      0.99      0.98       451
 samples avg       0.98      0.99      0.98       451

✅ Predictions saved to results_with_predictions.xlsx


In [None]:
import requests
import os

def download_file(url, filename=None, timeout=30):
    """
    Download a file from a URL

    Args:
        url (str): The URL of the file to download
        filename (str, optional): The name to save the file as.
                                 If None, the last path segment of the URL is
                                 used (query string and fragment are ignored);
                                 "downloaded_file" is used when the URL path
                                 has no final segment.
        timeout (float, optional): Seconds to wait for the server before
                                 giving up, so a stalled connection cannot
                                 hang the notebook.

    Returns:
        str | None: The path the file was written to, or None when the
        request failed (the error is printed).
    """
    # Local import keeps this cell self-contained
    from urllib.parse import urlparse

    try:
        # The context manager releases the streamed connection even on errors
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            # Get filename from the URL path if not provided
            if filename is None:
                filename = os.path.basename(urlparse(url).path) or "downloaded_file"

            # Write the file in chunks to handle large files
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive chunks
                        file.write(chunk)

        print(f"File downloaded successfully: {filename}")
        return filename

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None

# Example usage
# NOTE(review): this is a local filesystem path, not an http(s) URL, so
# download_file reports an invalid-URL error for it (see this cell's output).
url = "/content/anti_india_model/model.safetensors"
download_file(url)

Error downloading file: Invalid URL '/content/anti_india_model/model.safetensors': No scheme supplied. Perhaps you meant https:///content/anti_india_model/model.safetensors?


In [None]:
from google.colab import files

def download_file_colab(file_path):
    """
    Hand a file from the Colab runtime to the browser via files.download().

    Any exception raised by the download call is caught and printed rather
    than propagated; nothing is returned.
    """
    try:
        files.download(file_path)
    except Exception as err:
        print(f"Error downloading file: {err}")
    else:
        print(f"Download initiated for: {file_path}")

# Example usage
# NOTE(review): no visible cell creates anti_india_model2.zip — presumably the
# saved model directory was zipped by hand; confirm before re-running.
download_file_colab('/content/anti_india_model2.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download initiated for: /content/anti_india_model2.zip


In [None]:
from google.colab import drive
import shutil
import os
from tqdm import tqdm

def copy_folder_with_progress(source_folder, destination_folder=None):
    """
    Copy folder with progress tracking

    Mounts Google Drive at /content/drive, replaces `destination_folder` with
    a copy of `source_folder` (one progress-bar tick per file) and unmounts
    Drive again.

    Args:
        source_folder (str): Folder to copy.
        destination_folder (str, optional): Target folder. Defaults to a
            folder with the source's name under /content/drive/MyDrive.
            An existing destination is deleted before copying.

    Returns:
        bool: True when the copy completed; False when the source is missing
        or empty, or when any step raised (the error is printed).
    """
    mounted = False
    try:
        # Validate the source first so a bad path never triggers a Drive mount
        if not os.path.exists(source_folder):
            print(f"Error: Source folder '{source_folder}' does not exist!")
            return False

        # Count total files for progress bar
        total_files = sum(len(filenames) for _, _, filenames in os.walk(source_folder))
        if total_files == 0:
            print("No files found in the source folder!")
            return False

        if destination_folder is None:
            folder_name = os.path.basename(source_folder.rstrip('/'))
            destination_folder = f'/content/drive/MyDrive/{folder_name}'

        # Mount Google Drive
        drive.mount('/content/drive')
        mounted = True

        # Remove destination if exists
        if os.path.exists(destination_folder):
            shutil.rmtree(destination_folder)

        # The context manager closes the progress bar even if the copy fails
        with tqdm(total=total_files, desc="Copying files") as pbar:

            # Custom copy function with progress
            def copy_with_progress(src, dst):
                shutil.copy2(src, dst)
                pbar.update(1)

            shutil.copytree(source_folder, destination_folder, copy_function=copy_with_progress)

        print(f"✅ Successfully copied {total_files} files to: {destination_folder}")
        return True

    except Exception as e:
        print(f"❌ Error: {e}")
        return False
    finally:
        # Only unmount what this call actually mounted
        if mounted:
            drive.flush_and_unmount()

In [None]:
# Copy the saved model directory into Google Drive.
# NOTE(review): needs Drive mounted at /content/drive; no visible cell calls
# drive.mount before this line — confirm.
!cp -r /content/anti_india_model /content/drive/MyDrive/