<a href="https://colab.research.google.com/github/joyashre/ciis-hackathon-uni5/blob/main/XLM_Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🚀 Install / Upgrade dependencies
!pip install -U transformers datasets evaluate accelerate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ✅ Imports
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# ----------------------------------------------------------------------
# 1. Dataset
# ----------------------------------------------------------------------
# The CSV is expected to look like: text,label1,label2,label3,label4
# where every label column holds 0/1 flags (multi-label setup).

# Label columns; their order is the order used when label vectors are built.
label_cols = ["propaganda", "toxic", "coordinated", "neutral"]
num_labels = len(label_cols)

# Read the CSV with pandas, then wrap the frame as a Hugging Face Dataset.
df_loaded = pd.read_csv("/content/processed_dataset.csv")
dataset = Dataset.from_pandas(df_loaded)

# ----------------------------------------------------------------------
# 2. Model checkpoint (XLM-R Base) and its tokenizer
# ----------------------------------------------------------------------
model_ckpt = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenization
def encode_batch(batch):
    """Tokenize a batch of texts and attach one multi-hot label vector per example.

    Args:
        batch: Mapping of column name -> list of values. Must contain "text"
            and every column named in ``label_cols``.

    Returns:
        The tokenizer output with an extra "labels" entry: a list holding, for
        each example, the label values as floats in ``label_cols`` order.
    """
    encodings = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    n_examples = len(batch["text"])
    # Labels are emitted as floats (not ints): the model is created with
    # problem_type="multi_label_classification", whose loss expects float
    # targets, and this matches the encode_batch defined later in this cell.
    # The input batch itself is left unmodified.
    encodings["labels"] = [
        [float(batch[col][i]) for col in label_cols] for i in range(n_examples)
    ]
    return encodings

encoded_dataset = dataset.map(encode_batch, batched=True)

# ======================
# 3. Create Model
# ======================
# Record the label names in the model config so that a checkpoint saved from
# this model carries its own index -> label mapping instead of depending on a
# hard-coded list wherever the model is reloaded.
id2label = dict(enumerate(label_cols))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

# ======================
# 4. Metrics
# ======================
# The "multilabel" configuration makes each metric accept one list of 0/1
# flags per example. The default configuration only accepts a single integer
# per example and rejects the 2-D multi-hot arrays that compute_metrics passes.
metric_f1 = evaluate.load("f1", "multilabel")
metric_precision = evaluate.load("precision", "multilabel")
metric_recall = evaluate.load("recall", "multilabel")

def compute_metrics(eval_pred):
    """Convert raw evaluation outputs into F1 / precision / recall scores.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` must be convertible
            with ``torch.tensor``; ``labels`` must provide ``.astype``
            (e.g. a NumPy array).

    Returns:
        Dict with the keys "f1_micro", "f1_macro", "precision_micro" and
        "recall_micro".
    """
    raw_scores, gold = eval_pred

    # A label counts as predicted when its sigmoid probability is strictly above 0.5.
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    predictions = (probabilities > 0.5).int().numpy()
    labels = gold.astype(int)

    def score(metric, key, average):
        # Run one metric with the given averaging mode and pull out its value.
        return metric.compute(predictions=predictions, references=labels, average=average)[key]

    return {
        "f1_micro": score(metric_f1, "f1", "micro"),
        "f1_macro": score(metric_f1, "f1", "macro"),
        "precision_micro": score(metric_precision, "precision", "micro"),
        "recall_micro": score(metric_recall, "recall", "micro"),
    }

# ======================
# 5. Training Setup
# ======================
from transformers import TrainingArguments

def encode_batch(batch):
    """Tokenize the batch's texts and attach float label vectors.

    Args:
        batch: Mapping of column name -> list of values; needs a "text" column
            plus every column listed in ``label_cols``.

    Returns:
        The tokenizer output extended with a "labels" entry (one list of
        floats per example, in ``label_cols`` order). The same list is also
        stored on ``batch`` under "labels".
    """
    texts = batch["text"]
    encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=128)

    # Build one row of float labels per example.
    label_rows = []
    for idx in range(len(texts)):
        label_rows.append([float(batch[col][idx]) for col in label_cols])

    batch["labels"] = label_rows
    encodings["labels"] = batch["labels"]
    return encodings

# Tokenize the whole dataset, then expose only the model inputs as torch tensors.
encoded_dataset = dataset.map(encode_batch, batched=True)
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# NOTE(review): no evaluation strategy is passed below, so `eval_steps` may
# have no effect during training — confirm against the TrainingArguments docs.
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Phases to run
    do_train=True,
    do_eval=True,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Step intervals and checkpoint retention
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
)



# ======================
# 6. Trainer
# ======================
# NOTE: train_dataset and eval_dataset are the same object, so any evaluation
# scores would be measured on data the model was trained on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
    # `processing_class` replaces Trainer's deprecated `tokenizer` argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# ----------------------------------------------------------------------
# 7. Train
# ----------------------------------------------------------------------
trainer.train()

# ----------------------------------------------------------------------
# 8. Save model and tokenizer into the same folder
# ----------------------------------------------------------------------
save_dir = "./anti_india_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("✅ Model training complete and saved!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/4665 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/4665 [00:00<?, ? examples/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoyashreem246[0m ([33mjoyashreem246-national-institute-of-electronics-and-info[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.6047
100,0.5362
150,0.5636
200,0.5434
250,0.5033
300,0.4822
350,0.4743
400,0.4832
450,0.4178
500,0.4402


✅ Model training complete and saved!


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 🔹 Label names, in the same order as used for training
label_cols = [
    "propaganda",
    "toxic",
    "coordinated",
    "neutral",
]

# 🔹 Reload the tokenizer and the fine-tuned model from the saved folder
model_path = "./anti_india_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 🔹 Prediction helper
def predict(text, threshold=0.5):
    """Score one text against every label in ``label_cols``.

    Args:
        text: The text to classify.
        threshold: A label is reported as predicted when its probability is
            greater than or equal to this value.

    Returns:
        A tuple ``(results, predicted_labels)``: ``results`` maps each label
        to its probability as a Python float, and ``predicted_labels`` lists
        the labels whose probability reached ``threshold``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Independent sigmoid per label; squeeze() drops the size-1 dimensions.
    probs = torch.sigmoid(logits).squeeze().cpu().numpy()

    # Vectorised comparison against the threshold (inclusive).
    is_predicted = probs >= threshold

    results = dict(zip(label_cols, map(float, probs)))
    predicted_labels = [label for label, hit in zip(label_cols, is_predicted) if hit]

    return results, predicted_labels

# 🔹 Try the classifier on a sample Hindi paragraph (default threshold of 0.5)
text = "अभी सूअर 6 महीने में दिल्ली में विधानसभा चुनाव है अब हिंदू मुसलमान नहीं होगा तो कब होगा जिसने भी किया है उसे सजा मिलेगी दिल्ली पुलिस कार्रवाई कर रही है धर्म के नाम पर किसी की जिंदगी छीन लेना उसका विरोध होता है नफरत की राजनीति बीजेपी आरएसएस करती है आज तक उनसे कोई सवाल किया है तूने"
# `scores` maps every label to its probability; `labels` lists those at or above the threshold.
scores, labels = predict(text)

print("Probabilities:", scores)
print("Predicted labels:", labels)


Probabilities: {'propaganda': 0.9730333685874939, 'toxic': 0.32847434282302856, 'coordinated': 0.7395163178443909, 'neutral': 0.14152248203754425}
Predicted labels: ['propaganda', 'coordinated']


In [None]:
import os
import zipfile

# Folder (or single file) to zip
file_to_zip = '/content/anti_india_model'
# Name of the zip file
zip_file_name = '/content/anti_india_model.zip'

# ZipFile.write() on a directory stores only the directory entry and none of
# the files inside it, so a folder has to be walked and added file by file.
parent_dir = os.path.dirname(os.path.abspath(file_to_zip))
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    if os.path.isdir(file_to_zip):
        for root, _dirs, files in os.walk(file_to_zip):
            for file_name in files:
                full_path = os.path.join(root, file_name)
                # Archive names are relative to the folder's parent, so the
                # zip unpacks into a single top-level folder.
                zipf.write(full_path, arcname=os.path.relpath(full_path, parent_dir))
    else:
        # Single file (or a missing path, which raises as before).
        zipf.write(file_to_zip)

print(f"Zipped {file_to_zip} into {zip_file_name}")

Zipped /content/anti_india_model into /content/anti_india_model.zip


In [None]:
# Copy the saved model folder into Google Drive.
# NOTE(review): this needs Drive mounted at /content/drive; the drive.mount()
# call sits in a later cell of this notebook — confirm the intended run order.
!cp -r /content/anti_india_model /content/drive/MyDrive/

### Dataset preparation


In [None]:
# Mount Google Drive at /content/drive so files under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import Dataset

# ----------------------------------------------------------------------
# 1. Load the raw annotated dataset (tab-separated)
# ----------------------------------------------------------------------
# Point this at your own file if it lives elsewhere.
df = pd.read_csv(
    "/content/drive/MyDrive/hackathon/hindi_dataset/hindi_dataset/hindi_dataset.tsv",
    sep="\t",
)

# Quick look at the first rows and the available columns
print(df.head())

# ----------------------------------------------------------------------
# 2. Label scheme
# ----------------------------------------------------------------------
# Model labels, in output order
label_cols = ["propaganda", "toxic", "coordinated", "neutral"]

# Annotation value -> model label. Adjust to your own classification scheme.
# Annotation values that are not listed here are ignored when encoding.
# NOTE(review): in the printed sample, PRFN occurs on rows whose task_1 is HOF,
# so those rows get both "propaganda" and "neutral" — confirm this is intended.
label_map = {
    "HOF": "propaganda",   # Hate / Offensive → Propaganda
    "HATE": "toxic",       # Hate speech → Toxic
    "TIN": "coordinated",  # Targeted Insult → Coordinated
    "PRFN": "neutral",     # Profane but not hateful → Neutral
    "NOT": "neutral",      # Not offensive → Neutral
    "NONE": "neutral",     # None given → Neutral
}

# ======================
# 3. Convert to one-hot format
# ======================
def encode_labels(row):
    """Build the 0/1 label flags for one dataset row.

    Args:
        row: A row offering ``row["task_1"]``, ``row["task_2"]`` and
            ``row["task_3"]``.

    Returns:
        A ``pd.Series`` with one entry per name in ``label_cols``: 1 when any
        of the three task values maps to that label through ``label_map``,
        otherwise 0.
    """
    # Every label starts switched off.
    flags = dict.fromkeys(label_cols, 0)

    task_values = (row["task_1"], row["task_2"], row["task_3"])
    for value in task_values:
        # Task values without a mapping are skipped.
        if value not in label_map:
            continue
        flags[label_map[value]] = 1

    return pd.Series(flags)

# One row of 0/1 flags per example, produced row by row
encoded_labels = df.apply(encode_labels, axis="columns")

# Final frame: the text column followed by the label columns
df_final = pd.concat([df["text"], encoded_labels], axis="columns")

print(df_final.head())

# ----------------------------------------------------------------------
# 4. pandas → Hugging Face Dataset
# ----------------------------------------------------------------------
dataset = Dataset.from_pandas(df_final)

# ----------------------------------------------------------------------
# 5. Model info
# ----------------------------------------------------------------------
model_ckpt = "xlm-roberta-base"
num_labels = len(label_cols)

print(f"Dataset ready with {num_labels} labels: {label_cols}")
print(dataset)


         text_id                                               text task_1  \
0  hasoc_hi_5556  बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...    NOT   
1  hasoc_hi_5648  सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...    HOF   
2   hasoc_hi_164  तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...    HOF   
3  hasoc_hi_3530  बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...    NOT   
4  hasoc_hi_5206  चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...    NOT   

  task_2 task_3  
0   NONE   NONE  
1   PRFN    UNT  
2   PRFN    TIN  
3   NONE   NONE  
4   NONE   NONE  
                                                text  propaganda  toxic  \
0  बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...           0      0   
1  सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...           1      0   
2  तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...           1      0   
3  बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...           0      0   
4  चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रद

In [None]:
# Save the processed dataset as CSV (no index column) ...
df_final.to_csv("processed_dataset.csv", index=False)
# ... and also as TSV; both files are written, not just one of them.
df_final.to_csv("processed_dataset.tsv", sep="\t", index=False)
