In [None]:
# !sinfo -p gpu
# !nvidia-smi
# !free -h
# !df -h ~


# VIPAA=Violence, Impunity, and Peace in the Americas Archive
### domain-specific dataset compiled at the University of Arizona (School of Government & Public Policy, with NSF support) for computational research on political violence and human rights across Latin America.

# https://www.colombiaarmedactors.org/

#short description
Fine-tuning ConfliBERT on the VIPAA corpus to perform multi-label classification of violent event descriptions in Spanish.
The source dataset (45,511 instances) was preprocessed by extracting human-readable labels from the column Tipificacion_with_codes, removing the code prefixes (e.g., “D:4:701”) and normalizing orthography. Sixty-five unique labels from list_of_labels.xlsx were used to build a binary multi-hot label matrix for training. We used the continuation checkpoint snowood1/ConfliBERT-cont-uncased (Hugging Face, 2023) as the base model, fine-tuned with a binary cross-entropy loss and the Hugging Face Trainer API (Transformers v4.46.3, Accelerate v1.0.1, PyTorch v2.4.1) on the UA HPC system. Evaluation used micro- and macro-averaged F1, precision, and recall, with a tuned sigmoid threshold (≈ 0.4) to optimize micro-F1 performance.

# Description of the process and script
##  **Overview**

We prepared and fine-tuned a **multi-label ConfliBERT classifier** to automatically assign violence-event categories (e.g., *ASESINATO*, *BIENES CIVILES*, *PILLAJE*, etc.) to text descriptions of incidents from the **VIPAA corpus** (~45 K Spanish event reports downloaded from nocheyniebla.org).

The workflow covered full preprocessing, label normalization, and fine-tuning of **snowood1/ConfliBERT-cont-uncased** under the UA HPC environment.

##  **Step by step**

### 1 Data preparation

* Two Excel files were used:

  * **vipaa_for_training.xlsx** — main dataset with event descriptions and coded labels.
  * **list_of_labels.xlsx** — master list of 65 standardized action labels (actions_clean column).
* Both were loaded into Pandas DataFrames.

### 2 Label extraction and cleaning

* The relevant column was **Tipificacion_with_codes** (entries like
  "D:2:80 BIENES CIVILES, D:2:801 ATAQUE A OBRAS E INST. QUE CONT. FUERZAS PELIGR.").
* We removed numeric codes (B:2:40, D:4:701, etc.) using a regular-expression parser, yielding clean upper-case labels such as:

  text
  ['BIENES CIVILES', 'ATAQUE A OBRAS E INST. QUE CONT. FUERZAS PELIGR.']
  
* Nested or repeated lists (e.g., [["A","B"]] or [A,A,A]) were flattened and deduplicated:

  python
  data_df[label_col] = data_df[label_col].apply(lambda lst: sorted(set(lst)))
  

### 3 Normalization and mapping

* Accents were stripped (unicodedata.normalize), text upper-cased, and matched to the 65 canonical labels in list_of_labels.xlsx.
* A **label->ID** dictionary (label_map) and **ID->label** reverse map (id2label) were created.
* Each event's label list was converted to numeric IDs (e.g., [6, 36]).

### 4 Multi-hot encoding

* A binary matrix $y \in \{0,1\}^{45511 \times 65}$ was built, where 1 marks presence of a label.
  Example: row 0 -> 1 for columns 6 and 36.

### 5 Environment setup on UA HPC

* Installed missing dependencies inside the HPC Jupyter environment:

  bash
  pip install --user accelerate
  pip install --user --upgrade transformers torch pandas scikit-learn
  
* Verified versions:
  accelerate 1.0.1, transformers 4.46.3, torch 2.4.1+cu121.

### 6 Model and tokenizer

* Loaded **ConfliBERT**:

  python
  tokenizer = AutoTokenizer.from_pretrained("snowood1/ConfliBERT-cont-uncased")
  model = AutoModelForSequenceClassification.from_pretrained(
      "snowood1/ConfliBERT-cont-uncased",
      num_labels=65,
      problem_type="multi_label_classification"
  )
  

### 7 Train/validation split and tokenization

* 90 % train / 10 % validation split (train_test_split).
* Tokenized descriptions (max_length = 256).

### 8 Training configuration

* Used Hugging Face Trainer API with BCEWithLogitsLoss (multi-label default).
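* Key arguments (illustrative; the training cell further down uses a per-device batch size of 16, 5 epochs, gradient accumulation of 2, and an HPC output directory):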

  python
  TrainingArguments(
      output_dir="./confli_bert_vipaa",
      evaluation_strategy="epoch",
      save_strategy="epoch",
      load_best_model_at_end=True,
      metric_for_best_model="micro/f1",
      learning_rate=2e-5,
      per_device_train_batch_size=8,
      num_train_epochs=3,
      weight_decay=0.01,
      fp16=True  # mixed precision on GPU
  )
  
* Custom metrics: micro/macro F1, precision, recall using sigmoid + 0.4 threshold.

### 9 Threshold tuning and evaluation

* Post-training, we tuned the sigmoid threshold (0.2–0.6) on validation data to maximize micro-F1.
* Produced per-label F1 reports and saved the best model/tokenizer.

### 10 Output

* Saved fine-tuned model directory: ./confli_bert_vipaa_best/
  (contains config.json, pytorch_model.bin, tokenizer.*)
* Achieved multi-label predictions for new event texts using:

  python
  preds, probs = predict_labels(["Guerrilleros atacaron el puesto de policía..."])
  

## 11 **Main idea**

We turned raw textual incident narratives into structured multi-label conflict event classifications aligned with the **VIPAA typology** and the **ConfliBERT** language representation, creating a pipeline that:

1. Preprocesses and normalizes domain-specific labels.
2. Encodes them in a multi-hot representation.
3. Fine-tunes the conflict-domain ConfliBERT transformer on Spanish event descriptions for multi-label classification.
4. Evaluates and saves a deployable model.


In [2]:
#imports

In [3]:
# pip install --user openpyxl

In [4]:
# !pip install --user accelerate
# !pip install --user "transformers[torch]"
# !pip install --user --upgrade transformers torch pandas scikit-learn


In [5]:
import accelerate
import transformers
import torch
# Report the installed versions of the three core libraries, one per line.
for lib_name, lib in (("Accelerate", accelerate), ("Transformers", transformers), ("Torch", torch)):
    print(f"{lib_name}:", lib.__version__)


Accelerate: 1.0.1
Transformers: 4.46.3
Torch: 2.4.1+cu121


In [6]:
import pandas as pd
import re
import unicodedata
import numpy as np

In [7]:
# Read the two Excel inputs into DataFrames.
# list_of_labels.xlsx: the label vocabulary.
labels_df = pd.read_excel("list_of_labels.xlsx")
# vipaa_for_training.xlsx: the main dataset.
data_df = pd.read_excel("vipaa_for_training.xlsx")

# Show which columns the main dataset provides.
column_names = data_df.columns.tolist()
print("Columns in main dataset:", column_names, sep="\n")

  warn(msg)
  warn(msg)


Columns in main dataset:
['Fecha del hecho', 'Ubicaciones', 'P. Responsables', 'Tipificacion', 'Tipificacion_with_codes', 'Tipificacion_clean', 'Víctimas', 'Descripción', 'Acciones']


In [8]:
# Name of the data_df column that holds the coded labels.
# The cells below overwrite this column in place (strings -> label names -> label IDs).
label_col = "Tipificacion_with_codes"

In [9]:
#extract label names (remove the D:2:80 codes etc.)

def extract_label_names(s):
    """
    Split a coded typification string into its bare, upper-cased label names.

    Takes a string like:
    'D:2:80 BIENES CIVILES, D:2:801 ATAQUE A OBRAS E INST. QUE CONT. FUERZAS PELIGR.'
    and returns:
    ['BIENES CIVILES', 'ATAQUE A OBRAS E INST. QUE CONT. FUERZAS PELIGR.']

    Parameters
    ----------
    s : object
        Raw cell value. It is converted with str(); None and float NaN are
        treated as "no labels" instead of becoming the labels 'NONE' / 'NAN'.

    Returns
    -------
    list of str
        One upper-cased label per comma-separated part, with the leading
        'X:n:n' code removed. Parts that are empty after cleaning are skipped.
    """
    # Missing values carry no labels (s != s is the NaN check).
    if s is None or (isinstance(s, float) and s != s):
        return []

    labels = []
    for part in str(s).split(","):
        # Strip first so the '^'-anchored code pattern also matches when the
        # part starts with whitespace (e.g. a leading space in the cell).
        name = re.sub(r"^[A-Z]:\d+:\d+\s*", "", part.strip()).strip()
        if name:
            labels.append(name.upper())
    return labels

# Replace each raw typification string with its list of bare label names;
# missing cells are turned into "" first.
raw_label_strings = data_df[label_col].fillna("")
data_df[label_col] = raw_label_strings.apply(extract_label_names)
print("After extraction:", data_df[label_col].head(3), sep="\n")

After extraction:
0    [BIENES CIVILES, ATAQUE A OBRAS E INST. QUE CO...
1    [LESIÓN A PERSONA PROTEGIDA, LESIÓN A PERSONA ...
2                            [BIENES CIVILES, PILLAJE]
Name: Tipificacion_with_codes, dtype: object


In [10]:
#flatten nested lists (fix [['A','B']] -> ['A','B'])

def flatten_nested_lists(x):
    """
    Flatten a (possibly nested) list of labels into a single flat list.

    Handles the original [['A', 'B']] -> ['A', 'B'] case as well as several
    sub-lists ([['A'], ['B']]) and deeper nesting ([['A', ['B']], 'C']).

    Parameters
    ----------
    x : object
        A list whose items may themselves be lists, or any other value.

    Returns
    -------
    list or object
        A new flat list when x is a list (item order preserved);
        x itself, unchanged, when it is not a list.
    """
    if not isinstance(x, list):
        return x

    flat = []
    for item in x:
        if isinstance(item, list):
            # Recurse so arbitrarily deep nesting ends up at the top level.
            flat.extend(flatten_nested_lists(item))
        else:
            flat.append(item)
    return flat

# Flatten the label lists in place, then preview the first rows.
data_df[label_col] = data_df[label_col].apply(flatten_nested_lists)

print("After flattening:", data_df[label_col].head(3), sep="\n")

After flattening:
0    [BIENES CIVILES, ATAQUE A OBRAS E INST. QUE CO...
1    [LESIÓN A PERSONA PROTEGIDA, LESIÓN A PERSONA ...
2                            [BIENES CIVILES, PILLAJE]
Name: Tipificacion_with_codes, dtype: object


In [11]:
#normalize label vocabulary and build mappings


def normalize_label(s):
    """
    Return a canonical form of a label: stripped, upper-cased, accents removed.

    The value is converted with str(), decomposed with NFD, and every
    character in Unicode category 'Mn' (non-spacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', str(s).strip().upper())
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Normalize the vocabulary with the same function used on the event labels,
# so lookups in encode_labels compare like with like.
labels_df["actions_clean"] = labels_df["actions_clean"].apply(normalize_label)

# Assign contiguous IDs 0..K-1 over the *unique* normalized names (first
# occurrence wins). Taking IDs from the row index would leave gaps if two rows
# normalized to the same name, and an ID could then exceed len(label_map),
# which is used as the number of columns of the multi-hot matrix.
unique_names = labels_df["actions_clean"].drop_duplicates().tolist()
label_map = {name: idx for idx, name in enumerate(unique_names)}
labels_df["label_id"] = labels_df["actions_clean"].map(label_map)

# Reverse lookup: ID -> label name.
id2label = {idx: name for name, idx in label_map.items()}

print(f"Loaded {len(label_map)} unique labels from list_of_labels.xlsx")
print(list(label_map.items())[:5])

Loaded 65 unique labels from list_of_labels.xlsx
[('ASESINATO', 0), ('EJECUCION EXTRAJUDICIAL', 1), ('HOMICIDIO INTENCIONAL DE PERSONA PROTEGIDA', 2), ('COLECTIVO AMENAZADO', 3), ('AMENAZA', 4)]


In [12]:
#encode each case's label list as numeric IDs

def encode_labels(label_list):
    """
    Convert label names to integer IDs via label_map.

    Each name is passed through normalize_label first. Names that are not
    keys of label_map are skipped; repeated names yield repeated IDs.
    """
    ids = []
    for raw_name in label_list:
        key = normalize_label(raw_name)
        if key in label_map:
            ids.append(label_map[key])
    return ids

# Replace each row's label names with their integer IDs, then preview text + IDs.
data_df[label_col] = data_df[label_col].apply(encode_labels)

print("Sample encoded labels:", data_df[["Descripción", label_col]].head(5), sep="\n")

Sample encoded labels:
                                         Descripción Tipificacion_with_codes
0  Guerrilleros de las FARC-EP incursionaron en l...                 [6, 36]
1  Guerrilleros de las FARC-EP bloquearon la vía ...    [13, 13, 13, 13, 11]
2  Guerrilleros de las FARC-EP irrumpieron en la ...                 [6, 11]
3  Guerrilleros del Frente Manuel Vásquez Castaño...                 [6, 11]
4  Guerrilleros del Frente Ernesto Che Guevara de...                     [2]


In [13]:
# Multi-hot target matrix: one row per event, one column per label ID.
num_labels = len(label_map)
y = np.zeros((len(data_df), num_labels))

for row, ids in enumerate(data_df[label_col]):
    if ids:
        # Set all of this row's label columns at once; repeated IDs are harmless.
        y[row, ids] = 1

print("Multi-hot matrix created")
print("y shape:", y.shape)
print("Example row:", y[0])


Multi-hot matrix created
y shape: (45511, 65)
Example row: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [14]:
# Keep each label ID once per row, in ascending order.
data_df[label_col] = data_df[label_col].apply(lambda ids: sorted(set(ids)))


In [15]:
#de-dup labels per row, then rebuild y

In [16]:
# Collapse repeated label IDs per row, e.g. [13,13,13] -> [13], in ascending order.
data_df[label_col] = data_df[label_col].apply(lambda ids: sorted(set(ids)))

# Rebuild the multi-hot target matrix, this time as float32.
import numpy as np
num_labels = len(label_map)
y = np.zeros((len(data_df), num_labels), dtype=np.float32)

# Flatten (row, label ID) pairs into two index arrays and set them in one assignment.
row_idx = np.asarray(
    [row for row, ids in enumerate(data_df[label_col]) for _ in ids], dtype=np.intp
)
col_idx = np.asarray(
    [lid for ids in data_df[label_col] for lid in ids], dtype=np.intp
)
y[row_idx, col_idx] = 1.0

print("y shape:", y.shape)


y shape: (45511, 65)


In [17]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Base checkpoint shared by the tokenizer here and the model loaded later.
model_name = "snowood1/ConfliBERT-cont-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Event descriptions as plain strings (astype(str) also stringifies missing values).
texts = data_df["Descripción"].astype(str).tolist()

# 90/10 split, stratified on whether a row has at least one label.
has_any_label = y.sum(axis=1) > 0
X_train, X_val, y_train, y_val = train_test_split(
    texts, y, test_size=0.10, random_state=42, stratify=has_any_label
)

# Same tokenizer settings for both splits: truncate to 256 tokens, pad, return tensors.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=256, return_tensors="pt")
train_enc = tokenizer(X_train, **tokenize_kwargs)
val_enc = tokenizer(X_val, **tokenize_kwargs)


In [18]:
import torch
from torch.utils.data import Dataset

class VIPAADataset(Dataset):
    """
    Dataset pairing encoded inputs with a float32 label tensor.

    encodings: mapping of field name -> indexable values (one entry per example).
    labels:    array-like converted to a torch.float32 tensor; its first
               dimension defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return self.labels.size(0)

    def __getitem__(self, idx):
        # One example: every encoding field at idx, plus its label row under "labels".
        example = {}
        for field, values in self.encodings.items():
            example[field] = values[idx]
        example["labels"] = self.labels[idx]
        return example

# Wrap each split's encodings and label matrix as a dataset for the Trainer.
train_ds, val_ds = VIPAADataset(train_enc, y_train), VIPAADataset(val_enc, y_val)


In [19]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import AutoModelForSequenceClassification, AutoConfig

# Name -> ID mapping, obtained by inverting id2label.
label2id = {name: idx for idx, name in id2label.items()}

# Multi-label classification config with one output per column of y.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=y.shape[1],
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# No custom loss is defined in this notebook; per the author's note, the default
# BCEWithLogitsLoss for multi-label classification is relied upon.
# Probability cut-off applied after the sigmoid in compute_metrics; it is
# re-assigned further down by the threshold search on the validation set.
SIGMOID_THRESH = 0.40  # initial value, tuned later

def compute_metrics(eval_pred):
    """
    Compute micro/macro F1 and micro precision/recall for multi-label output.

    eval_pred unpacks into (logits, labels). Logits go through a sigmoid and
    are binarized with the module-level SIGMOID_THRESH, read at call time.
    Returns a dict with keys "micro/f1", "macro/f1", "micro/precision" and
    "micro/recall".
    """
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))  # element-wise sigmoid
    preds = (probs > SIGMOID_THRESH).astype(int)

    micro = dict(average="micro", zero_division=0)
    scores = {}
    scores["micro/f1"] = f1_score(labels, preds, **micro)
    scores["macro/f1"] = f1_score(labels, preds, average="macro", zero_division=0)
    scores["micro/precision"] = precision_score(labels, preds, **micro)
    scores["micro/recall"] = recall_score(labels, preds, **micro)
    return scores


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at snowood1/ConfliBERT-cont-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import TrainingArguments, Trainer

# Training configuration for the HPC run. Evaluation and checkpointing both
# happen once per epoch, and the checkpoint with the highest validation
# micro-F1 is reloaded when training ends.
args = TrainingArguments(
    output_dir="/xdisk/josorio1/bliko/vipaa/model/confli_bert_vipaa",
    eval_strategy="epoch",     # current name of the deprecated `evaluation_strategy` argument
    save_strategy="epoch",     # must match the evaluation schedule for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="micro/f1",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # 6 locally
    per_device_eval_batch_size=8,
    num_train_epochs=5,              # use three on the laptop
    weight_decay=0.01,
    fp16=True,                 # if GPU supports it
    logging_steps=100,
    report_to="none",
    gradient_accumulation_steps=2,   # remove on the laptop; gives 16 * 2 = 32 examples per device per optimizer step
)

# Trainer wiring: model, data splits, and the thresholded multi-label metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
import sys
import datetime

class _LogRedirect:
    """
    File-like proxy used as sys.stdout / sys.stderr during training.

    Output goes to `log` while that file is open. Once `log` has been closed,
    output goes to `fallback` (the stream that was active before the redirect)
    instead of raising "ValueError: I/O operation on closed file".
    """

    def __init__(self, log, fallback):
        self._log = log
        self._fallback = fallback

    def _target(self):
        return self._fallback if self._log.closed else self._log

    def write(self, text):
        return self._target().write(text)

    def flush(self):
        return self._target().flush()

    def __getattr__(self, name):
        # Private names are never forwarded; this avoids infinite recursion if
        # _log / _fallback are looked up before __init__ has run.
        if name.startswith("_"):
            raise AttributeError(name)
        # Everything else (isatty, encoding, ...) is answered by the active stream.
        return getattr(self._target(), name)


# Create a timestamped log file.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
logfile = open(f"confliBERT_train_{timestamp}.log", "w")

# Redirect all prints and Trainer logs to this file. If this cell is re-run,
# unwrap the previous proxy so the fallback is always the original stream.
_orig_stdout = getattr(sys.stdout, "_fallback", sys.stdout)
_orig_stderr = getattr(sys.stderr, "_fallback", sys.stderr)
sys.stdout = _LogRedirect(logfile, _orig_stdout)
sys.stderr = _LogRedirect(logfile, _orig_stderr)

print("Training started")
print("Timestamp:", timestamp)

In [None]:
# Launch fine-tuning. Console output produced from here on goes to the log
# file set up in the previous cell rather than to the notebook.
trainer.train()

In [None]:
# Write a final marker line, then close the log file opened before training.
print("Training completed successfully")
logfile.close()

In [None]:
import torch

# Validation logits and gold labels from the trained model.
pred_out = trainer.predict(val_ds)
logits, labels = pred_out.predictions, pred_out.label_ids
probs = 1 / (1 + np.exp(-logits))  # element-wise sigmoid

# Score 21 evenly spaced thresholds in [0.2, 0.6] by micro-F1 and keep the
# first one that reaches the maximum.
candidate_thresholds = np.linspace(0.2, 0.6, 21)
micro_scores = [
    f1_score(labels, (probs > cand).astype(int), average="micro", zero_division=0)
    for cand in candidate_thresholds
]
best_idx = int(np.argmax(micro_scores))
best_t, best_micro = candidate_thresholds[best_idx], micro_scores[best_idx]

print(f"Best threshold: {best_t:.2f} with micro-F1={best_micro:.4f}")
SIGMOID_THRESH = best_t


In [None]:
from sklearn.metrics import classification_report

# Binarize the validation probabilities with the tuned threshold.
final_preds = (probs > SIGMOID_THRESH).astype(int)

micro_kwargs = dict(average="micro", zero_division=0)
print("Final (tuned) metrics:")
print({
    "micro/f1":        f1_score(labels, final_preds, **micro_kwargs),
    "macro/f1":        f1_score(labels, final_preds, average="macro", zero_division=0),
    "micro/precision": precision_score(labels, final_preds, **micro_kwargs),
    "micro/recall":    recall_score(labels, final_preds, **micro_kwargs),
})

# Per-label report, keyed by label name in ID order.
label_names = [id2label[i] for i in range(num_labels)]
report = classification_report(
    labels, final_preds,
    target_names=label_names,
    zero_division=0, output_dict=True
)

# Print the 10 labels with the largest support; ties keep ID order.
by_support = sorted(label_names, key=lambda name: report[name]["support"], reverse=True)
for name in by_support[:10]:
    stats = report[name]
    print(f"{name:55s}  P={stats['precision']:.2f}  R={stats['recall']:.2f}  F1={stats['f1-score']:.2f}  n={int(stats['support'])}")


In [None]:
# Persist the model currently held by the Trainer, plus the tokenizer, into one directory.
# NOTE(review): the tuned SIGMOID_THRESH is not written here — record it
# separately if the saved model will be reused for prediction.
save_dir = "./confli_bert_vipaa_best"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved to:", save_dir)


In [None]:
import torch

def predict_labels(texts, threshold=SIGMOID_THRESH, max_length=256, batch_size=32):
    """
    Predict label names for one or more event descriptions.

    Parameters
    ----------
    texts : str or iterable of str
        A single description or a collection of descriptions.
    threshold : float
        Sigmoid probability above which a label is assigned. The default is
        the value of SIGMOID_THRESH at the time this function is defined.
    max_length : int
        Maximum number of tokens per text; longer inputs are truncated.
    batch_size : int
        Number of texts sent through the model per forward pass, so large
        inputs do not have to fit on the device in a single batch.

    Returns
    -------
    decoded : list of list of str
        For each text, the names of the labels whose probability exceeds
        `threshold`.
    probs : numpy.ndarray
        Array of shape (number of texts, number of labels) with the sigmoid
        probabilities.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    model = trainer.model
    if not texts:
        return [], np.zeros((0, model.config.num_labels), dtype=np.float32)

    # Inference must run in eval mode (dropout off); the previous mode is
    # restored afterwards so calling this function has no lasting side effect.
    was_training = model.training
    model.eval()
    prob_chunks = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                enc = tokenizer(
                    texts[start:start + batch_size],
                    truncation=True, padding=True, max_length=max_length, return_tensors="pt",
                )
                out = model(**{k: v.to(model.device) for k, v in enc.items()})
                prob_chunks.append(torch.sigmoid(out.logits).cpu().numpy())
    finally:
        model.train(was_training)

    probs = np.concatenate(prob_chunks, axis=0)
    preds = (probs > threshold).astype(int)
    decoded = [[id2label[j] for j, v in enumerate(row) if v == 1] for row in preds]
    return decoded, probs

# Smoke test: one description from the dataset and one hand-written sentence.
sample_texts = [
    data_df["Descripción"].iloc[0],
    "Guerrilleros atacaron el puesto de policía y dinamitaron un banco.",
]
pred_labels, pred_probs = predict_labels(sample_texts)

# Show the first 180 characters of each text next to its predicted labels.
for idx, text in enumerate(sample_texts):
    print("\nTEXT:", text[:180], "...")
    print("PRED:", pred_labels[idx])
