---
## 📦 SECTION 1: Install Packages (Optimized for Free Colab)
This will install only the essential packages for Filipino augmentation


In [1]:
print("🔧 Installing packages for Filipino augmentation...\\n")

# Essential packages only (optimized for free Colab)
!pip install -q googletrans==4.0.0-rc1
!pip install -q sentence-transformers
!pip install -q nlpaug
!pip install -q torch torchvision

print("✅ Installation complete!\\n")
print("📦 Installed:")
print("   • googletrans (back-translation testing)")
print("   • sentence-transformers (quality filtering)")
print("   • nlpaug (contextual augmentation with XLM-RoBERTa)")
print("   • torch (deep learning backend)")
print("\\n⚠️  Note: Using XLM-RoBERTa for Filipino-aware augmentation")


🔧 Installing packages for Filipino augmentation...\n
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hd

---
## 📂 SECTION 2: Upload Your Dataset


In [2]:
from google.colab import files
import pandas as pd

print("📂 Upload adjudications_2025-10-22.csv\\n")
uploaded = files.upload()

filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

print(f"\\n✅ Loaded: {filename}")
print(f"📊 Total samples: {len(df)}")
print(f"\\n📊 Sentiment Distribution:")
print(df['Final Sentiment'].value_counts())
print(f"\\n📊 Polarization Distribution:")
print(df['Final Polarization'].value_counts())

obj_count = len(df[df['Final Polarization'] == 'objective'])
neu_count = len(df[df['Final Sentiment'] == 'neutral'])

print(f"\\n🎯 Augmentation Targets:")
print(f"   • Objective: {obj_count} → ~{obj_count * 5} samples (5x)")
print(f"   • Neutral: {neu_count} → ~{neu_count * 3} samples (3x)")

# Show sample Filipino text
print(f"\\n🇵🇭 Sample Text (to verify Filipino content):")
print(f"   {df['Comment'].iloc[0][:100]}...")


📂 Upload adjudications_2025-10-22.csv\n


Saving adjudications_2025-10-22.csv to adjudications_2025-10-22 (1).csv
\n✅ Loaded: adjudications_2025-10-22 (1).csv
📊 Total samples: 9965
\n📊 Sentiment Distribution:
Final Sentiment
negative    5905
neutral     2677
positive    1383
Name: count, dtype: int64
\n📊 Polarization Distribution:
Final Polarization
partisan         6606
non_polarized    2771
objective         588
Name: count, dtype: int64
\n🎯 Augmentation Targets:
   • Objective: 588 → ~2940 samples (5x)
   • Neutral: 2677 → ~8031 samples (3x)
\n🇵🇭 Sample Text (to verify Filipino content):
   ganun din yan, magtayo lang ng sari-sari store sa bahay gamit ang bintana tas chichirya ang tinda. k...


---
## 🧪 SECTION 3: Test Back-Translation Quality (Filipino)
We'll test if Google Translate preserves Filipino meaning well enough


In [3]:
from googletrans import Translator
import time

translator = Translator()

print("🧪 Testing back-translation quality on 5 Filipino samples...\\n")

# Test on 5 random samples
test_samples = df['Comment'].sample(5, random_state=42).tolist()
good_translations = 0

for i, text in enumerate(test_samples, 1):
    print(f"Test {i}/5:")
    print(f"  Original: {text[:80]}...")

    try:
        # Tagalog → English → Tagalog
        english = translator.translate(text, src='tl', dest='en').text
        time.sleep(0.5)
        back = translator.translate(english, src='en', dest='tl').text
        time.sleep(0.5)

        print(f"  Back-translated: {back[:80]}...")

        # Simple similarity check (word overlap)
        orig_words = set(text.lower().split())
        back_words = set(back.lower().split())
        overlap = len(orig_words & back_words) / len(orig_words) if len(orig_words) > 0 else 0

        print(f"  Word overlap: {overlap*100:.1f}%")
        if overlap >= 0.5:  # At least 50% word overlap
            good_translations += 1
            print("  ✅ Good quality")
        else:
            print("  ⚠️ Low quality")
    except Exception as e:
        print(f"  ❌ Error: {e}")

    print()

quality_rate = good_translations / 5 * 100
print(f"{'='*70}")
print(f"Back-translation quality: {good_translations}/5 ({quality_rate:.0f}%)")

if quality_rate >= 60:
    USE_BACK_TRANSLATION = True
    print("✅ Quality is good enough - WILL use back-translation")
else:
    USE_BACK_TRANSLATION = False
    print("⚠️ Quality is too low - WILL NOT use back-translation")
    print("   (Will use XLM-RoBERTa contextual augmentation only)")

print(f"{'='*70}")


🧪 Testing back-translation quality on 5 Filipino samples...\n
Test 1/5:
  Original: haha maisingit lang drug war..lahat ng pulis naman na involved sa drug war.....
  Back-translated: haha ipasok lang ang digmaan ng droga .. lahat ng pulisya na kasangkot sa digmaa...
  Word overlap: 41.7%
  ⚠️ Low quality

Test 2/5:
  Original: isang babae lang pinagtutulungan nila..patunay na ni sara ay isang banta sa kani...
  Back-translated: Tumutulong lang sila sa isang babae..pagbabawas na si Sara ay isang banta sa kan...
  Word overlap: 66.7%
  ✅ Good quality

Test 3/5:
  Original: alam n po ang pagtulong s mga magulang ay karapatan ntin .. pero mas mainam kyo ...
  Back-translated: Alam ko na ang pagtulong sa mga magulang ay ang aming karapatan .. ngunit mas ma...
  Word overlap: 31.6%
  ⚠️ Low quality

Test 4/5:
  Original: pag inatake ka sa sobrang galit mo sa mga doctors natin walang gagamot sayo!...
  Back-translated: Kung inaatake ka dahil galit ka sa aming mga doktor, walang gagamot sa iyo

---
## 🛠️ SECTION 4: Filipino-Aware Augmentation Toolkit
XLM-RoBERTa understands Filipino context - perfect for Taglish!


In [4]:
from tqdm.notebook import tqdm
from typing import List, Tuple
import numpy as np
import nlpaug.augmenter.word as naw

print("🔧 Initializing Filipino-aware augmentation toolkit...\\n")

# XLM-RoBERTa Contextual Augmenter (Filipino-aware!)
class FilipinoContextualAugmenter:
    def __init__(self):
        print("📦 Loading XLM-RoBERTa for contextual augmentation...")
        self.aug = naw.ContextualWordEmbsAug(
            model_path='xlm-roberta-base',  # Multilingual! Understands Filipino!
            action='substitute',
            aug_p=0.20,  # Replace 20% of words
            device='cuda' if __name__ == '__main__' else 'cpu'
        )
        print("✅ XLM-RoBERTa ready (understands Filipino + Taglish!)")

    def augment_batch(self, texts: List[str], multiplier=3) -> List[str]:
        all_augmented = []
        print(f"🔄 Augmenting {len(texts)} samples (x{multiplier} each)...")

        for text in tqdm(texts, desc="XLM-R augmentation"):
            for _ in range(multiplier):
                try:
                    aug_text = self.aug.augment(text)
                    all_augmented.append(aug_text)
                except:
                    continue

        print(f"✅ Generated {len(all_augmented)} samples via XLM-RoBERTa")
        return all_augmented

# Back-Translation Augmenter (if quality test passed)
class BackTranslationAugmenter:
    def __init__(self):
        self.translator = Translator()
        print("✅ Back-translation ready (Tagalog ↔ English)")

    def augment_batch(self, texts: List[str]) -> List[str]:
        all_augmented = []
        print(f"🔄 Back-translating {len(texts)} samples...")

        for text in tqdm(texts, desc="Back-translation"):
            try:
                english = self.translator.translate(text, src='tl', dest='en').text
                time.sleep(0.3)
                back = self.translator.translate(english, src='en', dest='tl').text
                time.sleep(0.3)
                all_augmented.append(back)
            except:
                continue

        print(f"✅ Generated {len(all_augmented)} samples via back-translation")
        return all_augmented

# Quality Filter
class QualityFilter:
    def __init__(self, threshold=0.70):  # Lowered threshold for Filipino
        from sentence_transformers import SentenceTransformer, util
        print("📦 Loading sentence transformer for quality filtering...")
        self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
        self.threshold = threshold
        self.util = util
        print(f"✅ Quality filter ready (threshold: {threshold})")

    def filter_augmented(self, original_texts: List[str], augmented_texts: List[str]) -> List[str]:
        filtered = []
        orig_embeddings = self.model.encode(original_texts, convert_to_tensor=True, show_progress_bar=True)
        aug_embeddings = self.model.encode(augmented_texts, convert_to_tensor=True, show_progress_bar=True)

        for i, aug_emb in enumerate(tqdm(aug_embeddings, desc="Quality filtering")):
            similarities = self.util.cos_sim(aug_emb, orig_embeddings)[0]
            max_similarity = similarities.max().item()
            if max_similarity >= self.threshold:
                filtered.append(augmented_texts[i])

        quality_rate = len(filtered) / len(augmented_texts) * 100 if len(augmented_texts) > 0 else 0
        print(f"✅ Kept {len(filtered)}/{len(augmented_texts)} ({quality_rate:.1f}% quality rate)")
        return filtered

    def remove_duplicates(self, texts: List[str], threshold=0.90) -> List[str]:  # Lowered for Filipino
        if len(texts) == 0:
            return texts
        embeddings = self.model.encode(texts, convert_to_tensor=True, show_progress_bar=True)
        unique_texts = [texts[0]]
        unique_embeddings = [embeddings[0]]

        for i in tqdm(range(1, len(texts)), desc="Duplicate removal"):
            similarities = self.util.cos_sim(embeddings[i], unique_embeddings)
            max_sim = similarities.max().item()
            if max_sim < threshold:
                unique_texts.append(texts[i])
                unique_embeddings.append(embeddings[i])

        print(f"✅ Kept {len(unique_texts)}/{len(texts)} unique samples")
        return unique_texts

print("\\n✅ Filipino-aware augmentation toolkit ready!")


🔧 Initializing Filipino-aware augmentation toolkit...\n
\n✅ Filipino-aware augmentation toolkit ready!


---
## 🔄 SECTION 5: Augment Objective & Neutral Classes
Using XLM-RoBERTa only (back-translation skipped due to low quality)

**⏱️ This will take 2-3 hours - you can close the tab and come back!**


In [5]:
# Initialize augmenters
print("="*70)
print("🚀 STARTING AUGMENTATION PROCESS")
print("="*70)

# Initialize XLM-R contextual augmenter
xlmr_aug = FilipinoContextualAugmenter()

# Initialize quality filter
quality_filter = QualityFilter(threshold=0.70)

print("\\n" + "="*70)
print("🎯 PHASE 1: AUGMENTING OBJECTIVE CLASS")
print("="*70)

# Extract objective samples
objective_samples = df[df['Final Polarization'] == 'objective']
objective_texts = objective_samples['Comment'].tolist()

print(f"\\n📊 Original objective samples: {len(objective_texts)}")
print(f"🎯 Target: ~{len(objective_texts) * 5} samples (5x)")

# Augment using XLM-RoBERTa (4x to get 5x total with originals)
augmented_obj = xlmr_aug.augment_batch(objective_texts, multiplier=4)

# Quality filter
print("\\n🔍 Applying quality filter...")
filtered_obj = quality_filter.filter_augmented(objective_texts, augmented_obj)

# Remove duplicates
print("\\n🔍 Removing duplicates...")
unique_obj = quality_filter.remove_duplicates(filtered_obj)

# Limit to target if we have too many
target_obj = len(objective_texts) * 4  # 4x augmented + 1x original = 5x total
if len(unique_obj) > target_obj:
    print(f"\\n⚠️  Limiting to {target_obj} samples")
    unique_obj = np.random.choice(unique_obj, target_obj, replace=False).tolist()

# Create augmented dataframe
aug_obj_df = pd.DataFrame({
    'Title': '',
    'Comment': unique_obj,
    'Final Sentiment': 'neutral',  # Most objective texts are neutral
    'Final Polarization': 'objective',
    'is_augmented': True
})

print(f"\\n{'='*70}")
print(f"✅ OBJECTIVE CLASS COMPLETE!")
print(f"{'='*70}")
print(f"📊 Original: {len(objective_texts)}")
print(f"📊 Augmented: {len(unique_obj)}")
print(f"📊 Total: {len(objective_texts) + len(unique_obj)}")
print(f"📊 Multiplier: {(len(objective_texts) + len(unique_obj)) / len(objective_texts):.2f}x")

print("\\n" + "="*70)
print("🎯 PHASE 2: AUGMENTING NEUTRAL CLASS")
print("="*70)

# Extract neutral samples
neutral_samples = df[df['Final Sentiment'] == 'neutral']
neutral_texts = neutral_samples['Comment'].tolist()

print(f"\\n📊 Original neutral samples: {len(neutral_texts)}")
print(f"🎯 Target: ~{len(neutral_texts) * 3} samples (3x)")

# Augment using XLM-RoBERTa (2x to get 3x total with originals)
augmented_neu = xlmr_aug.augment_batch(neutral_texts, multiplier=2)

# Quality filter
print("\\n🔍 Applying quality filter...")
filtered_neu = quality_filter.filter_augmented(neutral_texts, augmented_neu)

# Remove duplicates
print("\\n🔍 Removing duplicates...")
unique_neu = quality_filter.remove_duplicates(filtered_neu)

# Limit to target if we have too many
target_neu = len(neutral_texts) * 2  # 2x augmented + 1x original = 3x total
if len(unique_neu) > target_neu:
    print(f"\\n⚠️  Limiting to {target_neu} samples")
    unique_neu = np.random.choice(unique_neu, target_neu, replace=False).tolist()

# Get polarization distribution for neutral samples
neu_pol_dist = neutral_samples['Final Polarization'].value_counts(normalize=True).to_dict()
pol_labels = np.random.choice(
    list(neu_pol_dist.keys()),
    size=len(unique_neu),
    p=list(neu_pol_dist.values())
)

# Create augmented dataframe
aug_neu_df = pd.DataFrame({
    'Title': '',
    'Comment': unique_neu,
    'Final Sentiment': 'neutral',
    'Final Polarization': pol_labels,
    'is_augmented': True
})

print(f"\\n{'='*70}")
print(f"✅ NEUTRAL CLASS COMPLETE!")
print(f"{'='*70}")
print(f"📊 Original: {len(neutral_texts)}")
print(f"📊 Augmented: {len(unique_neu)}")
print(f"📊 Total: {len(neutral_texts) + len(unique_neu)}")
print(f"📊 Multiplier: {(len(neutral_texts) + len(unique_neu)) / len(neutral_texts):.2f}x")

print("\\n" + "="*70)
print("✅ AUGMENTATION COMPLETE!")
print("="*70)


🚀 STARTING AUGMENTATION PROCESS
📦 Loading XLM-RoBERTa for contextual augmentation...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

The following layers were not sharded: lm_head.dense.bias, roberta.encoder.layer.*.attention.self.value.weight, roberta.encoder.layer.*.intermediate.dense.bias, lm_head.decoder.weight, roberta.encoder.layer.*.attention.self.query.bias, roberta.encoder.layer.*.attention.self.query.weight, roberta.encoder.layer.*.output.dense.bias, lm_head.bias, lm_head.layer_norm.bias, roberta.embeddings.word_embeddings.weight, roberta.encoder.layer.*.intermediate.dense.weight, roberta.embeddings.position_embeddings.weight, roberta.embeddings.token_type_embeddings.weight, lm_head.dense.weight, roberta.embeddings.LayerNorm.bias, roberta.encoder.layer.*.attention.self.value.bias, roberta.encoder.layer.*.attention.self.key.bias, roberta.encoder.layer.*.attention.output.LayerNorm.weight, lm_head.decoder.bias, roberta.encoder.layer.*.output.dense.weight, roberta.encoder.layer.*.attention.output.dense.bias, roberta.embeddings.LayerNorm.weight, roberta.encoder.layer.*.output.LayerNorm.weight, roberta.encoder.l

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

---
## 💾 SECTION 6: Combine, Save & Download
Merge original + augmented data and prepare for training


In [None]:
print("="*70)
print("💾 COMBINING AND SAVING DATASET")
print("="*70)

# Add is_augmented column to original data
df['is_augmented'] = False

# Combine all dataframes
df_final = pd.concat([df, aug_obj_df, aug_neu_df], ignore_index=True)

# Shuffle
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
output_filename = 'augmented_adjudications_2025-10-22.csv'
df_final.to_csv(output_filename, index=False)

print(f"\\n✅ Saved to: {output_filename}")
print(f"\\n📊 Final Dataset Statistics:")
print(f"   • Total samples: {len(df_final)}")
print(f"   • Original samples: {(~df_final['is_augmented']).sum()}")
print(f"   • Augmented samples: {df_final['is_augmented'].sum()}")
print(f"   • Augmentation rate: {df_final['is_augmented'].sum() / len(df) * 100:.1f}%")

print(f"\\n📊 Final Sentiment Distribution:")
print(df_final['Final Sentiment'].value_counts())

print(f"\\n📊 Final Polarization Distribution:")
print(df_final['Final Polarization'].value_counts())

# Calculate improvements
obj_before = len(df[df['Final Polarization'] == 'objective'])
obj_after = len(df_final[df_final['Final Polarization'] == 'objective'])
obj_improvement = (obj_after - obj_before) / obj_before * 100

neu_before = len(df[df['Final Sentiment'] == 'neutral'])
neu_after = len(df_final[df_final['Final Sentiment'] == 'neutral'])
neu_improvement = (neu_after - neu_before) / neu_before * 100

print(f"\\n🎯 Class Improvements:")
print(f"   • Objective: {obj_before} → {obj_after} (+{obj_improvement:.1f}%)")
print(f"   • Neutral: {neu_before} → {neu_after} (+{neu_improvement:.1f}%)")

print("\\n" + "="*70)
print("📥 DOWNLOADING AUGMENTED DATASET...")
print("="*70)

# Download
files.download(output_filename)

print(f"\\n✅ Downloaded: {output_filename}")

# Print next steps
print("\\n" + "="*70)
print("🎉 AUGMENTATION COMPLETE!")
print("="*70)

print(f"""
📋 NEXT STEPS FOR RUN #12:

1. Upload {output_filename} to your training Colab

2. Update your XLM_ROBERTA_TRAINING.ipynb configuration:

   CSV_PATH = '/content/{output_filename}'

   # REDUCE OVERSAMPLING (no longer needed!)
   OBJECTIVE_BOOST_MULT = 1.0  # Was 3.5
   NEUTRAL_BOOST_MULT = 1.0    # Was 0.3

   # REDUCE CLASS WEIGHTS
   CLASS_WEIGHT_MULT = {{
       "sentiment": {{
           "neutral": 1.20,    # Was 1.70
       }},
       "polarization": {{
           "objective": 1.30,  # Was 2.80
       }}
   }}

   # OPTIMIZE FOR MORE DATA
   EPOCHS = 15              # Was 20
   BATCH_SIZE = 24          # Was 16
   EARLY_STOP_PATIENCE = 5  # Was 6

3. Train Run #12 with the augmented data!

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🎯 EXPECTED RESULTS:

Run #11 (Current):
  • Overall Macro-F1: 68.36%
  • Objective F1: 50.28%
  • Neutral F1: 55.69%

Run #12 (Expected with Augmented Data):
  • Overall Macro-F1: 73-76% (+5-8%) ✅
  • Objective F1: 65-70% (+15-20%) 🚀
  • Neutral F1: 68-72% (+13-17%) 🚀

🎯 TARGET: 75% Macro-F1 → ACHIEVABLE! ✅

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🚀 Ready to hit 73-76%! Good luck! 🇵🇭
""")
