# 02 — Data Preprocessing (Triple Pipeline)

This notebook prepares THREE variations of the dataset:
1. **Standard**: Basic cleaning (lowercase, remove URLs, convert emojis to their text aliases).
2. **Irony-Augmented**: Standard cleaning + `[IRONIA]` tagging for detected colloquialisms.
3. **Obfuscated**: Standard cleaning + NER-based obfuscation of people's names to `[PERSONA]`.

**Output Locations**:
- `../data/processed/standard/`
- `../data/processed/irony/`
- `../data/processed/obfuscated/`

In [None]:
import pandas as pd
import re
import emoji
from sklearn.model_selection import train_test_split
import os

# Create the output folder of every dataset variant (a no-op when it already exists)
for _processed_dir in (
    '../data/processed/standard',
    '../data/processed/irony',
    '../data/processed/obfuscated',
):
    os.makedirs(_processed_dir, exist_ok=True)
# Notebook flavour of tqdm (renders the progress bar as a widget)
from tqdm.notebook import tqdm
# Hook tqdm into pandas; the obfuscated pipeline below relies on `progress_apply`
tqdm.pandas()


In [2]:
# Record the interpreter, package versions and machine details this notebook was run with
%load_ext watermark
%watermark -v -n -m -p numpy,pandas,sklearn,emoji

Python implementation: CPython
Python version       : 3.13.0
IPython version      : 9.10.0

numpy  : 2.4.2
pandas : 3.0.0
sklearn: 1.8.0
emoji  : 2.15.0

Compiler    : Clang 16.0.0 (clang-1600.0.26.4)
OS          : Darwin
Release     : 25.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit



## 1. Load Data & Helper Functions

In [3]:
# Load the raw corpus and build the two columns every pipeline works on:
#   text  = quoted tweet (if any) + " " + tweet, stripped of surrounding whitespace
#   label = the annotated category
corpus_raw = pd.read_csv('../data/raw/corpus.csv')

# Missing values become '' and everything is cast to str, so the concatenation
# cannot fail on NaN or non-string cells (a row-wise `str + float` raises TypeError).
quote_text = corpus_raw['QuoteText'].fillna('').astype(str)
tweet_text = corpus_raw['TweetText'].fillna('').astype(str)

# Vectorised concatenation instead of a Python-level apply over every row
df = pd.DataFrame({
    'text': (quote_text + " " + tweet_text).str.strip(),
    'label': corpus_raw['Categorization'],
})
print(f"Loaded {df.shape[0]} samples")

Loaded 3000 samples


In [4]:
import spacy
# Load the large Spanish spaCy pipeline; when loading fails with OSError
# (treated here as "model not installed"), download it once and load again.
SPACY_MODEL_NAME = "es_core_news_lg"
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    from spacy.cli import download
    download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

def strip_obfuscation_tags(text):
    """Delete bracketed upper-case tags (e.g. [ANATOMIA], [GROSERIA]) ahead of NER.

    A tag is ``[`` + one or more ASCII capitals/underscores + ``]``. Matches are
    removed outright (nothing is inserted in their place); all other text is
    returned unchanged.
    """
    tag_pattern = re.compile(r'\[([A-Z_]+)\]')
    return tag_pattern.sub('', text)

def is_valid_name(ent):
    """
    Decide whether a detected PER entity looks like a real name or a false positive.

    Strategy: every token of the entity is lower-cased and run through the spaCy
    pipeline on its own. If any token then receives a POS tag listed in
    ``invalid_pos`` (i.e. the word can be read as a common word), the whole
    entity is rejected.

    Args:
        ent: Iterable of tokens exposing ``.text`` (the callers pass spaCy entity spans).

    Returns:
        False as soon as one token gets an invalid POS tag, True otherwise
        (including for an entity with no tokens).
    """
    # POS tags that should not be part of a person's name in this context,
    # e.g. 'Rindo' is tagged VERB and 'mitotero' ADJ.
    # NOUN and PROPN are not in the set, so common nouns and words such as 'gil'
    # (tagged PROPN) still pass this check.
    # NOTE(review): spaCy's Universal POS scheme labels coordinating conjunctions
    # 'CCONJ', so 'CONJ' may never match. Confirm intent before changing the set,
    # because doing so alters which entities end up masked.
    invalid_pos = {'VERB', 'AUX', 'ADJ', 'ADV', 'INTJ', 'PRON', 'DET', 'CONJ', 'NUM', 'SCONJ'}

    for token in ent:
        # Run the full pipeline once on the isolated, lower-cased token to get its
        # POS tag. (A preliminary tokenizer-only pass whose result was discarded
        # has been removed.)
        lowered_doc = nlp(token.text.lower())
        if lowered_doc[0].pos_ in invalid_pos:
            return False

    return True

def obfuscate_entities(text):
    """
    Replace person names found by spaCy NER with the ``[PERSONA]`` tag.

    NER runs on a copy of the text with existing obfuscation tags stripped, so
    those tags are not picked up as names; the replacements are then applied to
    the original text, which keeps the tags in place.

    Args:
        text: Input text. Non-string values are returned unchanged.

    Returns:
        The text with every accepted PER entity replaced by ``[PERSONA]``.
    """
    if not isinstance(text, str):
        return text

    # Strip existing obfuscation tags so NER doesn't pick them up as names
    doc = nlp(strip_obfuscation_tags(text))

    # Distinct surface forms of PER entities that pass the false-positive filter
    names = {ent.text for ent in doc.ents if ent.label_ == 'PER' and is_valid_name(ent)}

    # Replace longest names first so that "Juan" cannot consume part of
    # "Juan Pérez" and leave the surname behind. The alphabetical tie-break makes
    # the order (and therefore the output) independent of set/hash ordering.
    for name in sorted(names, key=lambda value: (-len(value), value)):
        # Lookarounds act like \b for names that start/end with a word character,
        # but still match names that begin or end with punctuation (e.g. "@handle"),
        # where \b would never match.
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        text = re.sub(pattern, '[PERSONA]', text)

    return text

def process_obfuscated(text):
    """Clean a text for the Obfuscated dataset.

    Steps, in order: drop URLs, turn emojis into Spanish text aliases, mask
    person names with [PERSONA], lowercase everything except bracketed
    upper-case tags, and collapse whitespace. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""

    tag_splitter = r'(\[[A-ZÁÉÍÓÚÑ]+\])'
    whole_tag = r'^\[[A-ZÁÉÍÓÚÑ]+\]$'

    without_urls = re.sub(r'http\S+|www\.\S+', '', text)
    demojized = emoji.demojize(without_urls, language='es', delimiters=(" :", ": "))
    masked = obfuscate_entities(demojized)

    # Splitting on a capturing group keeps the tags as their own pieces, so they
    # can be passed through untouched while the rest is lower-cased.
    pieces = [
        piece if re.match(whole_tag, piece) else piece.lower()
        for piece in re.split(tag_splitter, masked)
    ]
    return re.sub(r'\s+', ' ', "".join(pieces)).strip()


Collecting es-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.8.0/es_core_news_lg-3.8.0-py3-none-any.whl (568.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.0/568.0 MB[0m [31m37.7 MB/s[0m  [33m0:00:11[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: es-core-news-lg
Successfully installed es-core-news-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
def tag_irony_logic(text):
    """Replace colloquial irony/laughter markers with the ' [IRONIA] ' tag.

    The patterns are applied one after another, in the order listed, each on the
    output of the previous one. Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    irony_patterns = (
        # Laughs
        r'(?i)\b(j+a+){2,}\b',
        r'(?i)\b(j+e+){2,}\b',
        # Specific phrases
        r'\(\?+\)?',
        r'(?i)\bx+d+\b',
        r'(?i)\b(a+h? ?r+e+)\b',
        r'(?i)\bare\b',
        r'(?i)\bbue\b',
        r'(?i)\bwe\b',
        r'(?i)\bbueno no\b',
        r'(?i)\bno bueno\b',
    )
    for pattern in irony_patterns:
        text = re.sub(pattern, ' [IRONIA] ', text)
    return text

def clean_base(text):
    """Base cleaning shared by the Standard and Irony pipelines.

    Drops URLs, turns emojis into Spanish text aliases, lowercases everything
    except bracketed upper-case tags, and collapses whitespace. Non-string
    input yields "".
    """
    if not isinstance(text, str):
        return ""

    tag_splitter = r'(\[[A-ZÁÉÍÓÚÑ]+\])'
    whole_tag = r'^\[[A-ZÁÉÍÓÚÑ]+\]$'

    without_urls = re.sub(r'http\S+|www\.\S+', '', text)
    demojized = emoji.demojize(without_urls, language='es', delimiters=(" :", ": "))

    # The capturing group makes re.split keep each tag as a separate piece, so
    # tags are copied verbatim and only the surrounding text is lower-cased.
    pieces = [
        piece if re.match(whole_tag, piece) else piece.lower()
        for piece in re.split(tag_splitter, demojized)
    ]
    return re.sub(r'\s+', ' ', "".join(pieces)).strip()

def process_standard(text):
    """Standard pipeline: base cleaning only, with no tagging or masking."""
    cleaned = clean_base(text)
    return cleaned

def process_irony(text):
    """Irony pipeline: insert [IRONIA] tags, then apply the base cleaning.

    Tagging runs before cleaning so the upper-case tag is already present when
    the cleaning step lowercases everything except tags.
    """
    return clean_base(tag_irony_logic(text))

## 2. Generate Datasets

We now apply the preprocessing pipelines to generate three distinct datasets:
*   **Standard**: Baseline for model performance.
*   **Irony**: To test if explicit irony tagging helps the model.
*   **Obfuscated**: To test if masking personal names (`[PERSONA]`) improves generalization or reduces bias.

> **Note**: The `Obfuscated` pipeline uses the `es_core_news_lg` model for better accuracy in detecting people's names.

In [None]:
# Every variant keeps the original `text`/`label` columns and adds its own `text_clean`.

# Standard: base cleaning only
df_standard = df.assign(text_clean=df['text'].apply(process_standard))

# Irony: [IRONIA] tagging followed by base cleaning
df_irony = df.assign(text_clean=df['text'].apply(process_irony))

# Obfuscated: runs spaCy on every row, hence the progress bar
df_obfuscated = df.assign(text_clean=df['text'].progress_apply(process_obfuscated))

# Print the same row from each variant for a quick side-by-side check
for _caption, _frame in (
    ("Sample Standard:", df_standard),
    ("Sample Irony:   ", df_irony),
    ("Sample Obfuscated:", df_obfuscated),
):
    print(_caption, _frame['text_clean'].iloc[10])


  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
# Before/after view of a few rows in which a name was masked
print("\n=== Obfuscation Examples (Focus on [PERSONA]) ===\n")

# Rows whose processed text contains at least one [PERSONA] tag
persona_mask = df_obfuscated['text_clean'].str.contains(r'\[PERSONA\]', regex=True)
n_masked = persona_mask.sum()

if n_masked == 0:
    print("No [PERSONA] tags found in the processed text.")
else:
    # Fixed seed so the same (at most 5) examples are shown on every run
    examples = df_obfuscated[persona_mask].sample(min(5, n_masked), random_state=42)
    for idx, processed in examples['text_clean'].items():
        original = df.loc[idx, 'text']
        print(f"Original:  {original}")
        print(f"Processed: {processed}")
        print("-" * 80)


## 3. Split and Save

Finally, we split each dataset into Train (70%), Validation (15%), and Test (15%) sets.
All splits are stratified by label so that each split preserves the overall class proportions.

Files are saved to:
*   `../data/processed/standard/`
*   `../data/processed/irony/`
*   `../data/processed/obfuscated/`

In [None]:
def save_splits(dataframe, name, output_dir, holdout_size=0.3, random_state=42):
    """
    Split a dataset into train/val/test and write each part as a CSV file.

    The data is first split into train and a hold-out part; the hold-out part is
    then halved into validation and test. Both splits are stratified on the
    ``label`` column. With the defaults this gives 70% / 15% / 15%.

    Args:
        dataframe: DataFrame with a ``label`` column to stratify on.
        name: Human-readable dataset name, used only in the log message.
        output_dir: Directory that receives ``train.csv``, ``val.csv`` and
            ``test.csv``. It is created if it does not exist.
        holdout_size: Fraction of rows reserved for validation + test.
        random_state: Seed passed to both splits for reproducibility.

    Returns:
        None. The splits are written to disk.
    """
    # Don't rely on another cell having created the target directory
    os.makedirs(output_dir, exist_ok=True)

    train, temp = train_test_split(
        dataframe, test_size=holdout_size, stratify=dataframe['label'], random_state=random_state
    )
    # Halve the hold-out part into validation and test
    val, test = train_test_split(
        temp, test_size=0.5, stratify=temp['label'], random_state=random_state
    )

    for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
        split_frame.to_csv(os.path.join(output_dir, f'{split_name}.csv'), index=False)
    print(f"Saved {name} splits to {output_dir}")

# Write train/val/test CSVs for each dataset variant
for _variant_frame, _variant_name, _variant_dir in (
    (df_standard, "Standard", "../data/processed/standard"),
    (df_irony, "Irony", "../data/processed/irony"),
    (df_obfuscated, "Obfuscated", "../data/processed/obfuscated"),
):
    save_splits(_variant_frame, _variant_name, _variant_dir)
