# 📘 Sentiment-only Pipeline — Pre-trained Models + LLMs (Predictions Only)

This Colab notebook is **dedicated exclusively to sentiment prediction merging and normalization**.
It does **not include any analysis, metrics, or visualization**.
All comments are written in English.


## 🧩 1. Setup and Imports

In [None]:
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Optional: mount Google Drive when the notebook runs inside Google Colab.
# The flag keeps its original name so any other cell that reads it still works.
try:
    import google.colab  # type: ignore  # noqa: F401
except Exception:
    # Deliberately broad: any import failure simply means "not in Colab",
    # in which case the notebook falls back to the local working directory.
    tRY_COLAB = False
else:
    tRY_COLAB = True

if tRY_COLAB:
    # These two lines must be nested under the `if`; outside Colab the
    # `google.colab` package is not importable.
    from google.colab import drive  # type: ignore

    drive.mount('/drive', force_remount=False)

# Define main paths: the Drive project folder in Colab, otherwise the CWD.
PATH_ROOT = Path('/drive/My Drive/Colab Notebooks/infotracer/RESEARCH/') if tRY_COLAB else Path.cwd()
PATH_RESULTS = PATH_ROOT / 'results'
# Create the results folder up front so the export step cannot fail on a
# missing directory.
PATH_RESULTS.mkdir(parents=True, exist_ok=True)

# Input files (all expected under PATH_RESULTS)
F_GT = PATH_RESULTS / 'omd_sent_groundtruth_mex24.csv'  # optional ground-truth
F_DL = PATH_RESULTS / 'omd_sent_PreTrained_pred_mex24.csv'  # pre-trained models
F_LLMS = PATH_RESULTS / 'omd_sent_LLM_pred_mex24.csv'  # LLM predictions

# Output files
F_OUT_CSV = PATH_RESULTS / 'sentiment_predictions_merged.csv'
F_OUT_XLSX = PATH_RESULTS / 'sentiment_predictions_merged.xlsx'

## 📂 2. Load Datasets (Ground Truth Optional)

In [None]:
# Ground truth is optional: without the file, an empty frame with the expected
# columns stands in so later cells can test `df_gt.empty`.
if not F_GT.exists():
    df_gt = pd.DataFrame(columns=['platform', 'text', 'sentiment_label'])
else:
    df_gt = pd.read_csv(F_GT)
    required_unified = {'platform', 'text', 'sentiment_label'}
    # A non-empty set difference means at least one required column is absent.
    if required_unified - set(df_gt.columns):
        raise ValueError(f"Ground-truth file missing columns: {required_unified - set(df_gt.columns)}")

# Pre-trained (DL) model predictions are mandatory.
if F_DL.exists():
    df_dl_raw = pd.read_csv(F_DL)
else:
    raise FileNotFoundError(f"Missing file: {F_DL}")

# LLM predictions are mandatory as well.
if F_LLMS.exists():
    df_llm_raw = pd.read_csv(F_LLMS)
else:
    raise FileNotFoundError(f"Missing file: {F_LLMS}")

## 🧠 3. Label Normalization (Sentiment Only)


In [None]:
# Canonical sentiment labels kept as-is in the output; everything else
# becomes 'INDETERMINATE'.
VALID = {"NEG", "POS", "NEU"}

# Numeric class ids -> canonical label.
label_map_numeric = {0: "NEG", 1: "POS", 2: "NEU"}

# Lower-cased textual spellings (English and Spanish) -> canonical label.
label_map_textual = {
    'negative': 'NEG', 'neg': 'NEG', 'positive': 'POS', 'pos': 'POS', 'neutral': 'NEU', 'neu': 'NEU',
    'negativo': 'NEG', 'positivo': 'POS', 'neutro': 'NEU', 'neutralidad': 'NEU',
    'cannot_predict': 'INDETERMINATE', 'indeterminate': 'INDETERMINATE', 'nan': 'INDETERMINATE',
}


def normalize_series_to_labels(s: pd.Series) -> pd.Series:
    """Map raw model outputs to 'NEG' / 'POS' / 'NEU' / 'INDETERMINATE'.

    Each value is resolved by the first rule that matches:

    1. the raw value is a key of ``label_map_numeric`` (e.g. ``0``, ``1.0``);
    2. its stripped text parses as a number that is such a key
       (e.g. ``"0"``, ``" 2.0 "``) -- this covers columns read as text because
       they mix numeric ids with strings;
    3. its stripped, lower-cased text is a key of ``label_map_textual``.

    Anything still unresolved, or resolved to a label outside ``VALID``,
    becomes ``'INDETERMINATE'``.

    Args:
        s: Series of raw predictions (numeric ids, label strings, or missing).

    Returns:
        A Series on the same index holding only the four labels above.
    """
    s_str = s.astype(str).str.strip()

    # Rule 1: values that already are numeric class ids.
    mapped = s.map(label_map_numeric)

    # Rule 2: numeric ids stored as text. `dict.get` is used as the mapper so
    # integral floats such as 1.0 hit the integer keys; non-numeric text is
    # coerced to NaN and resolves to None (left for the next rule).
    numeric_codes = pd.to_numeric(s_str, errors='coerce')
    mapped = mapped.fillna(numeric_codes.map(label_map_numeric.get))

    # Rule 3: textual spellings, case-insensitive.
    mapped = mapped.fillna(s_str.str.lower().map(label_map_textual))

    # Collapse everything that is not a canonical label.
    return mapped.where(mapped.isin(VALID), 'INDETERMINATE')

## 🧮 4. Select and Normalize Model Columns

In [None]:
# Known prediction columns, listed in the order they should appear in the
# output. Only those actually present in each raw table are kept.
dl_candidates = (
    'sentiment_bert_multilingual', 'sentiment_beto', 'sentiment_pysentimiento', 'sentiment_roberta',
)
llm_candidates = (
    'zero_gpt-3.5-turbo_sentiment', 'zero_gpt-4o-mini_sentiment', 'zero_gpt_o1-mini_sentiment',
    'few_gpt-3.5-turbo_sentiment', 'few_gpt-4o-mini_sentiment', 'few_gpt_o1-mini_sentiment',
)

# DL models
DL_COLS = [name for name in dl_candidates if name in df_dl_raw.columns]
# LLM models
LLM_COLS = [name for name in llm_candidates if name in df_llm_raw.columns]

# There is nothing to merge if neither table has a recognised column.
if not (DL_COLS or LLM_COLS):
    raise ValueError('No prediction columns found.')

# Normalize DL predictions: keep 'text' as the join key and replace every
# selected column with its canonical-label version.
df_dl = df_dl_raw[['text']].assign(
    **{name: normalize_series_to_labels(df_dl_raw[name]) for name in DL_COLS}
)

# Normalize LLM predictions the same way.
df_llm = df_llm_raw[['text']].assign(
    **{name: normalize_series_to_labels(df_llm_raw[name]) for name in LLM_COLS}
)

## 🔗 5. Merge All Predictions into One Table


In [None]:
# Merge ground-truth (if available), DL, and LLMs
def _one_row_per_text(frame: pd.DataFrame, source: str) -> pd.DataFrame:
    """Return `frame` with only the first row kept for each 'text' value.

    A left merge on 'text' emits one output row per matching right-hand row,
    so repeated texts in a prediction table would silently multiply the rows
    of the merged table. Dropped rows are reported, not discarded silently.
    """
    is_repeat = frame.duplicated(subset='text', keep='first')
    if is_repeat.any():
        print(f"[warn] {source}: dropped {int(is_repeat.sum())} duplicate 'text' rows before merging")
    return frame.loc[~is_repeat]


if not df_gt.empty:
    # Ground truth drives the row set; its own duplicates (if any) are kept.
    base = df_gt[['platform', 'text', 'sentiment_label']]
else:
    # No ground truth: use every distinct text seen in either prediction table.
    base = pd.DataFrame({'text': pd.unique(pd.concat([df_dl['text'], df_llm['text']], ignore_index=True))})

# Left merges keep exactly one output row per row of `base`.
merged = (
    base
    .merge(_one_row_per_text(df_dl, 'DL predictions'), on='text', how='left')
    .merge(_one_row_per_text(df_llm, 'LLM predictions'), on='text', how='left')
)

print(f"Merged table shape: {merged.shape}")
print(f"Columns: {list(merged.columns)}")

## 💾 6. Export Final Predictions


In [None]:
# The CSV is the primary artefact: a failure here is allowed to propagate.
merged.to_csv(F_OUT_CSV, index=False, encoding='utf-8')

# Excel export is best-effort (for example, the optional writer engine may be
# missing), so a failure only produces a warning.
try:
    merged.to_excel(F_OUT_XLSX, index=False)
except Exception as e:
    excel_written = False
    print(f"[warn] Excel export skipped: {e}")
else:
    excel_written = True

print(f"\n[export] CSV: {F_OUT_CSV}")
# Report from the outcome of this run; checking whether the file exists would
# mistake a stale workbook left by an earlier run for a fresh export.
print(f"[export] Excel: {F_OUT_XLSX if excel_written else '(skipped)'}")