<a href="https://colab.research.google.com/github/isegura/iso4simplify/blob/main/dataset_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook para analizar "cualquier" dataset para simplificación de textos.

El fichero debe estar en data/DATASET/

El nombre del fichero debe ser DATASET_test.csv

Define DATASET con el nombre de tu dataset.



## 1) Dependencias y modelos de Spacy


In [28]:
!pip -q install textstat rouge_score bert_score
# cargar modelos para spacy
!python -m spacy download es_core_news_sm fr_core_news_sm

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m115.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## 2) Montar Google Drive y cargar dataset

In [29]:
from google.colab import drive
import os, pandas as pd

# Mount Google Drive so the project files are reachable from the Colab runtime.
drive.mount('/content/drive')
# Project root inside Drive; edit this if the project folder lives elsewhere.
PATH='/content/drive/MyDrive/iso4simplify'
# Work from the project root so the relative data/... paths used below resolve.
os.chdir(PATH)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
# Dataset name: selects both the data folder and the prefix of every file read or written.
DATASET='cochrane'  # known options: cochrane, plaba, cochraneauto
# Folder holding the dataset, relative to the project root.
DIR_DATASET=f'data/{DATASET}/'
# Reports, tables and figures are written next to the dataset itself.
OUT_DIR = DIR_DATASET
print("Directorio dataset:", DIR_DATASET)

Directorio dataset: data/cochrane/


### Definir constantes (nombre columnas, etc)

In [31]:
# Set these to the column names used by your CSV.
COL_ID='text_id'
COL_COMPLEX='original'   # original (complex) text
COL_REF='reference'        # simplified (reference) text

# Language code of the texts; lower-cased so later comparisons ('en', 'es', 'fr') match.
LANG='en'
LANG=LANG.lower()

# Language-dependent configuration (currently: which readability metrics to compute).
def configure_language(lang):
    """Return the analysis configuration for a language code.

    Parameters
    ----------
    lang : str
        Language code such as "en", "es" or "fr". Matching is
        case-insensitive and ignores surrounding whitespace.

    Returns
    -------
    dict
        Always contains the key "readability", a list of metric identifiers:
        "fkgl" / "flesch" for English, "szigriszt" / "fernandez_huerta" for
        Spanish and "flesch" for French. For any other language the list is
        empty, so callers that test ``"<metric>" in config["readability"]``
        skip readability instead of failing with a KeyError.
    """
    readability_by_lang = {
        "en": ["fkgl", "flesch"],
        "es": ["szigriszt", "fernandez_huerta"],
        "fr": ["flesch"],
    }
    key = str(lang).strip().lower()
    # Copy the list so callers can modify their config without touching the table.
    return {"readability": list(readability_by_lang.get(key, []))}

# Configuration for the selected language; later cells read CONFIG["readability"].
CONFIG = configure_language(LANG)

In [32]:
PATH_FILE = DIR_DATASET+f'{DATASET}_test.csv'
# Load the test split of the dataset.
df = pd.read_csv(PATH_FILE)
print("Size of the dataset:", df.shape)
# Keep only the id / complex / reference columns. The explicit .copy() makes
# `df` an independent frame, so the column assignments done in later cells
# never write into a slice of the loaded frame (avoids chained-assignment
# warnings when the CSV carries extra columns).
df = df[[COL_ID, COL_COMPLEX, COL_REF]].copy()
print(df.columns)
display(df.head(1))
print('='*50)

Size of the dataset: (125, 3)
Index(['text_id', 'original', 'reference'], dtype='object')


Unnamed: 0,text_id,original,reference
0,CD009601,We included 11 studies including 414 participa...,We searched for study reports and found 11 ran...




In [33]:
import re
import spacy
# English uses the "web" pipeline; every other language uses its "news" pipeline.
model_spacy = 'en_core_web_sm' if LANG == 'en' else f'{LANG}_core_news_sm'

def norm_ws(s: str) -> str:
    """Collapse every run of whitespace in `s` to one space and trim the ends.

    Non-string values are converted with ``str``. Missing values (``None`` or
    a float NaN, which is how pandas represents an empty CSV cell) yield an
    empty string rather than the literal text "None" / "nan", so empty-text
    checks applied to the result still work.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    return re.sub(r"\s+", " ", str(s)).strip()

# Normalise whitespace in both text columns.
df[COL_COMPLEX] = df[COL_COMPLEX].map(norm_ws)
df[COL_REF]  = df[COL_REF].map(norm_ws)

# Load the spaCy pipeline with "ner", "parser" and "tagger" disabled.
# NOTE(review): a later cell reads token.lemma_; with "tagger" disabled the
# lemmatizer may not get the POS tags it relies on for some models - confirm
# the lemmas are what is intended.
nlp = spacy.load(model_spacy, disable=["ner", "parser", "tagger"])
print(model_spacy, " loaded!!!")
# The parser is disabled, so add the sentencizer component to get doc.sents.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")


en_core_web_sm  loaded!!!


Número de palabras y oraciones por texto:

In [34]:
from tqdm.auto import tqdm
import numpy as np
# Seed NumPy's global random generator; SEED is also recorded in the report.
SEED = 42
np.random.seed(SEED)

def tokens_lower(text):
    """Return the lower-cased text of each token in `text`, skipping whitespace and punctuation tokens."""
    lowered = []
    for token in nlp(text):
        if token.is_space or token.is_punct:
            continue
        lowered.append(token.text.lower())
    return lowered

def sent_count(text):
    """Return how many sentences the pipeline finds in `text`."""
    return sum(1 for _ in nlp(text).sents)

def word_count(text):
    """Return the number of word tokens in `text` (whitespace and punctuation excluded)."""
    words = tokens_lower(text)
    return len(words)

def jaccard(setA, setB):
    """Jaccard similarity |A & B| / |A | B| of two sets.

    Returns NaN when both sets are empty, because the ratio is undefined.
    """
    if setA or setB:
        shared = len(setA & setB)
        # The union cannot be empty on this branch; `or 1` keeps the divisor >= 1 regardless.
        combined = len(setA | setB) or 1
        return shared / combined
    return np.nan

# Row-level features: length counts, length ratios/deltas and lexical overlap.
# tqdm.pandas() enables the progress_map calls used below.
tqdm.pandas()

# Word counts of the complex (c_) and simplified (s_) texts.
df["c_words"] = df[COL_COMPLEX].progress_map(word_count)
df["s_words"] = df[COL_REF].progress_map(word_count)
# Sentence counts of the complex (c_) and simplified (s_) texts.

df["c_sents"] = df[COL_COMPLEX].progress_map(sent_count)
df["s_sents"] = df[COL_REF].progress_map(sent_count)

# Word count ratio (simple/original); <1 means shorter, >1 means longer.
# A zero denominator is replaced by NaN so the ratio becomes NaN instead of inf.
df["word_ratio_s_over_c"] = df["s_words"] / df["c_words"].replace(0, np.nan)
# Sentence count ratio (simple/original); <1 means fewer sentences, >1 means more sentences.
df["sent_ratio_s_over_c"] = df["s_sents"] / df["c_sents"].replace(0, np.nan)
# Word count difference (simple - original); negative = shorter, positive = longer.
df["delta_words"] = df["s_words"] - df["c_words"]
# Sentence count difference (simple - original); negative = fewer sentences, positive = more sentences.
df["delta_sents"] = df["s_sents"] - df["c_sents"]

# Lexical overlap: Jaccard similarity between the lower-cased word sets of each pair.
# c_set / s_set are helper columns; they are dropped before the enriched CSV is saved.
df["c_set"] = df[COL_COMPLEX].progress_map(lambda x: set(tokens_lower(x)))
df["s_set"] = df[COL_REF].progress_map(lambda x: set(tokens_lower(x)))
df["jaccard_lex"] = df.apply(lambda r: jaccard(r["c_set"], r["s_set"]), axis=1)


  0%|          | 0/125 [00:00<?, ?it/s]



  0%|          | 0/125 [00:00<?, ?it/s]



  0%|          | 0/125 [00:00<?, ?it/s]



  0%|          | 0/125 [00:00<?, ?it/s]



  0%|          | 0/125 [00:00<?, ?it/s]



  0%|          | 0/125 [00:00<?, ?it/s]



## Readability

In [35]:
import textstat
# readability (English)
def safe_metric(fn, text):
    """Apply the readability function `fn` to whitespace-normalised `text`.

    Best effort: returns NaN when the normalised text is empty or when `fn`
    raises, so a single problematic row does not abort the whole column.
    """
    cleaned = norm_ws(text)
    result = np.nan
    if cleaned:
        try:
            result = fn(cleaned)
        except Exception:
            # Deliberate fallback: a metric that fails on a text is recorded as missing.
            result = np.nan
    return result


# Point textstat at the language of the texts.
textstat.set_lang(LANG)

# One entry per supported metric: (key in CONFIG["readability"], column suffix,
# name of the textstat function). The function is looked up only for metrics
# that are enabled, and the entry order fixes the order of the new columns.
readability_specs = (
    ("fkgl", "fkgl", "flesch_kincaid_grade"),
    ("flesch", "fre", "flesch_reading_ease"),
    ("szigriszt", "szigriszt", "szigriszt_pazos"),
    ("fernandez_huerta", "fernandez_huerta", "fernandez_huerta"),
)

for config_key, suffix, fn_name in readability_specs:
    if config_key not in CONFIG["readability"]:
        continue
    metric_fn = getattr(textstat, fn_name)
    # c_ = complex text, s_ = simplified text, delta_ = simplified minus complex.
    df[f"c_{suffix}"] = df[COL_COMPLEX].map(lambda t: safe_metric(metric_fn, t))
    df[f"s_{suffix}"] = df[COL_REF].map(lambda t: safe_metric(metric_fn, t))
    df[f"delta_{suffix}"] = df[f"s_{suffix}"] - df[f"c_{suffix}"]


In [36]:
from rouge_score import rouge_scorer
from nltk.stem.snowball import SnowballStemmer

# Maps the notebook's language codes to the language names SnowballStemmer is built with.
stemmer_map = {
    "en": "english",
    "es": "spanish",
    "fr": "french"
}

# Raises KeyError when LANG is not one of the codes above.
stemmer = SnowballStemmer(stemmer_map[LANG])

# ---- Normalización básica ----
def normalize(text):
    """Return `text` as a space-joined string of stemmed, lower-cased lemmas.

    Whitespace and punctuation tokens are left out.
    """
    return " ".join(
        stemmer.stem(token.lemma_.lower())
        for token in nlp(text)
        if not (token.is_space or token.is_punct)
    )

# ROUGE-L scorer; its own stemming is off because normalize() already stems the tokens.
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

def compute_rouge(references, candidates):
    """Return the ROUGE-L F-measure of each (reference, candidate) pair.

    references: complex (original) texts, used as the scoring target.
    candidates: simplified texts, used as the prediction.
    Both sides are passed through normalize() before scoring.
    """
    return [
        scorer.score(normalize(ref), normalize(cand))["rougeL"].fmeasure
        for ref, cand in zip(references, candidates)
    ]

# ROUGE-L between each complex text (reference side) and its simplification (candidate side).
rougeL = compute_rouge(df[COL_COMPLEX].tolist(), df[COL_REF].tolist())
df["rougeL_f1"] = rougeL




In [37]:
from bert_score import score as bertscore_score
import torch

# Model used for BERTScore (chosen as a multilingual model).
model_type="xlm-roberta-large"

# =========================
# BERTScore (semantic similarity): simple vs complex
# =========================
# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Simplified texts are the candidates and complex texts the references.
# NOTE(review): both lang and model_type are passed; presumably model_type
# decides which model is loaded - confirm against the bert_score documentation.
P, R, F1 = bertscore_score(
    cands=df[COL_REF].tolist(),
    refs=df[COL_COMPLEX].tolist(),
    lang=LANG,
    model_type=model_type,
    device=device,
    verbose=False
)
# Only F1 is stored; P and R are not used in the cells shown here.
df["bertscore_f1"] = F1.numpy()

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: xlm-roberta-large
Key                       | Status     |  | 
--------------------------+------------+--+-
lm_head.layer_norm.weight | UNEXPECTED |  | 
lm_head.dense.weight      | UNEXPECTED |  | 
lm_head.dense.bias        | UNEXPECTED |  | 
lm_head.layer_norm.bias   | UNEXPECTED |  | 
lm_head.bias              | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [38]:
import platform

# 8) Summary helpers
def summarize(series: pd.Series):
    """Descriptive statistics of a column, ignoring non-numeric and missing values.

    Values are coerced to numbers (anything unparseable becomes NaN) and NaNs
    are dropped. Returns {"n": 0} when nothing is left; otherwise a dict with
    n, mean, std (sample standard deviation, 0.0 for a single value), median,
    p25, p75, min and max, all as plain Python numbers.
    """
    values = pd.to_numeric(series, errors="coerce").dropna().to_numpy()
    count = len(values)
    if count == 0:
        return {"n": 0}

    # The sample standard deviation is undefined for one observation; report 0.0.
    spread = float(values.std(ddof=1)) if count > 1 else 0.0

    stats = {"n": int(count)}
    stats["mean"] = float(values.mean())
    stats["std"] = spread
    stats["median"] = float(np.median(values))
    stats["p25"] = float(np.percentile(values, 25))
    stats["p75"] = float(np.percentile(values, 75))
    stats["min"] = float(values.min())
    stats["max"] = float(values.max())
    return stats

# Summaries of every readability column present in df, keyed by column name.
readability_block = {}
known_readability = ("fkgl", "fre", "szigriszt", "fernandez_huerta")

for col in df.columns:
    # Only columns carrying one of the c_ / s_ / delta_ prefixes are candidates.
    if not col.startswith(("c_", "s_", "delta_")):
        continue

    # Everything after the first underscore is the metric name.
    metric_name = col.partition("_")[2]

    # Length and overlap columns share the prefixes; keep readability metrics only.
    if metric_name not in known_readability:
        continue

    readability_block[col] = summarize(df[col])

# Full report: dataset identity, length statistics, ratios, readability,
# overlap/semantic scores and the software environment used for the run.
report = {
    "dataset": {
        "file": PATH_FILE,
        "n_rows": int(len(df)),
        "n_unique_ids": int(df[COL_ID].nunique()),
        "seed": SEED,
    },
    # Lengths of the complex (original) texts.
    "length_complex": {
        "words": summarize(df["c_words"]),
        "sents": summarize(df["c_sents"]),
    },
    # Lengths of the simplified (reference) texts.
    "length_simple": {
        "words": summarize(df["s_words"]),
        "sents": summarize(df["s_sents"]),
    },
    # Simple-over-complex ratios plus lexical overlap.
    "ratios": {
        "word_ratio_s_over_c": summarize(df["word_ratio_s_over_c"]),
        "sent_ratio_s_over_c": summarize(df["sent_ratio_s_over_c"]),
        "jaccard_lex": summarize(df["jaccard_lex"]),
    },
    # Contains only the readability columns that exist in df.
    "readability": readability_block,
    "overlap_semantics": {
        "rougeL_f1": summarize(df["rougeL_f1"]),
        "bertscore_f1": summarize(df["bertscore_f1"]),
    },
    # Versions recorded for reproducibility.
    "environment": {
        "python": platform.python_version(),
        "platform": platform.platform(),
        "spacy": spacy.__version__,
        "pandas": pd.__version__,
        "numpy": np.__version__,
    }
}

In [39]:
import json
# Write the report as UTF-8 JSON; ensure_ascii=False keeps non-ASCII characters readable.
with open(os.path.join(OUT_DIR, f"{DATASET}_report.json"), "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

# 9) Paper table (CSV + LaTeX)

# Rows of the paper table, each [label, mean, std, median]; filled by safe_row.
rows = []

def safe_row(label, block, key):
    """Append a table row for `block[key]` to `rows` when that metric exists.

    Parameters
    ----------
    label : str
        Text shown in the "Metric" column.
    block : dict
        Mapping from metric key to a statistics dict.
    key : str
        Metric to look up in `block`; nothing is added when it is absent.

    A statistics dict may lack "mean" / "std" / "median" (a column with no
    numeric values is summarised as {"n": 0}); those cells are left blank
    instead of raising KeyError.
    """
    if key not in block:
        return
    stats = block[key]
    rows.append([label] + [stats.get(field, "") for field in ("mean", "std", "median")])

# Always present: number of documents (no std / median for this row).
rows.append(["#docs", len(df), "", ""])

# Length statistics (words).
safe_row("Complex words", report["length_complex"], "words")
safe_row("Simple words", report["length_simple"], "words")
safe_row("Word ratio (S/C)", report["ratios"], "word_ratio_s_over_c")

# Length statistics (sentences).
safe_row("Complex sents", report["length_complex"], "sents")
safe_row("Simple sents", report["length_simple"], "sents")
safe_row("Sent ratio (S/C)", report["ratios"], "sent_ratio_s_over_c")

# Readability: safe_row adds a row only for metrics present in the report,
# so the rows that appear depend on the language.
safe_row("FKGL complex", report["readability"], "c_fkgl")
safe_row("FKGL simple", report["readability"], "s_fkgl")
safe_row("ΔFKGL (S - C)", report["readability"], "delta_fkgl")

safe_row("FRE complex", report["readability"], "c_fre")
safe_row("FRE simple", report["readability"], "s_fre")
safe_row("ΔFRE (S - C)", report["readability"], "delta_fre")

safe_row("Szigriszt complex", report["readability"], "c_szigriszt")
safe_row("Szigriszt simple", report["readability"], "s_szigriszt")
safe_row("ΔSzigriszt (S - C)", report["readability"], "delta_szigriszt")

safe_row("Fernández-Huerta complex", report["readability"], "c_fernandez_huerta")
safe_row("Fernández-Huerta simple", report["readability"], "s_fernandez_huerta")
safe_row("ΔFernández-Huerta (S - C)", report["readability"], "delta_fernandez_huerta")

# NOTE(review): no c_kandel / s_kandel / delta_kandel columns are computed in
# the cells shown here, so these rows are presumably always skipped - confirm or remove.
safe_row("Kandel-Moles complex", report["readability"], "c_kandel")
safe_row("Kandel-Moles simple", report["readability"], "s_kandel")
safe_row("ΔKandel-Moles (S - C)", report["readability"], "delta_kandel")

# Overlap between the simplified and the complex text.
safe_row("Jaccard lex", report["ratios"], "jaccard_lex")
safe_row("ROUGE-L F1 (S vs C)", report["overlap_semantics"], "rougeL_f1")
safe_row("BERTScore F1 (S vs C)", report["overlap_semantics"], "bertscore_f1")

paper_table = pd.DataFrame(rows, columns=["Metric", "Mean", "Std", "Median"])

# Export the table as CSV and as LaTeX (floats formatted with 4 decimals).
paper_table.to_csv(os.path.join(OUT_DIR, f"{DATASET}_table.csv"), index=False)

with open(os.path.join(OUT_DIR, f"{DATASET}_table.tex"), "w", encoding="utf-8") as f:
    f.write(paper_table.to_latex(index=False, float_format="%.4f"))



In [40]:
import matplotlib.pyplot as plt

# =========================
# 8) Figuras (PNG) para paper
# =========================
def save_hist(series, title, filename, bins=50):
    """Save a histogram of the numeric values of `series` as a PNG in OUT_DIR.

    series:   values to plot; non-numeric and missing entries are dropped.
    title:    used both as the figure title and as the x-axis label.
    filename: name of the output file inside OUT_DIR.
    bins:     number of histogram bins.
    """
    values = pd.to_numeric(series, errors="coerce").dropna()
    fig, ax = plt.subplots()
    ax.hist(values, bins=bins)
    ax.set_title(title)
    ax.set_xlabel(title)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(os.path.join(OUT_DIR, filename), dpi=200)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# Histograms to export: (df column, title, output file name).
# The readability deltas exist only for languages whose configuration computes
# them (delta_fkgl / delta_fre are not created for every LANG), so a plot is
# skipped when its column is missing instead of raising KeyError.
hist_specs = [
    ("word_ratio_s_over_c", f"Word ratio (simple/complex) in {DATASET.upper()}", f"{DATASET}_hist_word_ratio.png"),
    ("delta_fkgl", f"Delta FKGL (simple - complex) in {DATASET.upper()}", f"{DATASET}_hist_delta_fkgl.png"),
    ("delta_fre", f"Delta Flesch Reading Ease (simple - complex) in {DATASET.upper()}", f"{DATASET}_hist_delta_fre.png"),
    ("jaccard_lex", f"Jaccard lexical overlap in {DATASET.upper()}", f"{DATASET}_hist_jaccard.png"),
    ("bertscore_f1", f"BERTScore F1 (simple vs complex) in {DATASET.upper()}", f"{DATASET}_hist_bertscore.png"),
]
for hist_col, hist_title, hist_file in hist_specs:
    if hist_col in df.columns:
        save_hist(df[hist_col], hist_title, hist_file)

# Scatter of length change vs FKGL change; drawn only when both columns exist
# (with the current configuration FKGL is computed for English only).
if {"delta_words", "delta_fkgl"} <= set(df.columns):
    plt.figure()
    plt.scatter(df["delta_words"], df["delta_fkgl"])
    plt.xlabel("Delta words (simple - complex)")
    plt.ylabel("Delta FKGL (simple - complex)")
    plt.title(f"Length change vs readability change in {DATASET.upper()}")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"{DATASET}_scatter_delta_words_vs_delta_fkgl.png"), dpi=200)
    plt.close()

In [41]:
# Save the enriched CSV: the input columns plus every computed feature.
# The helper set columns (c_set / s_set) are dropped first; errors="ignore" tolerates their absence.
df.drop(columns=["c_set","s_set"], errors="ignore").to_csv(os.path.join(OUT_DIR, f"{DATASET}_test_enriched.csv"), index=False)

# List the files written by the notebook.
print("✅ Done. Outputs saved in:", OUT_DIR)
print("Main files:")
print(f" - {DATASET}_report.json")
print(f" - {DATASET}_table.csv / {DATASET}_table.tex")
print(f" - {DATASET}_test_enriched.csv")
print(f" - figures: {DATASET}_hist_*.png and {DATASET}_scatter_*.png")

✅ Done. Outputs saved in: data/cochrane/
Main files:
 - cochrane_report.json
 - cochrane_table.csv / cochrane_table.tex
 - cochrane_test_enriched.csv
 - figures: cochrane_hist_*.png and cochrane_scatter_*.png
