In [4]:
from __future__ import annotations
import ast
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# =========================== CONFIG ===========================
# Path of the sentence-level CSV to analyse. The CSV_PATH environment variable
# takes precedence; otherwise the file name below is resolved relative to the
# current working directory.
CSV_PATH = os.getenv("CSV_PATH", "en_ewt-ud-train_sentences.csv")

# Set True to drop sentence-initial tokens (word_id == 0) from all stats.
# Used as the default for expand_to_tokens(exclude_sent_start=...).
EXCLUDE_SENT_START = False

# Where to save the per-POS summary table (other summaries use their own paths).
OUT_CSV = Path("pos_summary.csv")


# ======================== SMALL HELPERS =======================
def _to_list(x):
    """Parse list-like strings to Python lists; leave lists as-is."""
    if isinstance(x, str) and x.startswith("["):
        return ast.literal_eval(x)
    return x


def _pick_col(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the first column from candidates that exists in df, else None."""
    for c in candidates:
        if c in df.columns:
            return c
    return None


def _mode_and_pct(series: pd.Series, denom: int) -> Tuple[Optional[object], float, int]:
    """Return (most_frequent_value, pct_of_pos, count). If empty -> (None, 0.0, 0)."""
    s = series.dropna()
    if s.empty or denom == 0:
        return None, 0.0, 0
    vc = s.value_counts()
    top_val = vc.index[0]
    top_n = int(vc.iloc[0])
    pct = 100.0 * top_n / float(denom)
    return top_val, pct, top_n


# ====================== LOADING & EXPANSION ====================
def expand_to_tokens(
    csv_path: str,
    exclude_sent_start: bool = EXCLUDE_SENT_START,
) -> pd.DataFrame:
    """
    Read sentence-level rows and expand to token-level rows with columns:
      sentence_id, word_id, word, pos, index, length, head_dist, relation_type, arity,
      word_norm

    Parameters
    ----------
    csv_path : path of the sentence-level CSV, read with ``pd.read_csv``.
    exclude_sent_start : when True, tokens with word_id == 0 are not emitted.

    Behaviour
    ---------
    * Column names are resolved through the candidate lists below; the
      sentence-id, tokens and POS columns are required.
    * List-valued cells are parsed with ``_to_list``. Within one sentence all
      available lists are truncated to the shortest one.
    * Optional features whose column is absent are filled with NaN. An optional
      cell that is not a list/tuple (e.g. an empty CSV cell) is treated as
      absent for that sentence only.
    * Sentences whose tokens or POS cell is not a list/tuple are skipped, and a
      single ``UserWarning`` reports how many were skipped.
    * ``pos`` is upper-cased, ``length`` is the character length of the token,
      and ``word_norm`` is the case-folded token text (used for type counting).

    Raises
    ------
    ValueError : if any required column cannot be found.
    """
    import warnings  # local import: only needed for the skipped-row notice

    df_all = pd.read_csv(csv_path)

    sid_col   = _pick_col(df_all, ["sentence_id", "sent_id", "sid", "SentenceID"])
    tok_col   = _pick_col(df_all, ["tokens", "Tokens", "words", "Words"])
    pos_col   = _pick_col(df_all, ["pos", "upos", "POS", "Pos"])
    idx_col   = _pick_col(df_all, ["index", "token_index", "position", "positions", "idx"])
    hd_col    = _pick_col(df_all, ["head_dist", "head_distance", "dep_head_dist", "gov_dist"])
    rel_col   = _pick_col(df_all, ["relation_type", "deprel", "typed_dependency", "relation", "ud_rel", "rel"])
    arity_col = _pick_col(df_all, ["arity", "ariety", "ARITY"])

    # Minimal required columns
    missing = [name for name, col in dict(sentence_id=sid_col, tokens=tok_col, pos=pos_col).items() if col is None]
    if missing:
        raise ValueError(
            f"Missing required columns: {missing}. "
            f"Found columns: {list(df_all.columns)[:20]}..."
        )

    def _parsed_cells(col):
        """Return the parsed cells of ``col`` as a plain list, or None if the column is absent."""
        if not col:
            return None
        return df_all[col].apply(_to_list).tolist()

    def _as_sequence(cell):
        """Return ``cell`` if it is a list/tuple, else None (NaN, scalars, raw strings)."""
        return cell if isinstance(cell, (list, tuple)) else None

    # Columns are walked positionally instead of via itertuples()/getattr, so
    # the lookup does not depend on column names being valid attribute names.
    sentence_ids = df_all[sid_col].astype(str).tolist()
    token_cells  = _parsed_cells(tok_col)
    pos_cells    = _parsed_cells(pos_col)
    # Order matches the output columns: index, head_dist, relation_type, arity.
    optional_cells = [_parsed_cells(c) for c in (idx_col, hd_col, rel_col, arity_col)]

    rows = []
    skipped = 0
    for row_no, sid in enumerate(sentence_ids):
        toks = _as_sequence(token_cells[row_no])
        posl = _as_sequence(pos_cells[row_no])
        if toks is None or posl is None:
            # Without both tokens and POS tags there is nothing to expand.
            skipped += 1
            continue

        feats = [
            _as_sequence(cells[row_no]) if cells is not None else None
            for cells in optional_cells
        ]

        # Align lengths conservatively: never index past the shortest list.
        n_aligned = min([len(toks), len(posl)] + [len(f) for f in feats if f is not None])

        for wid in range(n_aligned):
            if exclude_sent_start and wid == 0:
                continue

            token = toks[wid]
            tag   = posl[wid]
            pos   = str(tag).upper() if tag is not None else None
            index, hd, rel, arity = (f[wid] if f is not None else np.nan for f in feats)

            # Character length of the surface token
            length = len(str(token))

            rows.append((sid, wid, token, pos, index, length, hd, rel, arity))

    if skipped:
        warnings.warn(
            f"Skipped {skipped} sentence row(s) whose tokens/POS cell was not a list.",
            stacklevel=2,
        )

    tok_df = pd.DataFrame(
        rows,
        columns=[
            "sentence_id", "word_id", "word", "pos",
            "index", "length", "head_dist", "relation_type", "arity"
        ],
    )

    # Normalize token text for type counting
    tok_df["word_norm"] = tok_df["word"].astype(str).str.casefold()
    return tok_df


# ====================== PER‑POS SUMMARY LOGIC ===================
def build_pos_summary(tokens_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build one summary row per POS tag.

    For every POS group the row holds ``n_tokens`` (group size), ``n_types``
    (distinct ``word_norm`` values) and, for each of index / length /
    head_dist / relation_type / arity, the most frequent value
    (``top_<feature>``) with its share of the group in percent
    (``top_<feature>_pct``, rounded to 2 decimals). A feature column that is
    absent from ``tokens_df`` yields ``None`` / ``0.0``.

    Rows are sorted by ``n_tokens`` descending, then ``pos`` ascending.
    The output columns are declared explicitly so that an input with no POS
    groups returns an empty frame with the usual columns instead of failing in
    ``sort_values``.

    Raises:
        ValueError: if ``tokens_df`` has no ``pos`` column.
    """
    if "pos" not in tokens_df.columns:
        raise ValueError("Expanded token frame has no 'pos' column; check inputs.")

    features = ["index", "length", "head_dist", "relation_type", "arity"]
    out_columns = ["pos", "n_tokens", "n_types"]
    for feature in features:
        out_columns += [f"top_{feature}", f"top_{feature}_pct"]

    records = []
    for pos, g in tokens_df.groupby("pos", dropna=True):
        n_tokens = len(g)
        record = {
            "pos": pos,
            "n_tokens": n_tokens,
            "n_types": g["word_norm"].nunique(),
        }
        for feature in features:
            if feature in g.columns:
                top_val, pct, _ = _mode_and_pct(g[feature], denom=n_tokens)
            else:
                top_val, pct = None, 0.0
            record[f"top_{feature}"] = top_val
            record[f"top_{feature}_pct"] = round(pct, 2)
        records.append(record)

    summary = (
        pd.DataFrame(records, columns=out_columns)
        .sort_values(["n_tokens", "pos"], ascending=[False, True])
        .reset_index(drop=True)
    )
    return summary


# =============================== MAIN ===============================
if __name__ == "__main__":
    # Load the sentence-level CSV and expand it to one row per token.
    # df_tokens is reused by the later summary cells.
    print(f"Reading: {CSV_PATH}")
    df_tokens = expand_to_tokens(CSV_PATH, exclude_sent_start=EXCLUDE_SENT_START)
    print(
        f"Expanded to {len(df_tokens):,} token rows "
        f"(POS present for {df_tokens['pos'].notna().mean()*100:.1f}% of tokens)."
    )

    # Per-POS summary, printed without pandas' row/column truncation.
    summary = build_pos_summary(df_tokens)
    print("\nPer‑POS summary (top values and percentages):")
    with pd.option_context("display.max_rows", None,
                           "display.max_columns", None,
                           "display.width", 160):
        print(summary)

    # Persist the table; create the target directory first if needed.
    OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    summary.to_csv(OUT_CSV, index=False)
    print(f"\n✓ Saved summary to {OUT_CSV.resolve()}")


Reading: en_ewt-ud-train_sentences.csv
Expanded to 194,916 token rows (POS present for 100.0% of tokens).

Per‑POS summary (top values and percentages):
      pos  n_tokens  n_types  top_index  top_index_pct  top_length  top_length_pct  top_head_dist  top_head_dist_pct top_relation_type  top_relation_type_pct  \
0    NOUN     33607     6903          6           5.54           4           18.33             -2              18.67               obj                  20.22   
1   PUNCT     22123       94         10           4.48           1           96.14             -1              15.46             punct                 100.00   
2    VERB     22095     3538          3           8.07           4           27.70              0              30.91              root                  30.91   
3    PRON     18255      138          1          18.33           2           26.52              1              37.16             nsubj                  55.28   
4     ADP     17557      116          6   

In [5]:
def build_index_summary(tokens_df: pd.DataFrame, max_index: int = 10) -> pd.DataFrame:
    """
    Build a per-INDEX summary (index ≤ max_index).

    Tokens with a missing ``index`` or an ``index`` above ``max_index`` are
    dropped. Each remaining index value gets one row with ``n_tokens``,
    ``n_types`` (distinct ``word_norm``) and, for pos / length / head_dist /
    relation_type / arity, the most frequent value plus its percentage of the
    group (rounded to 2 decimals). Rows are sorted by ``index``.

    The output columns are declared explicitly so that a frame with no usable
    index values (e.g. an all-NaN ``index`` column) returns an empty summary
    instead of failing in ``sort_values``.

    Raises:
        ValueError: if ``tokens_df`` has no ``index`` column.
    """
    if "index" not in tokens_df.columns:
        raise ValueError("Expanded token frame has no 'index' column; check inputs.")

    features = ["pos", "length", "head_dist", "relation_type", "arity"]
    out_columns = ["index", "n_tokens", "n_types"]
    for feature in features:
        out_columns += [f"top_{feature}", f"top_{feature}_pct"]

    # Keep only index values up to max_index
    df = tokens_df[tokens_df["index"].notna() & (tokens_df["index"] <= max_index)]

    records = []
    for idx_val, g in df.groupby("index", dropna=True):
        n_tokens = len(g)
        record = {
            "index": idx_val,
            "n_tokens": n_tokens,
            "n_types": g["word_norm"].nunique(),
        }
        for feature in features:
            top_val, pct, _ = _mode_and_pct(g[feature], denom=n_tokens)
            record[f"top_{feature}"] = top_val
            record[f"top_{feature}_pct"] = round(pct, 2)
        records.append(record)

    summary = (
        pd.DataFrame(records, columns=out_columns)
        .sort_values(["index"])
        .reset_index(drop=True)
    )
    return summary




In [6]:
if __name__ == "__main__":
    # NOTE: this cell re-reads the CSV and rebuilds df_tokens and the POS
    # summary, repeating the work of the earlier main cell, before adding the
    # per-index summary.
    print(f"Reading: {CSV_PATH}")
    df_tokens = expand_to_tokens(CSV_PATH, exclude_sent_start=EXCLUDE_SENT_START)
    print(
        f"Expanded to {len(df_tokens):,} token rows "
        f"(POS present for {df_tokens['pos'].notna().mean()*100:.1f}% of tokens)."
    )
    # Output path for the per-index table (the POS table goes to OUT_CSV).
    OUT_CSV_INDEX = Path("index_summary.csv")

    # --- POS summary: build, print untruncated, save to OUT_CSV ---
    pos_summary = build_pos_summary(df_tokens)
    print("\nPer‑POS summary (top values and percentages):")
    with pd.option_context("display.max_rows", None,
                           "display.max_columns", None,
                           "display.width", 160):
        print(pos_summary)

    OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    pos_summary.to_csv(OUT_CSV, index=False)
    print(f"\n✓ Saved POS summary to {OUT_CSV.resolve()}")

    # --- INDEX summary: default max_index, print untruncated, save ---
    index_summary = build_index_summary(df_tokens)
    print("\nPer‑INDEX summary (top values and percentages):")
    with pd.option_context("display.max_rows", None,
                           "display.max_columns", None,
                           "display.width", 160):
        print(index_summary)

    OUT_CSV_INDEX.parent.mkdir(parents=True, exist_ok=True)
    index_summary.to_csv(OUT_CSV_INDEX, index=False)
    print(f"✓ Saved INDEX summary to {OUT_CSV_INDEX.resolve()}")


Reading: en_ewt-ud-train_sentences.csv
Expanded to 194,916 token rows (POS present for 100.0% of tokens).

Per‑POS summary (top values and percentages):
      pos  n_tokens  n_types  top_index  top_index_pct  top_length  top_length_pct  top_head_dist  top_head_dist_pct top_relation_type  top_relation_type_pct  \
0    NOUN     33607     6903          6           5.54           4           18.33             -2              18.67               obj                  20.22   
1   PUNCT     22123       94         10           4.48           1           96.14             -1              15.46             punct                 100.00   
2    VERB     22095     3538          3           8.07           4           27.70              0              30.91              root                  30.91   
3    PRON     18255      138          1          18.33           2           26.52              1              37.16             nsubj                  55.28   
4     ADP     17557      116          6   

In [7]:
def build_length_summary(tokens_df: pd.DataFrame, max_length: int = 10) -> pd.DataFrame:
    """
    Build a per-token-length summary (length ≤ max_length).

    Tokens with a missing ``length`` or a ``length`` above ``max_length`` are
    dropped. Each remaining length value gets one row with ``n_tokens``,
    ``n_types`` (distinct ``word_norm``) and, for pos / index / head_dist /
    relation_type / arity, the most frequent value plus its percentage of the
    group (rounded to 2 decimals). Rows are sorted by ``length``.

    The output columns are declared explicitly so that an input with no
    qualifying tokens returns an empty summary instead of failing in
    ``sort_values``.

    Raises:
        ValueError: if ``tokens_df`` has no ``length`` column.
    """
    if "length" not in tokens_df.columns:
        raise ValueError("Expanded token frame has no 'length' column; check inputs.")

    features = ["pos", "index", "head_dist", "relation_type", "arity"]
    out_columns = ["length", "n_tokens", "n_types"]
    for feature in features:
        out_columns += [f"top_{feature}", f"top_{feature}_pct"]

    # Keep only tokens with length <= max_length
    df = tokens_df[tokens_df["length"].notna() & (tokens_df["length"] <= max_length)]

    records = []
    for length_val, g in df.groupby("length", dropna=True):
        n_tokens = len(g)
        record = {
            "length": length_val,
            "n_tokens": n_tokens,
            "n_types": g["word_norm"].nunique(),
        }
        for feature in features:
            top_val, pct, _ = _mode_and_pct(g[feature], denom=n_tokens)
            record[f"top_{feature}"] = top_val
            record[f"top_{feature}_pct"] = round(pct, 2)
        records.append(record)

    summary = (
        pd.DataFrame(records, columns=out_columns)
        .sort_values(["length"])
        .reset_index(drop=True)
    )
    return summary


In [8]:
# Per-length summary for tokens of at most 10 characters. Relies on df_tokens
# from the earlier main cell.
length_summary = build_length_summary(df_tokens, max_length=10)
print("\nPer-LENGTH summary (length ≤ 10):")
# Temporarily lift pandas' display limits so the whole table is printed.
with pd.option_context("display.max_rows", None,
                       "display.max_columns", None,
                       "display.width", 160):
    print(length_summary)

# Written relative to the current working directory.
length_summary.to_csv("length_summary.csv", index=False)
print("✓ Saved LENGTH summary to length_summary.csv")



Per-LENGTH summary (length ≤ 10):
   length  n_tokens  n_types top_pos  top_pos_pct  top_index  top_index_pct  top_head_dist  top_head_dist_pct top_relation_type  top_relation_type_pct  \
0       1     29626       74   PUNCT        71.80          1           5.98              1              21.08             punct                  71.80   
1       2     29918      334     ADP        37.84          1           6.61              1              41.01              case                  35.28   
2       3     36097      911     DET        26.98          1           5.88              1              41.34               det                  26.02   
3       4     33008     1660    NOUN        18.66          2           6.03              1              26.63            advmod                  10.33   
4       5     18892     2012    NOUN        30.75          2           5.92              1              26.32              amod                   9.01   
5       6     14274     2373    NOUN     

In [9]:
def build_head_dist_summary(tokens_df: pd.DataFrame,
                            min_head_dist: int = -6,
                            max_head_dist: int = 6) -> pd.DataFrame:
    """
    Build a per-head_dist summary for head_dist in [min_head_dist, max_head_dist].

    Tokens with a missing ``head_dist`` or one outside the inclusive range are
    dropped. Each remaining value gets one row with ``n_tokens``, ``n_types``
    (distinct ``word_norm``) and, for pos / index / length / relation_type /
    arity, the most frequent value plus its percentage of the group (rounded
    to 2 decimals). Rows are sorted by ``head_dist``.

    The output columns are declared explicitly so that an input with no
    qualifying tokens returns an empty summary instead of failing in
    ``sort_values``.

    Raises:
        ValueError: if ``tokens_df`` has no ``head_dist`` column.
    """
    if "head_dist" not in tokens_df.columns:
        raise ValueError("Expanded token frame has no 'head_dist' column; check inputs.")

    features = ["pos", "index", "length", "relation_type", "arity"]
    out_columns = ["head_dist", "n_tokens", "n_types"]
    for feature in features:
        out_columns += [f"top_{feature}", f"top_{feature}_pct"]

    # Keep only tokens within the specified range
    df = tokens_df[
        tokens_df["head_dist"].notna()
        & (tokens_df["head_dist"] >= min_head_dist)
        & (tokens_df["head_dist"] <= max_head_dist)
    ]

    records = []
    for hd, g in df.groupby("head_dist", dropna=True):
        n_tokens = len(g)
        record = {
            "head_dist": hd,
            "n_tokens": n_tokens,
            "n_types": g["word_norm"].nunique(),
        }
        for feature in features:
            top_val, pct, _ = _mode_and_pct(g[feature], denom=n_tokens)
            record[f"top_{feature}"] = top_val
            record[f"top_{feature}_pct"] = round(pct, 2)
        records.append(record)

    summary = (
        pd.DataFrame(records, columns=out_columns)
        .sort_values(["head_dist"])
        .reset_index(drop=True)
    )
    return summary


In [10]:
# Per-head_dist summary for distances in [-6, 6]. Relies on df_tokens from the
# earlier main cell.
head_dist_summary = build_head_dist_summary(df_tokens,
                                            min_head_dist=-6,
                                            max_head_dist=6)

print("\nPer-HEAD_DIST summary (−6 to 6):")
# Temporarily lift pandas' display limits so the whole table is printed.
with pd.option_context("display.max_rows", None,
                       "display.max_columns", None,
                       "display.width", 160):
    print(head_dist_summary)

# Written relative to the current working directory. A later cell in this
# notebook writes to the same file name.
head_dist_summary.to_csv("head_dist_summary.csv", index=False)



Per-HEAD_DIST summary (−6 to 6):
    head_dist  n_tokens  n_types top_pos  top_pos_pct  top_index  top_index_pct  top_length  top_length_pct top_relation_type  top_relation_type_pct  \
0          -6      3251     1701    NOUN        37.47          8          10.34           1           18.61             punct                  19.26   
1          -5      4545     2268    NOUN        41.41          8           9.42           4           17.29               obl                  21.30   
2          -4      6684     3050    NOUN        46.95          7           8.95           4           18.73               obl                  21.47   
3          -3     10487     4117    NOUN        55.21          6           7.96           4           19.61              nmod                  20.32   
4          -2     15690     5483    NOUN        39.99          5           7.39           4           21.22               obj                  22.54   
5          -1     13038     2647   PUNCT        26.24 

In [11]:
def build_arity_summary(tokens_df: pd.DataFrame, max_arity: int = 4) -> pd.DataFrame:
    """
    Build a per-arity summary (arity ≤ max_arity).

    Tokens with a missing ``arity`` or an ``arity`` above ``max_arity`` are
    dropped. Each remaining arity value gets one row with ``n_tokens``,
    ``n_types`` (distinct ``word_norm``) and, for pos / index / length /
    head_dist / relation_type, the most frequent value plus its percentage of
    the group (rounded to 2 decimals). Rows are sorted by ``arity``.

    The output columns are declared explicitly so that an input with no
    qualifying tokens returns an empty summary instead of failing in
    ``sort_values``.

    Raises:
        ValueError: if ``tokens_df`` has no ``arity`` column.
    """
    if "arity" not in tokens_df.columns:
        raise ValueError("Expanded token frame has no 'arity' column; check inputs.")

    features = ["pos", "index", "length", "head_dist", "relation_type"]
    out_columns = ["arity", "n_tokens", "n_types"]
    for feature in features:
        out_columns += [f"top_{feature}", f"top_{feature}_pct"]

    # Keep only tokens with arity <= max_arity
    df = tokens_df[
        tokens_df["arity"].notna()
        & (tokens_df["arity"] <= max_arity)
    ]

    records = []
    for ar, g in df.groupby("arity", dropna=True):
        n_tokens = len(g)
        record = {
            "arity": ar,
            "n_tokens": n_tokens,
            "n_types": g["word_norm"].nunique(),
        }
        for feature in features:
            top_val, pct, _ = _mode_and_pct(g[feature], denom=n_tokens)
            record[f"top_{feature}"] = top_val
            record[f"top_{feature}_pct"] = round(pct, 2)
        records.append(record)

    summary = (
        pd.DataFrame(records, columns=out_columns)
        .sort_values(["arity"])
        .reset_index(drop=True)
    )
    return summary


In [12]:
# Per-arity summary for arity values of at most 4. Relies on df_tokens from
# the earlier main cell.
arity_summary = build_arity_summary(df_tokens, max_arity=4)

print("\nPer-ARITY summary (≤ 4):")
# Temporarily lift pandas' display limits so the whole table is printed.
with pd.option_context("display.max_rows", None,
                       "display.max_columns", None,
                       "display.width", 160):
    print(arity_summary)

# Written relative to the current working directory.
arity_summary.to_csv("arity_summary.csv", index=False)
print("✓ Saved ARITY summary to arity_summary.csv")



Per-ARITY summary (≤ 4):
   arity  n_tokens  n_types top_pos  top_pos_pct  top_index  top_index_pct  top_length  top_length_pct  top_head_dist  top_head_dist_pct top_relation_type  \
0      0    129064     7581   PUNCT        17.14          1           6.53           3           22.93              1              42.04             punct   
1      1     20239     6441    NOUN        45.31          6           5.68           4           19.97             -2              38.72               obj   
2      2     18037     5846    NOUN        51.82          2           6.32           4           22.05             -3              27.30               obl   
3      3     13546     4621    VERB        44.92          3           8.25           4           23.67              0              18.88              root   
4      4      7856     3169    VERB        54.88          4           8.95           4           26.03              0              31.73              root   

   top_relation_type_pct 

In [13]:
def build_relation_type_summary_top15(tokens_df: pd.DataFrame, top_k: int = 15) -> pd.DataFrame:
    """
    Build a per-relation_type summary for the top-K most frequent
    dependency relations in the dataset (K defaults to 15).

    The ``top_k`` most frequent non-null ``relation_type`` values are selected
    first. Each of them gets one row with ``n_tokens``, ``n_types`` (distinct
    ``word_norm``) and, for pos / index / length / head_dist / arity, the most
    frequent value plus its percentage of the group (rounded to 2 decimals).
    Rows are sorted by ``n_tokens`` descending.

    The output columns are declared explicitly so that an input without any
    relation labels returns an empty summary instead of failing in
    ``sort_values``.

    Raises:
        ValueError: if ``tokens_df`` has no ``relation_type`` column.
    """
    if "relation_type" not in tokens_df.columns:
        raise ValueError("Expanded token frame has no 'relation_type' column; check inputs.")

    features = ["pos", "index", "length", "head_dist", "arity"]
    out_columns = ["relation_type", "n_tokens", "n_types"]
    for feature in features:
        out_columns += [f"top_{feature}", f"top_{feature}_pct"]

    # Count relations → pick top K
    vc = tokens_df["relation_type"].value_counts(dropna=True)
    top_relations = set(vc.iloc[:top_k].index)

    # Filter to only those
    df = tokens_df[tokens_df["relation_type"].isin(top_relations)]

    records = []
    for rel, g in df.groupby("relation_type", dropna=True):
        n_tokens = len(g)
        record = {
            "relation_type": rel,
            "n_tokens": n_tokens,
            "n_types": g["word_norm"].nunique(),
        }
        for feature in features:
            top_val, pct, _ = _mode_and_pct(g[feature], denom=n_tokens)
            record[f"top_{feature}"] = top_val
            record[f"top_{feature}_pct"] = round(pct, 2)
        records.append(record)

    summary = (
        pd.DataFrame(records, columns=out_columns)
        .sort_values(["n_tokens"], ascending=False)
        .reset_index(drop=True)
    )
    return summary


In [14]:
# Summary of the 15 most frequent dependency relations. Relies on df_tokens
# from the earlier main cell.
rel_summary = build_relation_type_summary_top15(df_tokens, top_k=15)

print("\nTop-15 RELATION_TYPE summary:")
# Temporarily lift pandas' display limits so the whole table is printed.
with pd.option_context("display.max_rows", None,
                       "display.max_columns", None,
                       "display.width", 160):
    print(rel_summary)

# Written relative to the current working directory.
rel_summary.to_csv("relation_type_top15_summary.csv", index=False)
print("✓ Saved to relation_type_top15_summary.csv")



Top-15 RELATION_TYPE summary:
   relation_type  n_tokens  n_types top_pos  top_pos_pct  top_index  top_index_pct  top_length  top_length_pct  top_head_dist  top_head_dist_pct  top_arity  \
0          punct     22123       94   PUNCT       100.00         10           4.48           1           96.14             -1              15.46          0   
1           case     16577      128     ADP        97.79          6           6.01           2           63.67              2              37.04          0   
2          nsubj     15648     2562    PRON        64.49          1          21.20           4           19.67              1              35.85          0   
3            det     15562       31     DET       100.00          1           7.36           3           60.36              1              58.85          0   
4         advmod     10307      704     ADV        89.64          1           7.98           4           33.07              1              43.64          0   
5           roo

In [15]:
import collections
import pandas as pd
def top_n_and_pct(series: pd.Series, denom: int, n: int = 5):
    """
    Return the top n values in the series, their counts, and their percentages of denom.

    Parameters:
        series: values to rank; NaNs are ignored.
        denom:  denominator for the percentages.
        n:      maximum number of values to return (most frequent first).

    Returns:
        Three parallel lists: values, counts, pct_of_denom (unrounded).
        All three are empty when the series has no non-null values or when
        ``denom`` is 0; the zero case mirrors the guard in ``_mode_and_pct``
        and avoids a ZeroDivisionError.
    """
    # Value counts (drop NaNs); value_counts() sorts by descending frequency.
    vc = series.dropna().value_counts()
    if vc.empty or denom == 0:
        return [], [], []

    topn = vc.head(n)
    values = topn.index.tolist()
    counts = topn.values.tolist()
    pcts   = [(count / denom) * 100.0 for count in counts]
    return values, counts, pcts


In [16]:
def build_head_dist_summary(tokens_df: pd.DataFrame,
                            min_head_dist: int = -6,
                            max_head_dist: int = 6,
                            top_n: int = 5) -> pd.DataFrame:
    """
    Build a per-head_dist summary listing the ``top_n`` most frequent values
    of each feature for head_dist in [min_head_dist, max_head_dist].

    NOTE: this redefines ``build_head_dist_summary``; once this cell has run it
    replaces the earlier single-mode version defined above in the notebook,
    and the output schema differs (list-valued columns).

    Tokens with a missing ``head_dist`` or one outside the inclusive range are
    dropped. Each remaining value gets one row with ``n_tokens``, ``n_types``
    (distinct ``word_norm``) and, for pos / relation_type / index / length /
    arity, three list-valued columns: ``top_<feature>_vals``,
    ``top_<feature>_counts`` and ``top_<feature>_pct`` (percent of the group,
    each rounded to 2 decimals). Rows are sorted by ``head_dist``.

    The output columns are declared explicitly so that an input with no
    qualifying tokens returns an empty summary instead of failing in
    ``sort_values``.

    Raises:
        ValueError: if ``tokens_df`` has no ``head_dist`` column.
    """
    if "head_dist" not in tokens_df.columns:
        raise ValueError("Expanded token frame has no 'head_dist' column; check inputs.")

    # Order fixes the column order of the output table.
    features = ["pos", "relation_type", "index", "length", "arity"]
    out_columns = ["head_dist", "n_tokens", "n_types"]
    for feature in features:
        out_columns += [
            f"top_{feature}_vals",
            f"top_{feature}_counts",
            f"top_{feature}_pct",
        ]

    df = tokens_df[
        tokens_df["head_dist"].notna()
        & (tokens_df["head_dist"] >= min_head_dist)
        & (tokens_df["head_dist"] <= max_head_dist)
    ]

    records = []
    for hd, g in df.groupby("head_dist", dropna=True):
        n_tokens = len(g)
        record = {
            "head_dist": hd,
            "n_tokens": n_tokens,
            "n_types": g["word_norm"].nunique(),
        }
        for feature in features:
            vals, cnts, pcts = top_n_and_pct(g[feature], denom=n_tokens, n=top_n)
            record[f"top_{feature}_vals"] = vals
            record[f"top_{feature}_counts"] = cnts
            record[f"top_{feature}_pct"] = [round(p, 2) for p in pcts]
        records.append(record)

    summary = (
        pd.DataFrame(records, columns=out_columns)
        .sort_values(["head_dist"])
        .reset_index(drop=True)
    )
    return summary


In [17]:
# Re-run the head_dist summary with the top-n variant of
# build_head_dist_summary (top_n left at its default). This rebinds
# head_dist_summary from the earlier cell.
head_dist_summary = build_head_dist_summary(df_tokens,
                                            min_head_dist=-6,
                                            max_head_dist=6)

print("\nPer-HEAD_DIST summary (−6 to 6):")
# Temporarily lift pandas' display limits so the whole table is printed.
with pd.option_context("display.max_rows", None,
                       "display.max_columns", None,
                       "display.width", 160):
    print(head_dist_summary)

# NOTE: same file name as the earlier head_dist cell, so that output is
# overwritten by this table.
head_dist_summary.to_csv("head_dist_summary.csv", index=False)



Per-HEAD_DIST summary (−6 to 6):
    head_dist  n_tokens  n_types                      top_pos_vals                  top_pos_counts                          top_pos_pct  \
0          -6      3251     1701   [NOUN, VERB, PUNCT, PROPN, ADJ]      [1218, 816, 626, 209, 163]     [37.47, 25.1, 19.26, 6.43, 5.01]   
1          -5      4545     2268   [NOUN, VERB, PUNCT, PROPN, ADJ]     [1882, 1058, 694, 358, 191]     [41.41, 23.28, 15.27, 7.88, 4.2]   
2          -4      6684     3050   [NOUN, VERB, PUNCT, PROPN, ADJ]     [3138, 1488, 670, 592, 262]    [46.95, 22.26, 10.02, 8.86, 3.92]   
3          -3     10487     4117   [NOUN, VERB, PROPN, PUNCT, ADV]     [5790, 1839, 936, 685, 352]     [55.21, 17.54, 8.93, 6.53, 3.36]   
4          -2     15690     5483    [NOUN, VERB, PROPN, PRON, NUM]    [6275, 3726, 2110, 842, 646]    [39.99, 23.75, 13.45, 5.37, 4.12]   
5          -1     13038     2647   [PUNCT, PRON, ADV, PROPN, NOUN]  [3421, 2368, 1453, 1375, 1213]    [26.24, 18.16, 11.14, 10.55, 9