## KWIC Search (Keyword In Context)

**How to use**
1. Press ▶️ **Play** on the code cell below (you only need to do this once).
2. Scroll down to the graphical interface (GUI).
3. Choose a file, type a search term, and click **Run KWIC**.

You can run multiple searches by changing the search term and clicking **Run KWIC** again — no need to rerun the notebook cell.

**Supported files**
- **TXT**, **CSV**, **JSON**
- Using a Google Drive folder is recommended (easy reuse), but you can also upload a file once.

**Search modes**
- Exact word or phrase – `love`
- Word with `*` wildcard (each `*` stands for exactly one character) – `l*ve`
- Multiple terms separated by `|` – `love | hate`

**Results**
- Every search is automatically saved as a CSV in the `results` folder.
- The filename includes the search term, source filename, and timestamp.

**Permissions & first-time setup**
When you open this notebook for the first time, Google Colab may ask for several permissions (login, Google Drive access, etc.).  
These are required so the notebook can read input files and save results to your Drive.

👉 If this is your first time using Google Colab, open the file  
**`How-to-Run-Jupyter-Notebooks-in-Google-Colab.docx`**  
in this folder for step-by-step instructions on installing Colab (if needed) and granting permissions.


In [5]:
# ==============================
# KWIC Search Notebook (Google Colab)
#
# This notebook lets you search texts using KWIC (Keyword In Context)
# via a simple graphical interface.
#
# HOW TO USE
# ----------
# 1. Press ▶️ Play to run this cell.
# 2. Scroll down to the graphical interface (GUI).
# 3. Choose a file and enter a search term.
# 4. Click the green “Run KWIC” button.
#
# SUPPORTED FILES
# ---------------
# • TXT  – plain text files
# • CSV  – tabular data (auto-selects a likely text column; you can override)
# • JSON – simple JSON structures
#
# SEARCH MODES
# ------------
# • Exact word or phrase
# • Word with * wildcard (one character)
# • Multiple alternatives separated by |
#
# RESULTS
# -------
# • Each search is saved automatically as a CSV file
# • Files are stored in the “results” folder
# • Filenames include the search term, source file name, and timestamp
#
# FILE ACCESS & PERMISSIONS
# -------------------------
# • Files can be selected from a Google Drive folder (recommended)
# • Or uploaded once from your computer
# • On first use, Colab may ask for permissions (login, Drive access, etc.)
#   These are required so the notebook can read/write files in your Drive folder.
#
# SETTINGS
# --------
# • The notebook remembers your last file and search settings
# • You can rerun searches without reloading the notebook
# ==============================

import csv
import io
import json
import os
import re
import unicodedata
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

try:
    from google.colab import drive  # type: ignore
except Exception:
    drive = None


# ------------------------------
# Defaults / constants
# ------------------------------
# Initial value of the "Drive folder" text box: where input files are looked up.
DEFAULT_DRIVE_FOLDER = "/content/drive/MyDrive/KWIC-notebook"
# Name of the sub-folder (created under the base folder) that receives result CSVs.
DEFAULT_RESULTS_FOLDER_NAME = "results"
# File name, inside the base folder, used to persist the settings dictionary.
SETTINGS_FILENAME = "kwic_last_settings.json"
# Number of leading CSV rows read when guessing the content / ID columns.
CSV_SAMPLE_ROWS_FOR_GUESS = 200  # small for speed


# ------------------------------
# Data models
# ------------------------------
@dataclass
class Record:
    """One searchable unit of text produced by a loader.

    Depending on the loader this is a JSON entry, a whole TXT file, a TXT
    paragraph, or a CSV row.
    """

    record_id: str  # identifier chosen by the loader; printed next to each hit
    text_raw: str  # text exactly as loaded (whitespace not yet normalized)
    meta: Dict[str, Any]  # loader details, e.g. "source" and "encoding"


@dataclass
class KwicHit:
    """A single keyword-in-context match: the hit token plus its left/right context."""

    record_id: str  # record_id of the Record the match was found in
    left: str  # context tokens before the hit, joined with single spaces
    hit: str  # the whitespace-delimited token in which the match starts
    right: str  # context tokens after the hit, joined with single spaces
    meta: Dict[str, Any]  # the originating Record's meta dict (same object, not a copy)


# ------------------------------
# Logging helpers
# ------------------------------
# Latch flipped to True once tprint() has emitted its one-time leading blank line.
_FIRST_PRINT_DONE = False


def tprint(message: str, *, enabled: bool = True) -> None:
    """Print *message* prefixed with the current local date and time.

    Nothing is printed when *enabled* is false. The very first enabled call
    in the session is preceded by one empty line.
    """
    global _FIRST_PRINT_DONE
    if enabled:
        if not _FIRST_PRINT_DONE:
            _FIRST_PRINT_DONE = True
            print()
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"{stamp} - {message}")


def blank_line(*, enabled: bool = True) -> None:
    """Print one empty line to separate console blocks; no-op when *enabled* is false."""
    if not enabled:
        return
    print()


# ------------------------------
# Widget safety (prevents 'Invalid selection')
# ------------------------------
def dropdown_allowed_values(dd: widgets.Dropdown) -> set:
    """Collect every value that may legally be assigned to ``dd.value``.

    An option given as a 2-item tuple/list is read as ``(label, value)`` and
    contributes its second item; any other option contributes itself.
    """
    def option_value(option: Any) -> Any:
        is_pair = isinstance(option, (tuple, list)) and len(option) == 2
        return option[1] if is_pair else option

    return {option_value(option) for option in dd.options}


def safe_set_dropdown_value(dd: widgets.Dropdown, value: Any) -> bool:
    """Assign *value* to the dropdown only when it is one of its options.

    Returns True when the assignment happened, False when *value* was
    rejected (the dropdown is then left untouched).
    """
    if value not in dropdown_allowed_values(dd):
        return False
    dd.value = value
    return True


# ------------------------------
# Core helpers
# ------------------------------
def ensure_drive_mounted(verbose: bool) -> None:
    """Mount Google Drive at ``/content/drive`` unless it is already mounted.

    Does nothing when the ``google.colab`` drive module could not be imported
    (``drive`` is None). Progress messages are printed only when *verbose*.
    """
    if drive is None or os.path.ismount("/content/drive"):
        return
    tprint("Mounting Google Drive...", enabled=verbose)
    drive.mount("/content/drive")
    tprint("Google Drive mounted.", enabled=verbose)
    blank_line(enabled=verbose)


def normalize_whitespace(text: str) -> str:
    """Return *text* with each whitespace run replaced by one space and the ends trimmed.

    A falsy *text* (``None`` or ``""``) yields an empty string.
    """
    source = text if text else ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()


def detect_file_type_from_name(name: str) -> str:
    """Map a file name to ``"json"``, ``"txt"`` or ``"csv"`` via its extension.

    The extension is compared case-insensitively.

    Raises:
        ValueError: if the extension is not one of the three supported ones.
    """
    ext = Path(name).suffix.lower()
    known_types = {".json": "json", ".txt": "txt", ".csv": "csv"}
    file_type = known_types.get(ext)
    if file_type is None:
        raise ValueError(f"Cannot detect file type from extension: {ext}")
    return file_type


def slugify_ascii(value: str, max_len: int = 40) -> str:
    """Turn *value* into a lowercase ASCII slug that is safe inside a file name.

    Accents are dropped via NFKD decomposition, every run of characters
    outside ``a-z0-9_-`` becomes a single underscore, and leading/trailing
    ``_``/``-`` are removed before the result is cut to *max_len* characters.
    Returns ``"query"`` when nothing usable remains.
    """
    cleaned = (value or "").strip().lower()
    if not cleaned:
        return "query"

    decomposed = unicodedata.normalize("NFKD", cleaned)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")

    slug = re.sub(r"[^a-z0-9_-]+", "_", ascii_only)
    slug = re.sub(r"_{2,}", "_", slug).strip("_-")
    if not slug:
        return "query"
    return slug[:max_len]


def build_output_filename(source_filename: str, query: str, timestamp: str) -> str:
    """Compose the result file name ``<query>_<source stem>_<timestamp>.csv``.

    The query and the source file's stem are each slugified and capped at
    40 characters; *timestamp* is inserted as given.
    """
    search_part = slugify_ascii(query, max_len=40)
    stem = Path(source_filename).stem
    source_part = slugify_ascii(stem, max_len=40)
    return f"{search_part}_{source_part}_{timestamp}.csv"


def ensure_results_folder(base_folder: str, verbose: bool) -> str:
    """Create the results sub-folder under *base_folder* if missing and return its path.

    The path is also logged through ``tprint`` when *verbose* is true.
    """
    target = os.path.join(base_folder, DEFAULT_RESULTS_FOLDER_NAME)
    os.makedirs(target, exist_ok=True)
    tprint(f"Results folder: {target}", enabled=verbose)
    return target


# ------------------------------
# Encoding (Auto-detect + user override)
# ------------------------------
# Encodings tried, in this order, by decode_bytes() in "auto" mode; the first
# one that decodes strictly wins.
COMMON_ENCODINGS_TRY_ORDER = ["utf-8-sig", "utf-8", "cp1252", "latin-1"]


def _decode_with_score(data: bytes, enc: str) -> Tuple[bool, int, str]:
    """
    Try decode with encoding.
    Returns: (success_strict, replacement_count, decoded_text)
    Lower replacement_count is better.
    """
    try:
        text = data.decode(enc, errors="strict")
        return True, 0, text
    except Exception:
        text = data.decode(enc, errors="replace")
        rep_count = text.count("\ufffd")  # replacement char count
        return False, rep_count, text


def decode_bytes(file_bytes: bytes, encoding_choice: str) -> Tuple[str, str]:
    """Turn raw file bytes into text, honouring the user's encoding choice.

    *encoding_choice* is matched case-insensitively after trimming. One of
    ``utf-8``, ``utf-8-sig``, ``cp1252`` or ``latin-1`` forces that codec
    (undecodable bytes become U+FFFD). Any other value, including empty or
    ``None``, means "auto": the encodings in ``COMMON_ENCODINGS_TRY_ORDER``
    are tried in order and the first strict success is used; if none succeeds
    strictly, the highest-ranked lenient decode is returned.

    Returns:
        ``(decoded_text, used_encoding_label)``.
    """
    requested = (encoding_choice or "auto").strip().lower()

    # Explicit override: decode leniently with exactly the codec asked for.
    if requested in ("utf-8", "utf-8-sig", "cp1252", "latin-1"):
        return file_bytes.decode(requested, errors="replace"), requested

    # Auto mode: stop at the first clean decode, otherwise remember the best.
    top_candidate = None  # (success_strict, -rep_count, enc, text)
    for enc in COMMON_ENCODINGS_TRY_ORDER:
        success, rep_count, text = _decode_with_score(file_bytes, enc)
        if success and rep_count == 0:
            return text, enc

        ranked = (success, -rep_count, enc, text)
        if top_candidate is None or ranked > top_candidate:
            top_candidate = ranked

    assert top_candidate is not None
    _ok, _neg_reps, chosen_enc, chosen_text = top_candidate
    return chosen_text, chosen_enc


# ------------------------------
# Settings persistence
# ------------------------------
def settings_path(base_folder: str) -> str:
    """Return the full path of the persisted-settings file inside *base_folder*."""
    location = os.path.join(base_folder, SETTINGS_FILENAME)
    return location


def load_settings(base_folder: str) -> Dict[str, Any]:
    """Read the saved settings dictionary from *base_folder*.

    Best effort: a missing file, unreadable or invalid JSON, a top-level value
    that is not a dict, or any other error all result in an empty dict.
    """
    try:
        target = settings_path(base_folder)
        if os.path.exists(target):
            with open(target, "r", encoding="utf-8") as handle:
                loaded = json.load(handle)
            if isinstance(loaded, dict):
                return loaded
        return {}
    except Exception:
        # Settings are a convenience only; never let them break the notebook.
        return {}


def save_settings(base_folder: str, data: Dict[str, Any]) -> None:
    """Write *data* as pretty-printed UTF-8 JSON to the settings file.

    A copy of *data* is stamped with an ``updated_at`` ISO timestamp (seconds
    precision); the caller's dict is not modified. *base_folder* is created if
    needed. Best effort: any failure is silently ignored.
    """
    try:
        os.makedirs(base_folder, exist_ok=True)
        target = settings_path(base_folder)
        payload = dict(data)
        payload.update(updated_at=datetime.now().isoformat(timespec="seconds"))
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
    except Exception:
        # Persisting settings is optional; a failed write must not abort a search.
        pass


# ------------------------------
# Search pattern building
# ------------------------------
def split_alternatives(query: str, sep: str = "|") -> List[str]:
    """Split *query* on *sep* and return the trimmed, non-empty pieces in order.

    A falsy *query* yields an empty list.
    """
    pieces: List[str] = []
    for raw_piece in (query or "").split(sep):
        piece = raw_piece.strip()
        if piece:
            pieces.append(piece)
    return pieces


def build_search_regex(query: str, *, mode: str, ignore_case: bool = True) -> re.Pattern:
    """Compile the regular expression for *query* under the given search *mode*.

    Modes:
    - ``exact_one``: the query is matched literally.
    - ``wildcard_one``: literal match, except each ``*`` matches exactly one
      character (regex ``.``).
    - ``multi_or``: the query is split on ``|`` and any of the literal
      alternatives matches.

    The pattern is case-insensitive unless *ignore_case* is false.

    Raises:
        ValueError: if the query is empty, the mode is unknown, or
            ``multi_or`` yields no alternatives.
    """
    needle = (query or "").strip()
    if not needle:
        raise ValueError("Search field is empty.")
    if mode not in ("exact_one", "wildcard_one", "multi_or"):
        raise ValueError(f"Unknown search mode: {mode}")

    if mode == "multi_or":
        alternatives = split_alternatives(needle, sep="|")
        if not alternatives:
            raise ValueError("No valid alternatives found. Example: love|hate")
        joined = "|".join(re.escape(alt) for alt in alternatives)
        pattern_text = f"(?:{joined})"
    else:
        pattern_text = re.escape(needle)
        if mode == "wildcard_one":
            # re.escape turned every '*' into '\*'; make each one a single-char wildcard.
            pattern_text = pattern_text.replace(r"\*", ".")

    if ignore_case:
        return re.compile(pattern_text, flags=re.IGNORECASE)
    return re.compile(pattern_text, flags=0)


# ------------------------------
# KWIC engine
# ------------------------------
def tokenize_with_spans(text: str) -> List[Tuple[str, int, int]]:
    """Split *text* into whitespace-delimited tokens with their character offsets.

    Returns a list of ``(token, start, end)`` tuples in order of appearance,
    where ``text[start:end] == token``.
    """
    spans: List[Tuple[str, int, int]] = []
    for match in re.finditer(r"\S+", text):
        spans.append((match.group(0), match.start(), match.end()))
    return spans


def char_to_token_index(tokens: List[Tuple[str, int, int]], char_pos: int) -> int:
    """Return the index of the token that covers character offset *char_pos*.

    *tokens* are ``(text, start, end)`` tuples in ascending order. If the
    offset falls in a gap before a token, the preceding token's index is
    returned (0 when there is none). If it lies past every token, the last
    index is returned. An empty token list yields 0.
    """
    for idx, token in enumerate(tokens):
        begin, finish = token[1], token[2]
        if char_pos < begin:
            # Position sits in the gap before this token: attribute it to the previous one.
            return max(0, idx - 1)
        if char_pos < finish:
            return idx
    return max(0, len(tokens) - 1)


def kwic_search_records(
    records: Sequence[Record],
    pattern: re.Pattern,
    *,
    context_window: int,
    normalize: bool,
    stop_after: int,
    verbose: bool
) -> List[KwicHit]:
    """Find every match of *pattern* in *records* and return keyword-in-context hits.

    Args:
        records: records to scan, in order.
        pattern: compiled search pattern.
        context_window: number of tokens kept on each side of the hit token.
        normalize: collapse whitespace in each record before searching.
        stop_after: stop once this many hits were collected (0 or less = no limit).
        verbose: emit timestamped progress messages.

    Returns:
        One ``KwicHit`` per match. ``hit`` is the whole whitespace-delimited
        token in which the match starts (not just the matched substring), and
        ``left`` is right-padded so all hits line up: padded to the longest
        left context plus 5, capped at 200.
    """
    tprint("Starting KWIC search...", enabled=verbose)
    collected: List[KwicHit] = []
    has_limit = stop_after > 0

    for record in records:
        if normalize:
            text = normalize_whitespace(record.text_raw)
        else:
            text = record.text_raw or ""
        if not text:
            continue

        tokens = tokenize_with_spans(text)

        for match in pattern.finditer(text):
            if has_limit and len(collected) >= stop_after:
                break

            # Locate the token holding the match start, then cut a window around it.
            center = char_to_token_index(tokens, match.start())
            first = max(0, center - context_window)
            last = min(len(tokens), center + context_window + 1)
            window = tokens[first:last]
            pivot = center - first

            before = " ".join(tok for tok, _start, _end in window[:pivot])
            after = " ".join(tok for tok, _start, _end in window[pivot + 1:])
            # With no tokens at all, fall back to the raw matched text.
            keyword = window[pivot][0] if window else match.group(0)

            collected.append(
                KwicHit(
                    record_id=record.record_id,
                    left=before,
                    hit=keyword,
                    right=after,
                    meta=record.meta,
                )
            )

        if has_limit and len(collected) >= stop_after:
            break

    tprint(f"Found {len(collected)} matches.", enabled=verbose)
    blank_line(enabled=verbose)

    # Pad every left context to a common width so the hits align in a column.
    pad_width = 60
    if collected:
        longest_left = max(len(found.left) for found in collected)
        pad_width = min(200, longest_left + 5)

    aligned: List[KwicHit] = []
    for found in collected:
        aligned.append(
            KwicHit(
                record_id=found.record_id,
                left=found.left.ljust(pad_width),
                hit=found.hit,
                right=found.right,
                meta=found.meta,
            )
        )
    return aligned


def print_kwic_console(hits: Sequence[KwicHit], max_show: int, verbose: bool) -> None:
    """Print KWIC hits to the console, at most *max_show* of them.

    A *max_show* of 0 or less shows every hit. The "No matches found." notice
    and the hit lines are always printed; the header, the "... and N more"
    footer and the trailing blank line appear only when *verbose* is true.
    """
    if not hits:
        tprint("No matches found.", enabled=True)
        blank_line(enabled=True)
        return

    total = len(hits)
    is_capped = max_show > 0
    visible = hits[:max_show] if is_capped else list(hits)

    tprint(f"Showing {len(visible)} of {total} matches:", enabled=verbose)
    tprint("=" * 80, enabled=verbose)

    for hit in visible:
        print(f"{hit.record_id}    {hit.left}{hit.hit}          {hit.right}")

    if is_capped and total > max_show:
        tprint(f"... and {total - max_show} more.", enabled=verbose)

    blank_line(enabled=verbose)


# ------------------------------
# Loaders (decode via decode_bytes)
# ------------------------------
def load_json_from_bytes(file_bytes: bytes, source_name: str, encoding_choice_value: str) -> List[Record]:
    """Decode and parse a JSON file into records.

    - Top-level object: one record per key; string values are used as-is,
      other values are re-serialized as JSON text.
    - Top-level array: one record per item. For object items the ID comes from
      ``date`` or ``id`` (else ``row_<n>``) and the text from ``text`` or
      ``entry``; other items are stringified.
    - Any other top-level value: a single record holding its JSON text.

    Raises whatever ``json.loads`` raises for invalid JSON.
    """
    decoded, used_enc = decode_bytes(file_bytes, encoding_choice_value)
    payload = json.loads(decoded)

    def plain_meta() -> Dict[str, Any]:
        return {"source": source_name, "encoding": used_enc}

    if isinstance(payload, dict):
        return [
            Record(
                record_id=str(key),
                text_raw=val if isinstance(val, str) else json.dumps(val, ensure_ascii=False),
                meta=plain_meta(),
            )
            for key, val in payload.items()
        ]

    if not isinstance(payload, list):
        return [
            Record(
                record_id=source_name,
                text_raw=json.dumps(payload, ensure_ascii=False),
                meta=plain_meta(),
            )
        ]

    loaded: List[Record] = []
    for position, item in enumerate(payload, start=1):
        if isinstance(item, dict):
            rid = str(item.get("date") or item.get("id") or f"row_{position}")
            body = str(item.get("text") or item.get("entry") or "")
            meta = {"source": source_name, "row_index": position, "encoding": used_enc}
        else:
            rid = f"row_{position}"
            body = str(item)
            meta = plain_meta()
        loaded.append(Record(record_id=rid, text_raw=body, meta=meta))
    return loaded


def load_txt_from_bytes(
    file_bytes: bytes,
    source_name: str,
    split_mode: str,
    encoding_choice_value: str
) -> List[Record]:
    """Decode a text file into one record, or into one record per paragraph.

    With ``split_mode == "file"`` the whole text becomes a single record whose
    ID is *source_name*. Any other mode splits the trimmed text on blank
    lines; each non-empty paragraph gets the ID ``<source_name>#p<n>``.
    """
    text, used_enc = decode_bytes(file_bytes, encoding_choice_value)

    if split_mode == "file":
        whole = Record(record_id=source_name, text_raw=text, meta={"source": source_name, "encoding": used_enc})
        return [whole]

    paragraphs = re.split(r"\n\s*\n+", text.strip())
    return [
        Record(
            record_id=f"{source_name}#p{number}",
            text_raw=paragraph,
            meta={"source": source_name, "paragraph_index": number, "encoding": used_enc},
        )
        for number, paragraph in enumerate(paragraphs, start=1)
        if paragraph.strip()
    ]


def load_csv_from_bytes_simple(
    file_bytes: bytes,
    source_name: str,
    *,
    content_column: str,
    id_column: Optional[str],
    delimiter: str,
    encoding_choice_value: str
) -> List[Record]:
    """Decode a CSV file and turn each row into a record.

    All cells are read as strings and empty cells stay empty strings. The
    record text comes from *content_column*. The record ID comes from
    *id_column* when that column exists and the cell is not blank, otherwise
    it is ``row_<index + 1>``.

    Raises:
        ValueError: if *content_column* is not a column of the file.
    """
    decoded, used_enc = decode_bytes(file_bytes, encoding_choice_value)
    frame = pd.read_csv(io.StringIO(decoded), sep=delimiter, dtype=str, keep_default_na=False)

    if content_column not in frame.columns:
        raise ValueError(f"CSV content column '{content_column}' not found. Available: {list(frame.columns)}")

    # Column membership cannot change while iterating, so decide once.
    has_id_column = bool(id_column) and id_column in frame.columns

    loaded: List[Record] = []
    for index, row in frame.iterrows():
        id_cell = str(row[id_column]) if has_id_column else ""
        rid = id_cell if id_cell.strip() else f"row_{index+1}"
        loaded.append(
            Record(
                record_id=rid,
                text_raw=str(row[content_column]),
                meta={"source": source_name, "encoding": used_enc},
            )
        )
    return loaded


# ------------------------------
# CSV guessing (Content + ID)
# ------------------------------
_NUMERIC_RE = re.compile(r"^\s*[-+]?\d+(?:[.,]\d+)?\s*$")


def _colname_tokens(name: str) -> str:
    return re.sub(r"[^a-z0-9]+", " ", (name or "").lower()).strip()


def read_csv_sample_df(file_bytes: bytes, delimiter: str, encoding_choice_value: str) -> pd.DataFrame:
    """Decode a CSV file and load only its first ``CSV_SAMPLE_ROWS_FOR_GUESS`` rows.

    All cells are read as strings and empty cells are kept as empty strings.
    """
    text, _used_encoding = decode_bytes(file_bytes, encoding_choice_value)
    buffer = io.StringIO(text)
    sample = pd.read_csv(
        buffer,
        sep=delimiter,
        dtype=str,
        keep_default_na=False,
        nrows=CSV_SAMPLE_ROWS_FOR_GUESS,
    )
    return sample


def guess_csv_content_column(df_sample: pd.DataFrame) -> Optional[str]:
    """Pick the column most likely to hold free text, or None if nothing qualifies.

    Each column with at least one non-blank cell is scored. Long cells, cells
    containing spaces and a high fill rate raise the score; purely numeric
    cells and an ID/date/location-like column name lower it. The first column
    with the highest score wins. Returns None for an empty sample or when
    every column is blank.
    """
    if df_sample.empty:
        return None

    bad_name_tokens = {
        "id", "uuid", "uid", "date", "time", "timestamp", "year", "month", "day",
        "type", "category", "code", "zip", "postcode", "lat", "lon", "lng",
        "place", "location", "country", "city", "index", "row", "nr", "no", "number"
    }

    winner = None
    winner_score = float("-inf")

    for col in df_sample.columns:
        cells = df_sample[col].astype(str).fillna("").map(lambda x: x.strip())
        filled_mask = cells.map(lambda x: len(x) > 0)
        if filled_mask.sum() == 0:
            continue
        filled = cells[filled_mask]

        median_len = float(filled.map(len).median())
        non_empty_ratio = float(filled_mask.mean())
        space_ratio = float(filled.map(lambda x: (" " in x)).mean())
        numeric_ratio = float(filled.map(lambda x: bool(_NUMERIC_RE.match(x))).mean())

        name_words = set(_colname_tokens(col).split())
        name_penalty = 1.5 if (name_words & bad_name_tokens) else 0.0

        # Length contribution saturates at a median of 80 characters.
        med_scaled = min(1.0, median_len / 80.0)

        score = (
            2.2 * med_scaled +
            1.6 * space_ratio +
            1.0 * non_empty_ratio -
            2.0 * numeric_ratio -
            name_penalty
        )

        if score > winner_score:
            winner, winner_score = col, score

    return winner


def guess_csv_id_column(df_sample: pd.DataFrame) -> Optional[str]:
    """Pick the column most likely to identify rows, or None if none is convincing.

    Each column with at least one non-blank cell is scored. Mostly-unique
    values, short cells (median length 2-40), numeric cells and an
    ID/date-like column name raise the score; cells containing spaces and very
    long cells (median above 120) lower it. The first column with the highest
    score is returned, but only if that score reaches 2.2.
    """
    if df_sample.empty:
        return None

    strong_name_tokens = {"id", "uuid", "uid", "key", "ref", "record", "entry", "date", "timestamp", "time", "datetime"}

    winner = None
    winner_score = float("-inf")

    for col in df_sample.columns:
        cells = df_sample[col].astype(str).fillna("").map(lambda x: x.strip())
        filled = cells[cells.map(lambda x: len(x) > 0)]
        if filled.empty:
            continue

        unique_ratio = float(filled.nunique() / max(1, len(filled)))
        median_len = float(filled.map(len).median())
        space_ratio = float(filled.map(lambda x: (" " in x)).mean())
        numeric_ratio = float(filled.map(lambda x: bool(_NUMERIC_RE.match(x))).mean())

        name_words = set(_colname_tokens(col).split())
        name_boost = 1.8 if (name_words & strong_name_tokens) else 0.0

        short_bonus = 1.0 if 2 <= median_len <= 40 else 0.0
        too_long_penalty = 1.5 if median_len > 120 else 0.0

        score = (
            2.5 * unique_ratio +
            1.0 * short_bonus -
            2.0 * space_ratio -
            0.8 * too_long_penalty +
            0.6 * numeric_ratio +
            name_boost
        )

        if score > winner_score:
            winner, winner_score = col, score

    # Below this threshold no column looks ID-like enough; the caller falls back.
    return winner if winner_score >= 2.2 else None


# ------------------------------
# UI widgets
# ------------------------------
# Heading shown at the top of the GUI.
title = widgets.HTML("<h3>KWIC Search</h3>")

# Input source selector. Its value ("drive_folder" or "upload") is what
# read_input_file_bytes() and the CSV-control helpers branch on.
input_mode = widgets.RadioButtons(
    options=[
        ("Choose a file from Google Drive (recommended)", "drive_folder"),
        ("Upload a file (one-off)", "upload"),
    ],
    value="drive_folder",
    description="Input:",
    layout=widgets.Layout(width="760px"),
    style={"description_width": "80px"},
)

# Folder scanned for input files; it is also passed as the base folder to the
# settings/results helpers (presumably by the run handler - not visible here).
drive_folder = widgets.Text(
    value=DEFAULT_DRIVE_FOLDER,
    description="Drive folder:",
    layout=widgets.Layout(width="760px"),
    style={"description_width": "120px"},
)

# Empty placeholder for a status line under the folder box; the 120px left
# padding lines it up with the input fields.
drive_status = widgets.HTML("<div style='color:#666; font-size: 12px; padding-left: 120px;'></div>")

# File picker; its options are filled by refresh_drive_files().
drive_file_dropdown = widgets.Dropdown(
    options=[],
    value=None,
    description="File:",
    layout=widgets.Layout(width="760px"),
    style={"description_width": "120px"},
)

# Static usage tip displayed in Drive mode.
drive_tip = widgets.HTML(
    "<div style='color:#666; font-size: 12px; padding-left: 120px;'>"
    "Tip: keep your input files inside the <b>KWIC-notebook</b> Drive folder so you can select them without re-uploading."
    "</div>"
)

# One-off upload control, restricted to the three supported file types and a
# single file. The label ends with a real ellipsis character; the previous
# text was a mis-decoded byte sequence that showed up as garbage on the button.
upload_widget = widgets.FileUpload(
    accept=".json,.txt,.csv",
    multiple=False,
    description="Upload file…",
)
# Empty placeholder for a status line next to the upload control.
upload_status = widgets.HTML("<div style='color:#666; font-size: 12px; padding-left: 120px;'></div>")

# Free-text search term entered by the user.
query_text = widgets.Text(
    value="",
    description="Search field:",
    layout=widgets.Layout(width="760px"),
    style={"description_width": "120px"},
)

# Search mode selector; the option values are exactly the mode names accepted
# by build_search_regex().
search_mode = widgets.Dropdown(
    options=[
        ("Mode 1: Search word/text (exact)", "exact_one"),
        ("Mode 2: Search with * wildcard (one character)", "wildcard_one"),
        ("Mode 3: Search multiple words (use | between words)", "multi_or"),
    ],
    value="exact_one",
    description="Search mode:",
    layout=widgets.Layout(width="760px"),
    style={"description_width": "120px"},
)

# One example query per search mode, shown under the mode dropdown. The arrows
# are real "→" characters; the previous text was a mis-decoded byte sequence
# that rendered as garbage in the GUI.
mode_hint = widgets.HTML(
    "<div style='color:#666; font-size: 12px; padding-left: 120px;'>"
    "Examples: Mode 1 → <code>love</code> &nbsp;&nbsp; "
    "Mode 2 → <code>l*ve</code> &nbsp;&nbsp; "
    "Mode 3 → <code>love | hate</code>"
    "</div>"
)

# Context size slider (1-30, default 7). The same 1-30 range is enforced when
# saved settings are re-applied. Presumably passed as context_window (tokens on
# each side of the hit) to kwic_search_records() - the wiring is not visible here.
context_window = widgets.IntSlider(
    value=7,
    min=1,
    max=30,
    step=1,
    description="Context around term:",
    layout=widgets.Layout(width="760px"),
    style={"description_width": "160px"},
)

# Maximum number of hits printed to the console (default 100).
max_show = widgets.IntText(
    value=100,
    description="Show first N matches:",
    layout=widgets.Layout(width="360px"),
    style={"description_width": "160px"},
)

# Green action button and the output area for console output.
run_btn = widgets.Button(description="Run KWIC", button_style="success")
output_area = widgets.Output()

# Advanced (CSV)
# Field delimiter; only its first character is used by the code that reads it.
csv_delimiter = widgets.Text(
    value=",",
    description="CSV delimiter:",
    layout=widgets.Layout(width="220px"),
    style={"description_width": "120px"},
)

# Column to search in; options are filled by populate_csv_controls_from_bytes().
csv_content_column = widgets.Dropdown(
    options=[],
    value=None,
    description="CSV: search in column:",
    layout=widgets.Layout(width="520px"),
    style={"description_width": "200px"},
)

# Column used as record ID; the empty-string value means "use the row number".
csv_id_column = widgets.Dropdown(
    options=[("(row number)", "")],
    value="",
    description="CSV: use as record ID:",
    layout=widgets.Layout(width="520px"),
    style={"description_width": "200px"},
)

# Advanced (Encoding)
# Encoding selector; the option values are the choices understood by
# decode_bytes() ("auto" plus four explicit codecs).
encoding_choice = widgets.Dropdown(
    options=[
        ("Auto (recommended)", "auto"),
        ("UTF-8", "utf-8"),
        ("UTF-8-SIG", "utf-8-sig"),
        ("Windows-1252 (common legacy)", "cp1252"),
        ("Latin-1 / ISO-8859-1", "latin-1"),
    ],
    value="auto",
    description="Text encoding:",
    layout=widgets.Layout(width="520px"),
    style={"description_width": "160px"},
)

# Help text for the selected encoding; filled in by update_encoding_hint().
encoding_hint = widgets.HTML(
    "<div style='color:#666; font-size: 12px; padding-left: 160px;'></div>"
)

# Advanced (Other defaults)
# Toggles: timestamped logging, whitespace normalization before searching,
# and saving results as CSV. All three default to on.
verbose_logs = widgets.Checkbox(value=True, description="Verbose logs (timestamped)")
normalize_text = widgets.Checkbox(value=True, description="Normalize whitespace")
save_csv = widgets.Checkbox(value=True, description="Save results to CSV")

# Hard cap on the number of collected hits; 0 disables the cap.
stop_after = widgets.IntText(
    value=0,
    description="Stop after N matches (0=no limit):",
    layout=widgets.Layout(width="420px"),
    style={"description_width": "260px"},
)

# TXT loading mode; values match the split_mode argument of load_txt_from_bytes().
txt_split_mode = widgets.Dropdown(
    options=[("Whole file (default)", "file"), ("Split by paragraphs (blank lines)", "paragraph")],
    value="file",
    description="TXT handling:",
    layout=widgets.Layout(width="520px"),
    style={"description_width": "160px"},
)

# Advanced boxes (reordered)
# Each VBox groups the widgets of one accordion section under a bold heading.
encoding_adv_box = widgets.VBox([
    widgets.HTML("<b>Text encoding settings</b>"),
    encoding_choice,
    encoding_hint,
])

csv_adv_box = widgets.VBox([
    widgets.HTML("<b>CSV settings (only needed if your file is .csv)</b>"),
    csv_delimiter,
    csv_content_column,
    csv_id_column,
])

other_adv_box = widgets.VBox([
    widgets.HTML("<b>Other default settings</b>"),
    widgets.HBox([verbose_logs, normalize_text, save_csv]),
    stop_after,
    widgets.HTML("<hr style='margin: 10px 0;'>"),
    txt_split_mode,
])

# ------------------------------
# Advanced UI (no outer wrapper)
# ------------------------------
# Section order matters: index 1 is the CSV panel, which
# populate_csv_controls_from_selected_file() opens by index.
advanced = widgets.Accordion(children=[encoding_adv_box, csv_adv_box, other_adv_box])
advanced.set_title(0, "Text encoding settings")
advanced.set_title(1, "CSV settings")
advanced.set_title(2, "Other default settings")
# Start with every section collapsed.
advanced.selected_index = None


# ------------------------------
# Drive listing + input reading
# ------------------------------
def list_drive_files(folder_path: str) -> List[str]:
    """Return the sorted names of the .json/.txt/.csv files directly inside *folder_path*.

    Extensions are matched case-insensitively; sub-folders are ignored. A
    missing path, or a path that is not a directory, yields an empty list.
    """
    root = Path(folder_path)
    if not (root.exists() and root.is_dir()):
        return []

    supported = [".json", ".txt", ".csv"]
    names = [
        entry.name
        for entry in root.iterdir()
        if entry.is_file() and entry.suffix.lower() in supported
    ]
    names.sort()
    return names


def refresh_drive_files(verbose: bool) -> None:
    """Re-scan the Drive folder and refresh the file dropdown.

    Mounts Drive if needed, lists the supported files in the folder typed
    into ``drive_folder`` and installs them as the dropdown options. The
    previously selected file stays selected when it is still present;
    otherwise the first file is selected, or nothing when the folder has no
    supported files.
    """
    ensure_drive_mounted(verbose)
    folder = drive_folder.value.strip()
    files = list_drive_files(folder)

    # Remember the selection BEFORE touching options: ipywidgets re-selects the
    # first entry whenever options are reassigned, which would otherwise drop
    # the user's current choice on every refresh.
    previous = drive_file_dropdown.value
    drive_file_dropdown.options = files

    if not files:
        drive_file_dropdown.value = None
        return

    drive_file_dropdown.value = previous if previous in files else files[0]


def read_input_file_bytes(verbose: bool) -> Tuple[str, bytes]:
    """Return ``(file_name, raw_bytes)`` for the currently selected input.

    In upload mode the first uploaded file is used; in Drive mode the file
    chosen in the dropdown is read from the Drive folder (mounting Drive first
    if needed).

    Raises:
        ValueError: if nothing was uploaded, or no Drive folder/file is chosen.
        OSError: if the Drive file cannot be read.
    """
    if input_mode.value == "upload":
        uploaded = upload_widget.value
        if not uploaded:
            raise ValueError("No file uploaded.")
        if isinstance(uploaded, dict):
            # ipywidgets 7.x: {filename: {"content": bytes, ...}}
            fname = next(iter(uploaded))
            content = uploaded[fname]["content"]
        else:
            # ipywidgets 8+: sequence of {"name": ..., "content": memoryview, ...}
            first = uploaded[0]
            fname = first["name"]
            content = first["content"]
        # bytes() is a no-op for bytes and materializes a memoryview.
        return fname, bytes(content)

    ensure_drive_mounted(verbose)
    folder = drive_folder.value.strip()
    fname = drive_file_dropdown.value
    if not folder or not fname:
        raise ValueError("Choose a Drive folder and a file.")
    full_path = os.path.join(folder, fname)
    return fname, Path(full_path).read_bytes()


# ------------------------------
# CSV controls population (with guessing)
# ------------------------------
def set_csv_id_options(cols: List[str]) -> None:
    """Fill the record-ID dropdown with "(row number)" followed by one entry per column."""
    row_number_choice: List[Tuple[str, str]] = [("(row number)", "")]
    csv_id_column.options = row_number_choice + [(name, name) for name in cols]


def populate_csv_controls_from_bytes(file_bytes: bytes) -> None:
    """Fill the two CSV column dropdowns from a sample of the given CSV bytes.

    The current delimiter (first character only) and encoding choice are used
    to read the sample. For each dropdown the previous selection is kept when
    it is still valid; otherwise a guessed column is selected, falling back to
    the first column (content) or "(row number)" (record ID).
    """
    delimiter = (csv_delimiter.value or ",")[:1]
    sample = read_csv_sample_df(file_bytes, delimiter, encoding_choice.value)
    columns = list(sample.columns)

    # Capture the selections first; replacing the options below may change them.
    remembered_content = csv_content_column.value
    remembered_id = csv_id_column.value

    csv_content_column.options = columns
    set_csv_id_options(columns)

    # Content column: previous choice -> guess -> first column.
    if remembered_content in columns:
        safe_set_dropdown_value(csv_content_column, remembered_content)
    else:
        content_guess = guess_csv_content_column(sample)
        if content_guess and content_guess in columns:
            safe_set_dropdown_value(csv_content_column, content_guess)
        elif columns:
            safe_set_dropdown_value(csv_content_column, columns[0])

    # ID column: previous choice -> guess -> row number.
    valid_ids = dropdown_allowed_values(csv_id_column)
    if remembered_id in valid_ids:
        id_choice = remembered_id
    else:
        id_guess = guess_csv_id_column(sample)
        id_choice = id_guess if (id_guess and id_guess in valid_ids) else ""
    safe_set_dropdown_value(csv_id_column, id_choice)


def populate_csv_controls_from_selected_file() -> None:
    """Refresh the CSV column dropdowns for the currently selected/uploaded file.

    Does nothing unless the current input is a ``.csv`` file. On success the
    "CSV settings" accordion panel is opened. This is a best-effort convenience
    for the GUI: any error is swallowed and the controls are left as they were.
    """
    try:
        if input_mode.value == "drive_folder":
            fname = drive_file_dropdown.value
            if not fname or detect_file_type_from_name(fname) != "csv":
                return
            folder = drive_folder.value.strip()
            if not folder:
                return
            file_bytes = Path(os.path.join(folder, fname)).read_bytes()
        else:
            uploaded = upload_widget.value
            if not uploaded:
                return
            if isinstance(uploaded, dict):
                # ipywidgets 7.x: {filename: {"content": bytes, ...}}
                fname = next(iter(uploaded))
                content = uploaded[fname]["content"]
            else:
                # ipywidgets 8+: sequence of {"name": ..., "content": memoryview, ...}
                first = uploaded[0]
                fname = first["name"]
                content = first["content"]
            if detect_file_type_from_name(fname) != "csv":
                return
            # bytes() is a no-op for bytes and materializes a memoryview.
            file_bytes = bytes(content)

        populate_csv_controls_from_bytes(file_bytes)
        advanced.selected_index = 1  # Open CSV settings panel
    except Exception:
        # Deliberately broad: an unreadable or malformed file must not break
        # the GUI here; a real problem surfaces when the search itself runs.
        pass


# ------------------------------
# Settings apply/collect (safe)
# ------------------------------
def apply_settings_non_dropdowns(s: Dict[str, Any]) -> None:
    """Copy validated values from a saved-settings dict into the widgets.

    Every key is optional. A value is applied only if it has the expected type
    and lies in the allowed range or choice list; anything else is skipped and
    that widget keeps its current value. The data-dependent dropdowns (Drive
    file and CSV columns) are not touched here.
    """
    folder = s.get("drive_folder")
    if isinstance(folder, str) and folder.strip():
        drive_folder.value = folder.strip()

    query = s.get("query_text")
    if isinstance(query, str):
        query_text.value = query

    mode = s.get("search_mode")
    if mode in ["exact_one", "wildcard_one", "multi_or"]:
        search_mode.value = mode

    window = s.get("context_window")
    if isinstance(window, int) and 1 <= window <= 30:
        context_window.value = window

    shown = s.get("max_show")
    if isinstance(shown, int) and shown >= 0:
        max_show.value = shown

    # The three checkboxes share the same "must be a real bool" rule.
    for key, checkbox in (
        ("verbose_logs", verbose_logs),
        ("normalize_text", normalize_text),
        ("save_csv", save_csv),
    ):
        flag = s.get(key)
        if isinstance(flag, bool):
            checkbox.value = flag

    limit = s.get("stop_after")
    if isinstance(limit, int) and limit >= 0:
        stop_after.value = limit

    split = s.get("txt_split_mode")
    if split in ["file", "paragraph"]:
        txt_split_mode.value = split

    delimiter = s.get("csv_delimiter")
    if isinstance(delimiter, str) and delimiter:
        csv_delimiter.value = delimiter[:1]

    encoding = s.get("encoding_choice")
    if encoding in ["auto", "utf-8", "utf-8-sig", "cp1252", "latin-1"]:
        encoding_choice.value = encoding


def collect_settings() -> Dict[str, Any]:
    """Snapshot the current widget values as a plain settings dictionary.

    Numeric and boolean widget values are coerced with ``int``/``bool``; the
    CSV delimiter falls back to "," when empty and is cut to one character.
    """
    snapshot: Dict[str, Any] = {}

    # Input source
    snapshot["drive_folder"] = drive_folder.value.strip()
    snapshot["drive_file"] = drive_file_dropdown.value

    # Query and display
    snapshot["query_text"] = query_text.value
    snapshot["search_mode"] = search_mode.value
    snapshot["context_window"] = int(context_window.value)
    snapshot["max_show"] = int(max_show.value)

    # Advanced options
    snapshot["verbose_logs"] = bool(verbose_logs.value)
    snapshot["normalize_text"] = bool(normalize_text.value)
    snapshot["save_csv"] = bool(save_csv.value)
    snapshot["stop_after"] = int(stop_after.value)
    snapshot["txt_split_mode"] = txt_split_mode.value

    # CSV handling and text encoding
    snapshot["csv_delimiter"] = (csv_delimiter.value or ",")[:1]
    snapshot["csv_content_column"] = csv_content_column.value
    snapshot["csv_id_column"] = csv_id_column.value
    snapshot["encoding_choice"] = encoding_choice.value

    return snapshot


# ------------------------------
# UI behavior
# ------------------------------
def update_encoding_hint(*_args) -> None:
    """Show the help text that matches the currently selected encoding.

    Works both as a widget observer and as a plain call; any arguments are
    ignored. A selection without an entry in the table yields an empty hint.
    """
    # Non-ASCII characters are written as \u escapes so the literals stay
    # intact however this file is re-encoded:
    #   \u25a1 = WHITE SQUARE, \ufffd = REPLACEMENT CHARACTER,
    #   \u2019 = RIGHT SINGLE QUOTATION MARK.
    hints = {
        "auto": (
            "Auto (recommended): chooses the best encoding automatically. "
            "If you see \u25a1 or \ufffd instead of letters, try UTF-8, Windows-1252, or Latin-1."
        ),
        "utf-8": "UTF-8: best for most modern Portuguese / Latvian / English files.",
        "utf-8-sig": "UTF-8-SIG: same text as UTF-8, but sometimes helps CSVs opened in Excel.",
        "cp1252": "Windows-1252: common in older Windows files (often fixes Portuguese in older texts).",
        "latin-1": "Latin-1: very old Western European encoding. Try if Windows-1252 doesn\u2019t work.",
    }

    msg = hints.get(str(encoding_choice.value), "")
    encoding_hint.value = (
        "<div style='color:#666; font-size: 12px; padding-left: 160px;'>"
        f"{msg}"
        "</div>"
    )


def update_visibility(*_args) -> None:
    """Show only the controls that belong to the selected input mode.

    Works both as a widget observer and as a plain call; any arguments are
    ignored. A widget is hidden by setting its layout ``display`` to "none"
    and shown by resetting it to the empty string.
    """
    mode = input_mode.value
    drive_display = "" if mode == "drive_folder" else "none"
    upload_display = "" if mode == "upload" else "none"

    for drive_control in (drive_folder, drive_status, drive_file_dropdown, drive_tip):
        drive_control.layout.display = drive_display

    for upload_control in (upload_widget, upload_status):
        upload_control.layout.display = upload_display


def on_upload_change(_change) -> None:
    """Refresh the upload status line and, for a CSV upload, the CSV controls.

    Observer callback for ``upload_widget``. The change payload is ignored;
    the widget's current value is read instead.
    """
    import html  # local import so this block does not rely on a file-level one

    if not upload_widget.value:
        upload_status.value = "<div style='color:#666; font-size: 12px; padding-left: 120px;'></div>"
        return

    # The upload value is used as a mapping keyed by filename (the same way
    # the CSV helper in this file reads it); only the first entry is used.
    fname = next(iter(upload_widget.value))

    # The filename comes from the user's machine, so escape it before putting
    # it into HTML. \u2705 is the check-mark emoji, written as an escape so
    # the literal stays intact however this file is re-encoded.
    upload_status.value = (
        "<div style='color:#2e7d32; font-size: 12px; padding-left: 120px;'>"
        f"\u2705 File selected: <b>{html.escape(str(fname))}</b></div>"
    )

    try:
        if detect_file_type_from_name(fname) == "csv":
            populate_csv_controls_from_selected_file()
    except Exception:
        # Best effort: pre-filling the CSV controls is a convenience and must
        # not break the upload itself.
        pass


def on_drive_folder_change(_change) -> None:
    """Re-list the Drive folder and report the outcome in the status line.

    Observer callback for ``drive_folder``; the change payload is ignored.
    If the file selected after the refresh is a CSV, the CSV controls are
    refreshed as well. Any exception is shown in ``drive_status`` instead of
    being raised.
    """
    import html  # local import so this block does not rely on a file-level one

    try:
        # \u2026 (ellipsis) and \u2705 (check mark) are written as escapes so
        # the literals stay intact however this file is re-encoded.
        drive_status.value = "<div style='color:#666; font-size: 12px; padding-left: 120px;'>Updating file list\u2026</div>"
        refresh_drive_files(verbose=False)

        if drive_file_dropdown.options:
            drive_status.value = "<div style='color:#2e7d32; font-size: 12px; padding-left: 120px;'>\u2705 File list updated.</div>"
        else:
            drive_status.value = "<div style='color:#b71c1c; font-size: 12px; padding-left: 120px;'>No .json/.txt/.csv files found in this folder.</div>"

        if drive_file_dropdown.value and detect_file_type_from_name(drive_file_dropdown.value) == "csv":
            populate_csv_controls_from_selected_file()

    except Exception as e:
        # The message may contain a path or other text with "<" or "&";
        # escape it so it is displayed rather than parsed as markup.
        drive_status.value = (
            "<div style='color:#b71c1c; font-size: 12px; padding-left: 120px;'>"
            f"Error: {html.escape(str(e))}</div>"
        )


def on_drive_file_change(_change) -> None:
    """Refresh the CSV controls when the newly selected Drive file is a CSV.

    Observer callback for ``drive_file_dropdown``; the change payload is
    ignored. Does nothing when no file is selected or the file is not a CSV.
    """
    try:
        selected = drive_file_dropdown.value
        if selected and detect_file_type_from_name(selected) == "csv":
            populate_csv_controls_from_selected_file()
    except Exception:
        # Best effort: a failed refresh must not interrupt file selection.
        pass


def on_csv_delimiter_change(_change) -> None:
    """Refresh the CSV controls from the selected file after the delimiter changes.

    Observer callback for ``csv_delimiter``; the change payload is ignored.
    """
    populate_csv_controls_from_selected_file()


def on_encoding_change(_change) -> None:
    """Refresh the CSV controls from the selected file after the encoding changes.

    Observer callback for ``encoding_choice``; the change payload is ignored.
    """
    # If encoding changes, refresh CSV guesses if a CSV is selected.
    populate_csv_controls_from_selected_file()


# Attach each handler to its widget's "value" trait. encoding_choice gets two
# handlers: the CSV refresh first, then the hint update.
for _observed, _handler in (
    (drive_folder, on_drive_folder_change),
    (drive_file_dropdown, on_drive_file_change),
    (upload_widget, on_upload_change),
    (input_mode, update_visibility),
    (csv_delimiter, on_csv_delimiter_change),
    (encoding_choice, on_encoding_change),
    (encoding_choice, update_encoding_hint),
):
    _observed.observe(_handler, names="value")
del _observed, _handler

# Show the hint for the initial selection right away.
update_encoding_hint()


# ------------------------------
# Main run
# ------------------------------
def on_run_clicked(_b) -> None:
    """Run one KWIC search from the current widget state and show the results.

    Click handler for ``run_btn``. Everything printed while it runs goes to
    ``output_area``, which is cleared first. The steps are:

    1. Mount Drive, create the base folder if needed, get the results folder.
    2. Read the selected input and load it into records according to its type
       (CSV, JSON, anything else is treated as plain text).
    3. Build the search pattern, collect the hits and print them.
    4. If "save CSV" is on, write the hits to a timestamped CSV file.
    5. Save the current settings.

    Any exception is caught and printed as ``ERROR: ...`` instead of raised.
    """
    with output_area:
        clear_output(wait=True)

        verbose = verbose_logs.value
        try:
            ensure_drive_mounted(verbose)

            base_folder = drive_folder.value.strip() or DEFAULT_DRIVE_FOLDER
            os.makedirs(base_folder, exist_ok=True)
            results_folder = ensure_results_folder(base_folder, verbose=verbose)

            source_name, file_bytes = read_input_file_bytes(verbose)
            ftype = detect_file_type_from_name(source_name)

            # Decode a leading sample only to report which encoding is used.
            # NOTE(review): a fixed 20000-byte cut can end in the middle of a
            # multi-byte character; confirm decode_bytes tolerates that.
            _probe_text, used_enc = decode_bytes(file_bytes[:20000], encoding_choice.value)

            tprint(f"Source file: {source_name}", enabled=verbose)
            tprint(f"Detected type: {ftype}", enabled=verbose)
            # \u2192 is the right arrow, written as an escape so the literal
            # stays intact however this file is re-encoded.
            tprint(f"Text encoding: {encoding_choice.value} \u2192 using {used_enc}", enabled=verbose)
            blank_line(enabled=verbose)

            if ftype == "csv":
                populate_csv_controls_from_bytes(file_bytes)

                if not csv_content_column.value:
                    # Open the CSV settings panel so the user can pick a column.
                    advanced.selected_index = 1
                    raise ValueError("CSV: choose 'CSV: search in column' in Advanced settings.")

                delim = (csv_delimiter.value or ",")[:1]
                id_val = csv_id_column.value
                # An unset or empty ID selection means "no ID column".
                id_col = None if (id_val is None or str(id_val) == "") else str(id_val)

                records = load_csv_from_bytes_simple(
                    file_bytes=file_bytes,
                    source_name=source_name,
                    content_column=str(csv_content_column.value),
                    id_column=id_col,
                    delimiter=delim,
                    encoding_choice_value=encoding_choice.value,
                )

            elif ftype == "json":
                records = load_json_from_bytes(file_bytes, source_name, encoding_choice.value)

            else:
                records = load_txt_from_bytes(
                    file_bytes,
                    source_name,
                    split_mode=txt_split_mode.value,
                    encoding_choice_value=encoding_choice.value,
                )

            pattern = build_search_regex(query_text.value, mode=search_mode.value, ignore_case=True)
            tprint(f"Pattern used: {pattern.pattern}", enabled=verbose)
            blank_line(enabled=verbose)

            hits = kwic_search_records(
                records=records,
                pattern=pattern,
                context_window=int(context_window.value),
                normalize=bool(normalize_text.value),
                stop_after=int(stop_after.value),
                verbose=verbose,
            )

            print_kwic_console(hits, max_show=int(max_show.value), verbose=verbose)

            if save_csv.value:
                # Name the columns explicitly: a DataFrame built from an empty
                # list has no columns, so a search with zero hits would
                # otherwise be saved without a header row.
                result_columns = ["record_id", "left", "hit", "right"]
                df = pd.DataFrame(
                    [{"record_id": h.record_id, "left": h.left, "hit": h.hit, "right": h.right} for h in hits],
                    columns=result_columns,
                )
                stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                out_name = build_output_filename(source_name, query_text.value, stamp)
                out_path = os.path.join(results_folder, out_name)
                # utf-8-sig writes a BOM at the start of the file.
                df.to_csv(out_path, index=False, encoding="utf-8-sig")
                tprint(f"Saved results CSV: {out_path}", enabled=verbose)
                blank_line(enabled=verbose)

            # NOTE(review): settings are saved under base_folder, while
            # init_refresh_files loads them from DEFAULT_DRIVE_FOLDER; confirm
            # that settings saved for a custom folder are found on next start.
            save_settings(base_folder, collect_settings())

        except Exception as e:
            tprint(f"ERROR: {e}", enabled=True)


run_btn.on_click(on_run_clicked)


# ------------------------------
# Layout
# ------------------------------
# One vertical column holding every control, top to bottom.
ui = widgets.VBox(
    children=[
        title,
        # Input source: mode selector, then the Drive and upload controls.
        input_mode,
        drive_folder, drive_status, drive_file_dropdown, drive_tip,
        upload_widget, upload_status,
        widgets.HTML(value="<hr>"),
        # Query and display options.
        query_text, search_mode, mode_hint,
        context_window, max_show,
        advanced,
        # Action button and the area its output is written to.
        run_btn,
        output_area,
    ]
)

display(ui)


# ------------------------------
# Init (robust, no invalid selection)
# ------------------------------
def init_refresh_files() -> None:
    """
    Restore the previously saved state into the GUI.

    Steps, in order:
    - Mount Drive and create the default folder if needed
    - Load settings from the default folder
    - Apply non-dropdown settings first
    - Refresh the file list (options)
    - Restore the last file via safe_set_dropdown_value
    - Populate the CSV controls, then restore the saved CSV selections
    - Refresh the encoding hint

    Any exception is shown in the Drive status line instead of being raised.
    """
    import html  # local import so this block does not rely on a file-level one

    try:
        ensure_drive_mounted(verbose=False)
        os.makedirs(DEFAULT_DRIVE_FOLDER, exist_ok=True)

        s = load_settings(DEFAULT_DRIVE_FOLDER) or {}
        apply_settings_non_dropdowns(s)

        # \u2026 (ellipsis) and \u2705 (check mark) are written as escapes so
        # the literals stay intact however this file is re-encoded.
        drive_status.value = "<div style='color:#666; font-size: 12px; padding-left: 120px;'>Updating file list\u2026</div>"
        refresh_drive_files(verbose=False)

        # The file list must be refreshed before the saved file is restored.
        last_file = s.get("drive_file")
        if isinstance(last_file, str):
            safe_set_dropdown_value(drive_file_dropdown, last_file)

        if drive_file_dropdown.options:
            drive_status.value = "<div style='color:#2e7d32; font-size: 12px; padding-left: 120px;'>\u2705 File list updated.</div>"
        else:
            drive_status.value = "<div style='color:#b71c1c; font-size: 12px; padding-left: 120px;'>No .json/.txt/.csv files found in this folder.</div>"

        # Populate the column dropdowns before restoring the saved choices.
        populate_csv_controls_from_selected_file()

        saved_content = s.get("csv_content_column")
        if isinstance(saved_content, str):
            safe_set_dropdown_value(csv_content_column, saved_content)

        saved_id = s.get("csv_id_column")
        if isinstance(saved_id, str):
            safe_set_dropdown_value(csv_id_column, saved_id)

        update_encoding_hint()

    except Exception as e:
        # The message may contain a path or other text with "<" or "&";
        # escape it so it is displayed rather than parsed as markup.
        drive_status.value = (
            "<div style='color:#b71c1c; font-size: 12px; padding-left: 120px;'>"
            f"Init error: {html.escape(str(e))}</div>"
        )


init_refresh_files()
update_visibility()
update_encoding_hint()


VBox(children=(HTML(value='<h3>KWIC Search</h3>'), RadioButtons(description='Input:', layout=Layout(width='760…