In [1]:
# === Kazakh Missing-Word Position: Heuristic + Unsupervised Bigram Baseline ===
# New: if the first alphabetic character of the line (Kazakh Cyrillic included) is lowercase -> predict 0.
# Otherwise pick the gap with the lowest P(R|L) according to bigram statistics.
# Default files: train.csv, public_test.csv and sample_submission.csv under /kaggle/input/tst-day-4-upsolving/

import pandas as pd
import numpy as np
import re
from collections import Counter
from typing import List, Tuple

# ----- Paths (change them if needed) -----
# Input CSV files, read by main().
PATH_TRAIN = "/kaggle/input/tst-day-4-upsolving/train.csv"
PATH_TEST  = "/kaggle/input/tst-day-4-upsolving/public_test.csv"
PATH_SUB   = "/kaggle/input/tst-day-4-upsolving/sample_submission.csv"
# Output file for the predictions, written by main().
OUT_PATH   = "submission.csv"

# ----- Robust CSV reading -----
def try_read_csv(path: str) -> pd.DataFrame:
    """Read a CSV file, trying several common encodings in turn.

    The encodings are attempted in this order: pandas' default, then
    ``utf-8-sig``, ``cp1251`` and ``latin1``. The first successful parse wins.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame.

    Raises:
        RuntimeError: If every attempt fails. The error from the last attempt
            is chained as ``__cause__`` so the underlying reason (missing
            file, malformed CSV, ...) is not lost.
    """
    last_error = None
    # ``encoding=None`` is what ``pd.read_csv`` uses when the argument is
    # omitted, so the default attempt needs no special case.
    for enc in (None, "utf-8-sig", "cp1251", "latin1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception as err:  # best effort: any failure moves on to the next encoding
            last_error = err
    raise RuntimeError(f"Failed to read {path} with common encodings.") from last_error

# ----- Text column detection -----
def guess_text_col(df: pd.DataFrame) -> str:
    """Pick the column that most likely holds free text.

    Candidates are the object-dtype columns, or every column when there are
    none. The candidate whose values have the largest mean string length
    wins; on a tie the earliest candidate is kept.

    Args:
        df: Frame to inspect.

    Returns:
        The label of the selected column.
    """
    candidates = [col for col in df.columns if df[col].dtype == "object"]
    if not candidates:
        candidates = list(df.columns)

    winner = candidates[0]
    top_score = -1
    for col in candidates:
        try:
            lengths = df[col].astype(str).str.len()
            score = lengths.fillna(0).mean()
        except Exception:
            # A column that cannot be measured never beats a measurable one.
            score = -1
        if score > top_score:
            winner, top_score = col, score
    return winner

# ----- Tokenization (words/digits, plus each punctuation mark as its own token) -----
TOKEN_RE = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)

def tokenize(s: str) -> List[str]:
    """Split text into word tokens and single-character punctuation tokens.

    Args:
        s: Text to split. A non-string value is converted with ``str``;
            a missing value (as reported by ``pd.isna``) counts as empty text.

    Returns:
        The tokens in order of appearance; whitespace is never a token.
    """
    if isinstance(s, str):
        text = s
    elif pd.isna(s):
        text = ""
    else:
        text = str(s)
    # Trim the ends and collapse each whitespace run to a single space.
    collapsed = re.sub(r"\s+", " ", text.strip())
    return TOKEN_RE.findall(collapsed)

# ----- Is the first letter lowercase? (leading non-letters are ignored) -----
def first_alpha_is_lower(s: str) -> bool:
    """Tell whether the first alphabetic character of ``s`` is lowercase.

    Leading quotes, dashes, brackets, digits and the like are skipped.

    Args:
        s: Text to inspect.

    Returns:
        ``True`` when the first letter is lowercase; ``False`` when it is
        not, or when the text contains no letter at all.
    """
    first_letter = next((ch for ch in s.strip() if ch.isalpha()), None)
    if first_letter is None:
        return False
    return first_letter.islower()

# ----- Corpus statistics -----
def build_stats(series_list: List[pd.Series]) -> Tuple[Counter, Counter]:
    """Count unigrams and adjacent-token bigrams over several text series.

    Each text is tokenized with ``tokenize`` and wrapped in ``<BOS>`` /
    ``<EOS>`` markers; both markers are counted as unigrams and take part in
    the bigrams. Missing values are treated as empty sentences.

    Args:
        series_list: Series of sentences to accumulate over.

    Returns:
        ``(unigram, bigram)`` where ``unigram`` maps a token to its count and
        ``bigram`` maps a ``(left, right)`` token pair to its count.
    """
    unigram, bigram = Counter(), Counter()
    for ser in series_list:
        # Fill missing values BEFORE casting to str: casting first can turn
        # NaN into the literal text "nan", which would then be counted as a
        # real token. The object cast lets "" be filled into any column dtype.
        texts = ser.astype(object).fillna("").astype(str).tolist()
        for s in texts:
            seq = ["<BOS>", *tokenize(s), "<EOS>"]
            unigram.update(seq)
            bigram.update(zip(seq, seq[1:]))
    return unigram, bigram

# Smoothed conditional probability P(R|L) ~ (c(L,R)+alpha)/(c(L)+alpha*V)
def logP_cond(l: str, r: str, unigram_cnt: Counter, bigram_cnt: Counter, alpha: float = 0.5) -> float:
    """Return the log of the additively smoothed probability of ``r`` after ``l``.

    Args:
        l: Left token.
        r: Right token.
        unigram_cnt: Token counts; its size (at least 1) is used as the
            vocabulary size ``V``.
        bigram_cnt: Counts keyed by ``(left, right)`` token pairs.
        alpha: Additive smoothing constant.

    Returns:
        ``log((c(l, r) + alpha) / (c(l) + alpha * V) + 1e-12)`` as a float.
    """
    vocab_size = max(1, len(unigram_cnt))
    denominator = unigram_cnt[l] + alpha * vocab_size
    numerator = bigram_cnt[(l, r)] + alpha
    # The small epsilon keeps the logarithm finite when the ratio is zero.
    return float(np.log(numerator / denominator + 1e-12))

# Position prediction:
#   1) if the first letter is lowercase -> 0
#   2) otherwise the gap with the minimal logP_cond(L, R)
def predict_gap(s: str, unigram_cnt: Counter, bigram_cnt: Counter) -> int:
    """Predict the gap at which a word is missing from ``s``.

    Gap ``g`` lies just before token ``g``; gap 0 is the start of the text
    and the last gap is its end. When several gaps share the minimal score,
    the earliest one is returned.

    NOTE(review): gaps are counted over the output of ``tokenize``, where
    punctuation marks are separate tokens. Confirm that this matches the
    index convention expected in the submission.

    Args:
        s: Sentence with one word removed.
        unigram_cnt: Token counts passed on to ``logP_cond``.
        bigram_cnt: Token-pair counts passed on to ``logP_cond``.

    Returns:
        The predicted gap index.
    """
    if first_alpha_is_lower(s):
        return 0
    padded = ["<BOS>", *tokenize(s), "<EOS>"]
    # One score per gap: the left neighbour followed by the right neighbour.
    gap_scores = [
        logP_cond(left, right, unigram_cnt, bigram_cnt, alpha=0.5)
        for left, right in zip(padded, padded[1:])
    ]
    return int(np.argmin(gap_scores))

def main():
    """Run the whole pipeline: read the data, predict gaps, write the submission.

    Reads ``PATH_TRAIN``, ``PATH_TEST`` and ``PATH_SUB``, builds unigram and
    bigram counts from the train text together with the masked test text,
    predicts one gap index per test row and writes the filled-in copy of the
    sample submission to ``OUT_PATH``.

    Raises:
        ValueError: If there are fewer predictions than submission rows.
    """
    # 1) Load
    train = try_read_csv(PATH_TRAIN)
    test  = try_read_csv(PATH_TEST)
    sub   = try_read_csv(PATH_SUB)

    # 2) Columns
    train_text_col = guess_text_col(train)
    test_text_col  = "masked_sentence" if "masked_sentence" in test.columns else guess_text_col(test)
    id_col         = "ID" if "ID" in sub.columns else sub.columns[0]
    pred_col       = "word_index" if "word_index" in sub.columns else [c for c in sub.columns if c != id_col][0]

    print("Train columns:", list(train.columns))
    print("Test  columns:", list(test.columns))
    print("Sub   columns:", list(sub.columns))
    print(f"[Guess] train text='{train_text_col}', test text='{test_text_col}', id='{id_col}', pred='{pred_col}'")

    # 3) Corpus statistics (train + masked test)
    unigram_cnt, bigram_cnt = build_stats([train[train_text_col], test[test_text_col]])

    # 4) Predictions
    # Fill missing values BEFORE casting to str: casting first can turn NaN
    # into the literal text "nan", whose lowercase first letter would force a
    # prediction of 0 for that row.
    masked_texts = test[test_text_col].astype(object).fillna("").astype(str)
    preds = [predict_gap(s, unigram_cnt, bigram_cnt) for s in masked_texts]

    # 5) Submission
    submission = sub.copy()
    if len(preds) < len(submission):
        # Assigning a shorter list would fail inside pandas with a less
        # helpful message, so fail early and say which side is short.
        raise ValueError(
            f"Got {len(preds)} predictions for {len(submission)} submission rows."
        )
    if len(preds) > len(submission):
        print(f"Warning: truncating {len(preds)} predictions to {len(submission)} submission rows.")
        preds = preds[:len(submission)]
    submission[pred_col] = preds
    submission.to_csv(OUT_PATH, index=False)

    print(f"Saved: {OUT_PATH}")
    print(submission.head())

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


Train columns: ['sentence']
Test  columns: ['ID', 'masked_sentence']
Sub   columns: ['ID', 'word_index']
[Guess] train text='sentence', test text='masked_sentence', id='ID', pred='word_index'
Saved: submission.csv
   ID  word_index
0   1           1
1   2           6
2   3           9
3   4           5
4   5           0
