In [1]:
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# =========================
# 1) Load
# =========================
# Column names used throughout the script.
ID_COL = "id"
TARGET = "response_label"

print("1) Reading data...")
# Both files are read from the current working directory, train first.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Binary target: 1 where the label is exactly "unsafe", 0 for anything else.
is_unsafe = train[TARGET].values == "unsafe"
y = is_unsafe.astype(int)

# =========================
# 2) Penalty scoring (0/1/3)
# =========================
def penalty_score(y_true, p_unsafe, t_low, t_high):
    """Return the total 0/1/3 penalty of a two-threshold decision rule.

    Each sample is classified from its unsafe-probability:

    * ``p < t_low``  -> predicted safe (0)
    * ``p > t_high`` -> predicted unsafe (1)
    * otherwise      -> "notSure" (abstain)

    An abstention costs 1, a correct prediction costs 0 and a wrong
    prediction costs 3.

    Args:
        y_true: array-like of ground-truth labels (0 = safe, 1 = unsafe).
        p_unsafe: array-like of predicted probabilities of the unsafe class,
            same length as ``y_true``.
        t_low: probabilities strictly below this are predicted safe.
        t_high: probabilities strictly above this are predicted unsafe.

    Returns:
        The summed penalty as a plain Python ``int``.

    Raises:
        ValueError: if ``y_true`` and ``p_unsafe`` differ in shape (a plain
            ``zip`` would silently truncate to the shorter input).
    """
    labels = np.asarray(y_true)
    probs = np.asarray(p_unsafe, dtype=float)
    if labels.shape != probs.shape:
        raise ValueError(
            f"y_true and p_unsafe must have the same shape, "
            f"got {labels.shape} and {probs.shape}"
        )

    says_safe = probs < t_low
    # The "safe" test has priority, so a sample that satisfies both
    # comparisons (only possible when t_low > t_high) counts as safe.
    says_unsafe = (probs > t_high) & ~says_safe
    # NaN probabilities fail both comparisons and therefore abstain.
    abstains = ~(says_safe | says_unsafe)
    wrong = (says_safe & (labels != 0)) | (says_unsafe & (labels != 1))

    return int(np.count_nonzero(abstains) + 3 * np.count_nonzero(wrong))

def pred_3class(p_unsafe, t_low, t_high):
    """Turn unsafe-probabilities into "safe" / "notSure" / "unsafe" labels.

    Args:
        p_unsafe: array-like of predicted probabilities of the unsafe class
            (NumPy array, list, pandas Series, ...).
        t_low: probabilities strictly below this become ``"safe"``.
        t_high: probabilities strictly above this become ``"unsafe"``.

    Returns:
        An object-dtype NumPy array with the same shape as ``p_unsafe``.
        Values inside ``[t_low, t_high]`` (and NaN) stay ``"notSure"``.
    """
    # Coerce first: comparing a plain Python list with a float raises TypeError.
    probs = np.asarray(p_unsafe, dtype=float)

    out = np.full(probs.shape, "notSure", dtype=object)
    out[probs < t_low] = "safe"
    # Assigned last, so "unsafe" wins if a value satisfies both comparisons
    # (only possible when t_low > t_high).
    out[probs > t_high] = "unsafe"
    return out

# =========================
# 3) Feature blocks
#    - Prompt: word (1,2)
#    - Response: word (1,2) + char_wb (3,5)
# =========================
# Settings common to every TF-IDF block.
_TFIDF_COMMON = dict(max_df=0.95, strip_accents="unicode", sublinear_tf=True)
# Word-level uni/bi-gram settings shared by the prompt and response blocks.
_TFIDF_WORD = dict(ngram_range=(1, 2), min_df=2, **_TFIDF_COMMON)

prompt_word = TfidfVectorizer(**_TFIDF_WORD)
response_word = TfidfVectorizer(**_TFIDF_WORD)

# Character n-grams restricted to word boundaries, response text only.
response_char = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    min_df=3,
    **_TFIDF_COMMON,
)

# Each entry is (name, vectorizer, source column); "response" feeds two
# different vectorizers. Columns not listed are dropped.
featurizer = ColumnTransformer(
    transformers=[
        ("p_word", prompt_word, "prompt"),
        ("r_word", response_word, "response"),
        ("r_char", response_char, "response"),
    ],
    remainder="drop",
)

# =========================
# 4) Two models for ensemble
#    A) Logistic Regression
#    B) LinearSVC + calibration (sigmoid) => proba
# =========================
# Each pipeline gets its OWN unfitted copy of the featurizer. Pipeline does not
# copy its steps, so passing the same ColumnTransformer object to both would
# make them share fitted state: fitting one pipeline would silently refit the
# transformer used by the other.
#
# random_state is pinned on both liblinear-based estimators (same seed as the
# CV splitter) so that repeated runs produce the same probabilities.
model_lr = Pipeline([
    ("tfidf", clone(featurizer)),
    ("clf", LogisticRegression(
        solver="liblinear",
        C=6.0,
        max_iter=5000,
        class_weight="balanced",
        random_state=42,
    )),
])

# LinearSVC has no predict_proba; CalibratedClassifierCV fits a sigmoid on its
# decision_function output (3 internal folds) to provide probabilities.
model_svc = Pipeline([
    ("tfidf", clone(featurizer)),
    ("clf", CalibratedClassifierCV(
        estimator=LinearSVC(C=1.5, class_weight="balanced", random_state=42),
        method="sigmoid",
        cv=3,
    )),
])

def fit_predict_proba(model, X_train, y_train, X_val, X_test):
    """Fit ``model`` and return its positive-class probabilities.

    The model is fitted on ``(X_train, y_train)`` and then scored on the
    validation set followed by the test set.

    Returns:
        A pair ``(p_val, p_test)`` holding column 1 of ``predict_proba``
        for ``X_val`` and ``X_test`` respectively.
    """
    positive_col = 1

    model.fit(X_train, y_train)
    val_proba = model.predict_proba(X_val)
    holdout_proba = model.predict_proba(X_test)

    return val_proba[:, positive_col], holdout_proba[:, positive_col]

# =========================
# 5) OOF + Test probs (5-fold)
# =========================
print("2) Training 5-fold OOF (Ensemble LR + CalibratedSVC)...")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

TEXT_COLS = ["prompt", "response"]
n_train, n_test = len(train), len(test)

# Out-of-fold predictions: every training row is filled exactly once, by the
# fold in which it was held out.
oof_lr, oof_svc = np.zeros(n_train, dtype=float), np.zeros(n_train, dtype=float)
# Test predictions: accumulated as the mean over the fold models.
test_lr, test_svc = np.zeros(n_test, dtype=float), np.zeros(n_test, dtype=float)

# The test features are the same for every fold, so select them once.
X_te = test[TEXT_COLS]

for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y), start=1):
    X_tr, y_tr = train.iloc[tr_idx][TEXT_COLS], y[tr_idx]
    X_va = train.iloc[va_idx][TEXT_COLS]

    # Logistic regression
    p_va_lr, p_te_lr = fit_predict_proba(model_lr, X_tr, y_tr, X_va, X_te)
    oof_lr[va_idx] = p_va_lr
    test_lr += p_te_lr / skf.n_splits

    # Calibrated linear SVC
    p_va_svc, p_te_svc = fit_predict_proba(model_svc, X_tr, y_tr, X_va, X_te)
    oof_svc[va_idx] = p_va_svc
    test_svc += p_te_svc / skf.n_splits

    print(f"Fold {fold} done.")

# Ensemble: unweighted mean of the two models' probabilities.
oof = 0.5 * (oof_lr + oof_svc)
test_probs = 0.5 * (test_lr + test_svc)

# =========================
# 6) Threshold search (grid) on OOF
# =========================
print("3) Searching thresholds (t_low, t_high) on OOF penalty...")

# Grid ranges (increase the number of points for a finer search if desired)
t_lows = np.linspace(0.05, 0.45, 81)    # step ~0.005
t_highs = np.linspace(0.55, 0.99, 89)   # step ~0.005

# Exhaustive scan; on ties the first pair encountered is kept.
best_pen, best_pair = None, None
for tl in t_lows:
    # Only upper cut-offs strictly above the lower one form a valid band.
    for th in t_highs[t_highs > tl]:
        pen = penalty_score(y, oof, tl, th)
        if best_pen is not None and pen >= best_pen:
            continue
        best_pen, best_pair = pen, (float(tl), float(th))

t_low, t_high = best_pair
print(f"[BEST] OOF penalty={best_pen} | t_low={t_low:.3f} t_high={t_high:.3f}")

# =========================
# 7) Build submission
# =========================
# Apply the tuned thresholds to the ensembled test probabilities.
pred_labels = pred_3class(test_probs, t_low, t_high)

# Two-column frame: the test ids followed by the predicted label.
sub = test[[ID_COL]].copy()
sub["response_label"] = pred_labels
sub.to_csv("submission.csv", index=False)

print("Saved: submission.csv")
# Show how many test rows fall into each of the three labels.
label_counts = pd.Series(pred_labels).value_counts()
print(label_counts)

Fold 1 done.
Fold 2 done.
Fold 3 done.
Fold 4 done.
Fold 5 done.
[BEST] OOF penalty=663 | t_low=0.420 t_high=0.950
Saved submission.csv
safe       1067
notSure     132
unsafe       12
Name: count, dtype: int64
