In [27]:
import os
print(os.getcwd())

/Users/tiffanytseng/Documents/ai-review-moderation-2/notebooks


In [29]:
# =============================================
# 🧹 Step 1. 建立資料路徑 + 清理重複與亂文
# =============================================

from pathlib import Path
import pandas as pd
import numpy as np
import os

# --- 自動偵測專案根目錄 ---
ROOT = Path(os.getcwd()).parent          # e.g. /Users/tiffanytseng/Documents/ai-review-moderation-2
DATA = ROOT / "data"
RAW = DATA / "raw"
PROC = DATA / "processed"
LABELED = DATA / "labeled"

# 確保資料夾存在
PROC.mkdir(parents=True, exist_ok=True)
LABELED.mkdir(parents=True, exist_ok=True)

print("✅ 專案根目錄:", ROOT)
print("📁 RAW:", RAW)
print("📁 PROC:", PROC)
print("📁 LABELED:", LABELED)

# --- 讀取 01 的輸出 ---
SRC = RAW / "exploratory_output.csv"
print("\n讀檔：", SRC)

df = pd.read_csv(SRC)
print("原始筆數：", len(df))

# --- 轉換欄位型別（防止字串布林混亂） ---
bool_cols = ["_is_low_quality_v4", "_is_duplicate"]
for c in bool_cols:
    if c in df.columns:
        if df[c].dtype == "O":
            df[c] = df[c].astype(str).str.lower().map({"true": True, "false": False})
        df[c] = df[c].fillna(False).astype(bool)
    else:
        df[c] = False

# --- gibberish 分數欄位 ---
if "_gibberish_score_v2" not in df.columns:
    df["_gibberish_score_v2"] = 0.0
df["_gibberish_score_v2"] = pd.to_numeric(df["_gibberish_score_v2"], errors="coerce").fillna(0.0)

# --- 清理條件 ---
mask_clean = (
    (~df["_is_low_quality_v4"]) &
    (~df["_is_duplicate"]) &
    (df["_gibberish_score_v2"] < 0.5)
)

clean_df = df[mask_clean].copy()

# --- 檢查數據 ---
assert len(clean_df) <= len(df), "❌ 乾淨筆數不應大於原始筆數。請檢查路徑或變數污染。"

print("\n✅ 原始筆數:", len(df))
print("✅ 乾淨筆數:", len(clean_df))
print("✅ 刪除比例:", round((1 - len(clean_df)/len(df)) * 100, 2), "%")

# --- 匯出乾淨與被刪資料 ---
clean_out = PROC / "clean_no_gibberish_dups.csv"
dropped_out = PROC / "_dropped_rows.csv"

clean_df.to_csv(clean_out, index=False)
df.loc[~mask_clean].to_csv(dropped_out, index=False)

print("\n💾 已輸出乾淨資料：", clean_out)
print("🗑️ 被刪資料：", dropped_out)


✅ 專案根目錄: /Users/tiffanytseng/Documents/ai-review-moderation-2
📁 RAW: /Users/tiffanytseng/Documents/ai-review-moderation-2/data/raw
📁 PROC: /Users/tiffanytseng/Documents/ai-review-moderation-2/data/processed
📁 LABELED: /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled

讀檔： /Users/tiffanytseng/Documents/ai-review-moderation-2/data/raw/exploratory_output.csv
原始筆數： 32183

✅ 原始筆數: 32183
✅ 乾淨筆數: 30635
✅ 刪除比例: 4.81 %

💾 已輸出乾淨資料： /Users/tiffanytseng/Documents/ai-review-moderation-2/data/processed/clean_no_gibberish_dups.csv
🗑️ 被刪資料： /Users/tiffanytseng/Documents/ai-review-moderation-2/data/processed/_dropped_rows.csv


In [36]:
from pathlib import Path
import pandas as pd
import re

# 路徑（你在 notebooks/ 底下）
ROOT = Path.cwd().parent
LABELED = ROOT / "data" / "labeled"
PROC = ROOT / "data" / "processed"

ANN = LABELED / "for_annotation.csv"
print("讀取人工標註：", ANN)
ann = pd.read_csv(ANN)

# --- 欄名正規化：去空白 -> 小寫 -> 空白轉底線 ---
ann.columns = [c.strip().lower().replace(" ", "_") for c in ann.columns]

# --- 檢查必要欄位（label 用小寫）---
need_cols = {"object_id","complex_id","review_text","label"}
missing = need_cols - set(ann.columns)
if missing:
    raise ValueError(f"缺少欄位：{missing}（請在 for_annotation.csv 補上）")

# --- 清理 label 命名（先標準化，再合併 non-apartment -> test_like）---
ann["label"] = (
    ann["label"].astype(str).str.strip().str.lower()
      .replace({
          "real-apartment-review": "real_apartment_review",
          "real_apartment_review": "real_apartment_review",
          "test-like":            "test_like",
          "test_like":            "test_like",
          "non-apartment":        "test_like",      # 直接併入 test_like
          "non_apartment":        "test_like",      # 直接併入 test_like
      })
)

# --- 僅保留二類 ---
allowed = {"real_apartment_review", "test_like"}
ann = ann[ann["label"].isin(allowed)].copy()

# --- 文字簡單清理（不破壞語意）---
def basic_clean(s: str) -> str:
    return re.sub(r"\s+", " ", str(s)).strip()

ann["review_text"] = ann["review_text"].map(basic_clean)

# --- 以文字去重（你也可改成 ["object_id","review_text"] 更嚴格）---
before = len(ann)
ann = ann.drop_duplicates(subset=["review_text"]).copy()

print(f"人工標註筆數：{before} → 去重後：{len(ann)}")
print("\nLabel 分佈：")
print(ann["label"].value_counts())
print((ann["label"].value_counts(normalize=True)*100).round(2))

# （可選）覆寫保存清理後的標註檔
out_path = LABELED / "for_annotation.cleaned.csv"
ann.to_csv(out_path, index=False)
print("\n✅ 已輸出清理後標註檔：", out_path)


讀取人工標註： /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled/for_annotation.csv
人工標註筆數：300 → 去重後：153

Label 分佈：
label
real_apartment_review    136
test_like                 17
Name: count, dtype: int64
label
real_apartment_review    88.89
test_like                11.11
Name: proportion, dtype: float64

✅ 已輸出清理後標註檔： /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled/for_annotation.cleaned.csv


In [32]:
from sklearn.model_selection import GroupShuffleSplit
from pathlib import Path

# 以 complex_id 做群組，避免同社區同時在 train/val/test
groups = ann["complex_id"].astype(str)
y = ann["Label"]

# train : temp(=val+test) = 80% : 20%
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, temp_idx = next(gss1.split(ann, y, groups))

train_df = ann.iloc[train_idx].copy()
temp_df  = ann.iloc[temp_idx].copy()

# temp 再劃分成 val : test = 50% : 50%（即各 10%）
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(gss2.split(temp_df, temp_df["Label"], temp_df["complex_id"].astype(str)))

val_df  = temp_df.iloc[val_idx].copy()
test_df = temp_df.iloc[test_idx].copy()

# 確認分佈
def show_dist(name, d):
    print(f"\n{name} 份量：{len(d)}")
    print(d["Label"].value_counts())
    print((d["Label"].value_counts(normalize=True)*100).round(2))

for nm, d in [("Train",train_df),("Val",val_df),("Test",test_df)]:
    show_dist(nm, d)

# 匯出
LABELED.mkdir(parents=True, exist_ok=True)
train_df.to_csv(LABELED / "train.csv", index=False)
val_df.to_csv(LABELED / "val.csv", index=False)
test_df.to_csv(LABELED / "test.csv", index=False)
print("\n✅ 已輸出：")
print(" - data/labeled/train.csv")
print(" - data/labeled/val.csv")
print(" - data/labeled/test.csv")



Train 份量：117
Label
real_apartment_review    106
test_like                 10
non_apartment              1
Name: count, dtype: int64
Label
real_apartment_review    90.60
test_like                 8.55
non_apartment             0.85
Name: proportion, dtype: float64

Val 份量：25
Label
real_apartment_review    20
test_like                 5
Name: count, dtype: int64
Label
real_apartment_review    80.0
test_like                20.0
Name: proportion, dtype: float64

Test 份量：11
Label
real_apartment_review    10
test_like                 1
Name: count, dtype: int64
Label
real_apartment_review    90.91
test_like                 9.09
Name: proportion, dtype: float64

✅ 已輸出：
 - data/labeled/train.csv
 - data/labeled/val.csv
 - data/labeled/test.csv


In [33]:
import pandas as pd
train_df = pd.read_csv("../data/labeled/train.csv")
val_df   = pd.read_csv("../data/labeled/val.csv")
test_df  = pd.read_csv("../data/labeled/test.csv")


In [38]:
from pathlib import Path
import pandas as pd

ROOT = Path.cwd().parent
LABELED = ROOT / "data" / "labeled"

src_clean = LABELED / "for_annotation.cleaned.csv"
src_raw   = LABELED / "for_annotation.csv"
SRC = src_clean if src_clean.exists() else src_raw
print("讀取標註來源：", SRC)

ann = pd.read_csv(SRC)

# 欄名正規化
ann.columns = [c.strip().lower().replace(" ", "_") for c in ann.columns]

# 檢查必要欄位
need = {"object_id","complex_id","review_text","label"}
missing = need - set(ann.columns)
if missing:
    raise ValueError(f"缺少欄位：{missing}")

# 僅二類
ann["label"] = ann["label"].astype(str).str.strip().str.lower().replace({
    "non-apartment": "test_like",
    "non_apartment": "test_like",
    "real-apartment-review": "real_apartment_review",
})
allowed = {"real_apartment_review","test_like"}
ann = ann[ann["label"].isin(allowed)].copy()

print("總筆數：", len(ann))
print(ann["label"].value_counts())


讀取標註來源： /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled/for_annotation.cleaned.csv
總筆數： 153
label
real_apartment_review    136
test_like                 17
Name: count, dtype: int64


In [39]:
from sklearn.model_selection import GroupShuffleSplit
import numpy as np

groups = ann["complex_id"].astype(str)
y = ann["label"]

def grouped_split(df, test_size=0.2, val_size=0.5, random_state=42, max_tries=100):
    # 先切 train vs temp
    for seed in range(random_state, random_state + max_tries):
        gss1 = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
        tr_idx, tmp_idx = next(gss1.split(df, df["label"], df["complex_id"].astype(str)))
        train_df = df.iloc[tr_idx].copy()
        tmp_df   = df.iloc[tmp_idx].copy()

        # 再把 temp 切成 val/test
        gss2 = GroupShuffleSplit(n_splits=1, test_size=val_size, random_state=seed+1)
        v_idx, te_idx = next(gss2.split(tmp_df, tmp_df["label"], tmp_df["complex_id"].astype(str)))
        val_df  = tmp_df.iloc[v_idx].copy()
        test_df = tmp_df.iloc[te_idx].copy()

        # 確保每個 split 都包含兩類
        ok = (set(train_df["label"].unique()) == {"real_apartment_review","test_like"} and
              set(val_df["label"].unique())   == {"real_apartment_review","test_like"} and
              set(test_df["label"].unique())  == {"real_apartment_review","test_like"})
        if ok:
            return train_df, val_df, test_df

    raise RuntimeError("分組抽樣多次仍無法讓每個 split 都含兩類；請增加樣本或調整比例。")

train_df, val_df, test_df = grouped_split(ann, test_size=0.2, val_size=0.5, random_state=42)

def show_dist(name, d):
    print(f"\n{name}: {len(d)}")
    print(d["label"].value_counts())
    print((d["label"].value_counts(normalize=True)*100).round(2))

show_dist("Train", train_df)
show_dist("Val",   val_df)
show_dist("Test",  test_df)

# 匯出
train_df.to_csv(LABELED / "train.csv", index=False)
val_df.to_csv(LABELED / "val.csv", index=False)
test_df.to_csv(LABELED / "test.csv", index=False)
print("\n✅ 已輸出 train/val/test 到 data/labeled/")



Train: 117
label
real_apartment_review    106
test_like                 11
Name: count, dtype: int64
label
real_apartment_review    90.6
test_like                 9.4
Name: proportion, dtype: float64

Val: 21
label
real_apartment_review    20
test_like                 1
Name: count, dtype: int64
label
real_apartment_review    95.24
test_like                 4.76
Name: proportion, dtype: float64

Test: 15
label
real_apartment_review    10
test_like                 5
Name: count, dtype: int64
label
real_apartment_review    66.67
test_like                33.33
Name: proportion, dtype: float64

✅ 已輸出 train/val/test 到 data/labeled/


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pandas as pd
import numpy as np

X_train, y_train = train_df["review_text"].astype(str), train_df["label"].astype(str)
X_val,   y_val   = val_df["review_text"].astype(str),   val_df["label"].astype(str)
X_test,  y_test  = test_df["review_text"].astype(str),  test_df["label"].astype(str)

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=2,
        max_df=0.98,
        sublinear_tf=True
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

pipe.fit(X_train, y_train)

def eval_split(name, X, y):
    pred = pipe.predict(X)
    proba = pipe.predict_proba(X)[:, pipe.classes_.tolist().index("test_like")]  # 取 test_like 的機率
    print(f"\n=== {name} ===")
    print(classification_report(y, pred, target_names=["real_apartment_review","test_like"], zero_division=0))
    try:
        auc = roc_auc_score((y=="test_like").astype(int), proba)
        print("ROC-AUC(test_like):", round(auc, 4))
    except Exception as e:
        print("ROC-AUC 無法計算：", e)
    return pred, proba

pred_val,  proba_val  = eval_split("VAL",  X_val,  y_val)
pred_test, proba_test = eval_split("TEST", X_test, y_test)

# 混淆矩陣（test）
cm = confusion_matrix(y_test, pred_test, labels=["real_apartment_review","test_like"])
cm_df = pd.DataFrame(cm, index=["real_apartment_review","test_like"], columns=["real_apartment_review","test_like"])
cm_df



=== VAL ===
                       precision    recall  f1-score   support

real_apartment_review       1.00      1.00      1.00        20
            test_like       1.00      1.00      1.00         1

             accuracy                           1.00        21
            macro avg       1.00      1.00      1.00        21
         weighted avg       1.00      1.00      1.00        21

ROC-AUC(test_like): 1.0

=== TEST ===
                       precision    recall  f1-score   support

real_apartment_review       0.77      1.00      0.87        10
            test_like       1.00      0.40      0.57         5

             accuracy                           0.80        15
            macro avg       0.88      0.70      0.72        15
         weighted avg       0.85      0.80      0.77        15

ROC-AUC(test_like): 0.96


Unnamed: 0,real_apartment_review,test_like
real_apartment_review,10,0
test_like,3,2


In [41]:
from pathlib import Path
import pandas as pd

ROOT = Path.cwd().parent
PROC = ROOT / "data" / "processed"
LABELED = ROOT / "data" / "labeled"

clean_path = PROC / "clean_no_gibberish_dups.csv"
ann_path   = LABELED / "for_annotation.cleaned.csv"

pool = pd.read_csv(clean_path)
pool.columns = [c.strip().lower().replace(" ", "_") for c in pool.columns]

if ann_path.exists():
    ann = pd.read_csv(ann_path)
else:
    ann = pd.read_csv(LABELED / "for_annotation.csv")
ann.columns = [c.strip().lower().replace(" ", "_") for c in ann.columns]

labeled_ids = set(ann["object_id"].astype(str))
pool = pool[~pool["object_id"].astype(str).isin(labeled_ids)].copy()
print("未標註池大小：", len(pool))


未標註池大小： 30355


In [42]:
import numpy as np

# 取得 "test_like" 在 pipe.classes_ 的 index
classes = pipe.classes_.tolist()
assert "test_like" in classes, f"你的模型類別：{classes}，找不到 'test_like'。"
ti = classes.index("test_like")

# 對未標註池預測
proba = pipe.predict_proba(pool["review_text"].astype(str))[:, ti]
pool["_proba_test_like"] = proba

# 方便人工標註的欄位子集
view_cols = ["object_id","complex_id","review_text","_proba_test_like"]
pool_preview = pool[view_cols].copy().sort_values("_proba_test_like", ascending=False)
pool_preview.head(5)


Unnamed: 0,object_id,complex_id,review_text,_proba_test_like
5732,111700287,3101234567,x Be Specific - Don&apos;t just complain about...,0.783889
26986,112082546,9199332346275143593,n/a ehh not applicableMillions of people use ...,0.711205
6548,112685038,410356506221117,Please I don't want to complete this section.M...,0.686399
20064,217700,281890283977070,Not a great choice,0.681807
270,112823250,9199332346275192547,"JOE TEST Anon - Lorem ipsum dolor sit amet, co...",0.614671


In [43]:
# 1) 高機率 test_like（建議快速標記：幾乎都是）
hi = pool_preview.query("_proba_test_like >= 0.80").head(200)

# 2) 不確定區（模型最需要你提供訊息）
uncertain = pool_preview.query("_proba_test_like >= 0.40 and _proba_test_like <= 0.60").head(200)

# 3) 從被刪清單回收（多半是亂文/模板）
dropped = pd.read_csv(PROC / "_dropped_rows.csv")
dropped.columns = [c.strip().lower().replace(" ", "_") for c in dropped.columns]
pick_cols = [c for c in view_cols if c in dropped.columns] + [c for c in ["_gibberish_score_v2","_is_low_quality_v4"] if c in dropped.columns]
dropped_preview = dropped[pick_cols].copy()
# 取亂文較高或被標為低品質者
if "_gibberish_score_v2" in dropped_preview:
    backfill = dropped_preview.sort_values("_gibberish_score_v2", ascending=False).head(200)
else:
    backfill = dropped_preview.head(200)

# 輸出成待標註 CSV
LABELED.mkdir(parents=True, exist_ok=True)
hi_out   = LABELED / "round2_candidates_highprob.csv"
unc_out  = LABELED / "round2_candidates_uncertain.csv"
drop_out = LABELED / "round2_candidates_from_dropped.csv"

hi.to_csv(hi_out, index=False)
uncertain.to_csv(unc_out, index=False)
backfill.to_csv(drop_out, index=False)

print("✅ 已輸出：")
print(" -", hi_out)
print(" -", unc_out)
print(" -", drop_out)
print("合計候選：", len(hi), "+", len(uncertain), "+", len(backfill))


✅ 已輸出：
 - /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled/round2_candidates_highprob.csv
 - /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled/round2_candidates_uncertain.csv
 - /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled/round2_candidates_from_dropped.csv
合計候選： 0 + 200 + 200


In [44]:
from pathlib import Path
import pandas as pd

ROOT = Path.cwd().parent
LABELED = ROOT / "data" / "labeled"

# 讀原標註
base = pd.read_csv(LABELED / "for_annotation.cleaned.csv")
base.columns = [c.strip().lower().replace(" ", "_") for c in base.columns]

# 讀回合併
parts = []
for name in ["round2_labeled_highprob.csv", "round2_labeled_uncertain.csv", "round2_labeled_dropped.csv"]:
    p = LABELED / name
    if p.exists():
        d = pd.read_csv(p)
        d.columns = [c.strip().lower().replace(" ", "_") for c in d.columns]
        # 確保有 label 欄
        if "label" not in d.columns:
            raise ValueError(f"{name} 缺少 label 欄位")
        parts.append(d[["object_id","complex_id","review_text","label"]])

add = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=["object_id","complex_id","review_text","label"])

# 合併 & 去重（以 text 去重）
merged = pd.concat([base[["object_id","complex_id","review_text","label"]], add], ignore_index=True)
merged["label"] = merged["label"].astype(str).str.strip().str.lower().replace({
    "non-apartment":"test_like","non_apartment":"test_like",
    "real-apartment-review":"real_apartment_review"
})
merged = merged.drop_duplicates(subset=["review_text"]).copy()

# 覆寫 cleaned v2
out = LABELED / "for_annotation.cleaned.v2.csv"
merged.to_csv(out, index=False)
print("✅ 已輸出新的標註全集：", out)
print(merged["label"].value_counts())


✅ 已輸出新的標註全集： /Users/tiffanytseng/Documents/ai-review-moderation-2/data/labeled/for_annotation.cleaned.v2.csv
label
real_apartment_review    136
test_like                 17
Name: count, dtype: int64


In [45]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np
import pandas as pd

ann = pd.read_csv(LABELED / "for_annotation.cleaned.v2.csv")
ann.columns = [c.strip().lower().replace(" ", "_") for c in ann.columns]

# 分組切分（確保每個 split 兩類都有）
def grouped_split(df, test_size=0.2, val_size=0.5, random_state=42, tries=100):
    for seed in range(random_state, random_state+tries):
        g1 = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
        tr_idx, tmp_idx = next(g1.split(df, df["label"], df["complex_id"].astype(str)))
        train_df, tmp_df = df.iloc[tr_idx].copy(), df.iloc[tmp_idx].copy()
        g2 = GroupShuffleSplit(n_splits=1, test_size=val_size, random_state=seed+1)
        v_idx, te_idx = next(g2.split(tmp_df, tmp_df["label"], tmp_df["complex_id"].astype(str)))
        val_df, test_df = tmp_df.iloc[v_idx].copy(), tmp_df.iloc[te_idx].copy()
        if all(set(d["label"].unique())=={"real_apartment_review","test_like"} for d in [train_df,val_df,test_df]):
            return train_df, val_df, test_df
    raise RuntimeError("無法讓每個 split 都含兩類，請增加樣本或調比例。")

train_df, val_df, test_df = grouped_split(ann)

X_train, y_train = train_df["review_text"].astype(str), train_df["label"].astype(str)
X_val,   y_val   = val_df["review_text"].astype(str),   val_df["label"].astype(str)
X_test,  y_test  = test_df["review_text"].astype(str),  test_df["label"].astype(str)

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.98, sublinear_tf=True)),
    ("clf", LogisticRegression(max_iter=2000, class_weight={"real_apartment_review":0.8,"test_like":1.2}, n_jobs=-1))
]).fit(X_train, y_train)

def eval_split(name, X, y):
    pred = pipe.predict(X)
    proba = pipe.predict_proba(X)[:, pipe.classes_.tolist().index("test_like")]
    print(f"\n=== {name} ===")
    print(classification_report(y, pred, target_names=["real_apartment_review","test_like"], zero_division=0))
    try:
        print("ROC-AUC(test_like):", round(roc_auc_score((y=="test_like").astype(int), proba), 4))
    except: pass
    return pred, proba

pred_val,  proba_val  = eval_split("VAL",  X_val,  y_val)
pred_test, proba_test = eval_split("TEST", X_test, y_test)

cm = confusion_matrix(y_test, pred_test, labels=["real_apartment_review","test_like"])
pd.DataFrame(cm, index=["real_apartment_review","test_like"], columns=["real_apartment_review","test_like"])



=== VAL ===
                       precision    recall  f1-score   support

real_apartment_review       0.95      1.00      0.98        20
            test_like       0.00      0.00      0.00         1

             accuracy                           0.95        21
            macro avg       0.48      0.50      0.49        21
         weighted avg       0.91      0.95      0.93        21

ROC-AUC(test_like): 1.0

=== TEST ===
                       precision    recall  f1-score   support

real_apartment_review       0.67      1.00      0.80        10
            test_like       0.00      0.00      0.00         5

             accuracy                           0.67        15
            macro avg       0.33      0.50      0.40        15
         weighted avg       0.44      0.67      0.53        15

ROC-AUC(test_like): 0.96


Unnamed: 0,real_apartment_review,test_like
real_apartment_review,10,0
test_like,5,0


In [46]:
from pathlib import Path
import pandas as pd, time

ROOT = Path.cwd().parent
LABELED = ROOT / "data" / "labeled"

v2 = LABELED / "for_annotation.cleaned.v2.csv"
base = LABELED / "for_annotation.cleaned.csv"
r2u  = LABELED / "round2_labeled_uncertain.csv"
r2h  = LABELED / "round2_labeled_highprob.csv"
r2d  = LABELED / "round2_labeled_dropped.csv"

def mtime(p): 
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p.stat().st_mtime)) if p.exists() else "N/A"

print("存在檢查：")
for p in [base, r2u, r2h, r2d, v2]:
    print(f"- {p.name:30s}  exists={p.exists()}  mtime={mtime(p)}")

if v2.exists():
    ann = pd.read_csv(v2)
    ann.columns = [c.strip().lower().replace(" ", "_") for c in ann.columns]
    print("\n✅ v2 筆數：", len(ann))
    print("✅ v2 標籤分佈：")
    print(ann["label"].value_counts())
else:
    print("\n⚠️ 目前沒有 v2，代表合併步驟尚未完成（或沒輸出）。")


存在檢查：
- for_annotation.cleaned.csv      exists=True  mtime=2025-10-22 15:08:56
- round2_labeled_uncertain.csv    exists=False  mtime=N/A
- round2_labeled_highprob.csv     exists=False  mtime=N/A
- round2_labeled_dropped.csv      exists=False  mtime=N/A
- for_annotation.cleaned.v2.csv   exists=True  mtime=2025-10-22 15:42:39

✅ v2 筆數： 153
✅ v2 標籤分佈：
label
real_apartment_review    136
test_like                 17
Name: count, dtype: int64


In [47]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

ann = pd.read_csv(LABELED / "for_annotation.cleaned.v2.csv")
ann.columns = [c.strip().lower().replace(" ", "_") for c in ann.columns]

def grouped_split(df, test_size=0.2, val_size=0.5, random_state=42, tries=100):
    for seed in range(random_state, random_state+tries):
        g1 = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
        tr_idx, tmp_idx = next(g1.split(df, df["label"], df["complex_id"].astype(str)))
        train_df, tmp_df = df.iloc[tr_idx].copy(), df.iloc[tmp_idx].copy()
        g2 = GroupShuffleSplit(n_splits=1, test_size=val_size, random_state=seed+1)
        v_idx, te_idx = next(g2.split(tmp_df, tmp_df["label"], tmp_df["complex_id"].astype(str)))
        val_df, test_df = tmp_df.iloc[v_idx].copy(), tmp_df.iloc[te_idx].copy()
        if all(set(d["label"].unique())=={"real_apartment_review","test_like"} for d in [train_df,val_df,test_df]):
            return train_df, val_df, test_df
    raise RuntimeError("split 無法滿足每組含兩類，請增加 test_like 或調整比例。")

train_df, val_df, test_df = grouped_split(ann)

# 檢查各 split 的 test_like 數量是否真的增加
for nm, d in [("Train",train_df),("Val",val_df),("Test",test_df)]:
    print(nm, "分佈：")
    print(d["label"].value_counts(), "\n")

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.98, sublinear_tf=True)),
    ("clf", LogisticRegression(max_iter=2000, class_weight={"real_apartment_review":0.8,"test_like":1.2}, n_jobs=-1))
]).fit(train_df["review_text"].astype(str), train_df["label"].astype(str))

def eval_split(name, X, y):
    pred = pipe.predict(X)
    print(f"\n=== {name} ===")
    print(classification_report(y, pred, target_names=["real_apartment_review","test_like"], zero_division=0))
    return pred

_ = eval_split("VAL",  val_df["review_text"].astype(str),  val_df["label"].astype(str))
_ = eval_split("TEST", test_df["review_text"].astype(str), test_df["label"].astype(str))


Train 分佈：
label
real_apartment_review    106
test_like                 11
Name: count, dtype: int64 

Val 分佈：
label
real_apartment_review    20
test_like                 1
Name: count, dtype: int64 

Test 分佈：
label
real_apartment_review    10
test_like                 5
Name: count, dtype: int64 


=== VAL ===
                       precision    recall  f1-score   support

real_apartment_review       0.95      1.00      0.98        20
            test_like       0.00      0.00      0.00         1

             accuracy                           0.95        21
            macro avg       0.48      0.50      0.49        21
         weighted avg       0.91      0.95      0.93        21


=== TEST ===
                       precision    recall  f1-score   support

real_apartment_review       0.67      1.00      0.80        10
            test_like       0.00      0.00      0.00         5

             accuracy                           0.67        15
            macro avg       0.33      0