In [1]:
# —— 乾淨重載 utils，指向專案根目錄的 utils/text_normalize.py —— 
%reload_ext autoreload
%autoreload 2

import os, sys, importlib, inspect

# 1) 指到「專案根目錄」（notebooks 的上一層）
ROOT = os.path.abspath('..')
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

# 2) 徹底清掉記憶體裡舊的 notebooks/utils 模組殘留
for k in list(sys.modules):
    if k == 'utils' or k.startswith('utils.'):
        del sys.modules[k]

# 3) 確認真正存在的檔案
print("text_normalize.py 存在？",
      os.path.exists(os.path.join(ROOT, 'utils', 'text_normalize.py')),
      "→", os.path.join(ROOT, 'utils', 'text_normalize.py'))

# 4) 重新乾淨匯入（不要用 from...import）
textnorm = importlib.import_module('utils.text_normalize')

# 5) 驗證來源
print("loaded from:", inspect.getfile(textnorm))


text_normalize.py 存在？ True → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
loaded from: /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py


In [2]:
import inspect
for name in ["normalize_text", "is_test_like", "normalize_for_exact", "normalize_for_near"]:
    f = getattr(textnorm, name)
    print(f"{name} →", inspect.getfile(f))


normalize_text → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
is_test_like → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
normalize_for_exact → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
normalize_for_near → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py


In [3]:
# 讓 Python 能找到當前路徑
import os, sys
sys.path.append(os.path.abspath('.'))

# 自動重新載入外部模組（你改 .py 後會自動生效）
%load_ext autoreload
%autoreload 2

from importlib import reload
import utils.text_normalize as textnorm
reload(textnorm)  # 保險起見，每次先 reload 一下


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'utils.text_normalize' from '/Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py'>

In [4]:
import inspect

def list_own_functions(mod):
    rows = []
    for name, obj in vars(mod).items():
        # 只列出「函式」，而且來源在這個模組本身（排除內建/外部）
        if inspect.isfunction(obj) and getattr(obj, "__module__", None) == mod.__name__:
            try:
                path = inspect.getsourcefile(obj) or inspect.getfile(obj)
            except TypeError:
                path = "<built-in>"
            rows.append((name, path))
    for name, path in sorted(rows):
        print(f"{name:25s} → {path}")

list_own_functions(textnorm)


_ratio_emoji              → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
_ratio_symbols            → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
_token_has_vowel          → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
gibberish_score_v3        → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
is_low_quality_v3         → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
is_test_like              → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
normalize_for_exact       → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
normalize_for_near        → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
normalize_text            → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py


In [5]:
%load_ext autoreload
%autoreload 2
from utils.text_normalize import (
    normalize_text,
    is_test_like,
    normalize_for_exact,
    normalize_for_near,
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
!which python

/Users/tiffanytseng/Documents/ai-review-moderation-2/.venv/bin/python


In [7]:
import os
print(os.getcwd())

/Users/tiffanytseng/Documents/ai-review-moderation-2/notebooks


In [8]:
from pathlib import Path

DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR / "raw"
OUTPUT_DIR = DATA_DIR / "processed"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# TODO: 修改為你的檔名
INPUT_CSV = RAW_DIR / "_SELECT_A_object_id_A_complex_id_A_vote_reason_id_B_reason_A_dat_202110291714.csv"

# 欄位與參數
POSSIBLE_TEXT_COLS = ["review_text", "text", "content", "comment"]
ID_COLS = ["object_id", "complex_id"]
DATE_COLS = ["date_created", "date_updated"]

# 門檻（可之後在 Step 2 調整）
MIN_WORDS = 1
MAX_WORDS = 300
MAX_PUNCT_RATIO = 0.20
MAX_NONASCII_RATIO = 0.50


In [9]:
import pandas as pd
import numpy as np
import re, string

def pick_text_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    text_like = [c for c in df.columns if "text" in c.lower() or "content" in c.lower() or "comment" in c.lower()]
    return text_like[0] if text_like else df.columns[-1]

def normalize_text(s):
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = s.lower()
    s = s.translate(str.maketrans({ch: " " for ch in string.punctuation}))
    s = re.sub(r"\s+", " ", s).strip()
    return s

def punctuation_ratio(s):
    if not isinstance(s, str): return 0.0
    if len(s) == 0: return 0.0
    p = sum(1 for ch in s if ch in string.punctuation)
    return p / len(s)

def nonascii_ratio(s):
    if not isinstance(s, str): return 0.0
    if len(s) == 0: return 0.0
    na = sum(1 for ch in s if ord(ch) > 127)
    return na / len(s)

def word_count(s):
    if not isinstance(s, str): return 0
    return len(re.findall(r"\b\w+\b", s))

# 強化測試資料規則
TEST_PATTERNS = [
    r"\bdescribe your experience\b",
    r"\bkeep your review (?:precise|to the point)\b",
    r"\bwe value open expression\b",
    r"\bno personal info\b",
    r"\bprofanity and hateful speech\b",
    r"\bupdating .* review\b",
    r"\bsubmit your review\b",
    r"\bcontact info\b",
]
compiled_test = [re.compile(p, flags=re.IGNORECASE) for p in TEST_PATTERNS]

def is_test_like(s):
    if not isinstance(s, str): return False
    s_lower = s.lower()
    for cp in compiled_test:
        if cp.search(s_lower):
            return True
    if (
        len(s_lower.split()) > 40 and
        "review" in s_lower and
        any(x in s_lower for x in ["describe your", "keep your", "we value", "no personal info"])
    ):
        return True
    return False

def gibberish_score(s):
    if not isinstance(s, str) or not s.strip():
        return 0
    words = re.findall(r"\b\w+\b", s.lower())
    if not words:
        return 0
    unique_ratio = len(set(words)) / len(words)
    long_run = (len(s.split(".")) == 1 and len(words) > 60)
    repeat_pattern = re.search(r"(\b\w+\b)(?:\s+\1){1,}", s.lower())
    score = 0
    if unique_ratio < 0.5: score += 1
    if long_run: score += 1
    if repeat_pattern: score += 1
    return score


In [10]:
df = pd.read_csv(INPUT_CSV)
print(f"Loaded rows={len(df)}, cols={len(df.columns)}")
print("\nColumns:", list(df.columns))

TEXT_COL = pick_text_col(df, POSSIBLE_TEXT_COLS)
print("Detected text column:", TEXT_COL)

# ✅ 在這裡先做 HTML 轉換再進後面計算
import html
df["_text_raw"] = df[TEXT_COL].astype(str).apply(html.unescape)

# 然後再算這三個特徵
df["_len_words"] = df["_text_raw"].map(word_count)
df["_punct_ratio"] = df["_text_raw"].map(punctuation_ratio)
df["_nonascii_ratio"] = df["_text_raw"].map(nonascii_ratio)

print("\nNull counts:")
display(df.isna().sum().to_frame("nulls"))

print("\nSample rows:")
cols = [c for c in ["object_id","complex_id","vote_reason_id","reason", TEXT_COL] if c in df.columns] 
display(df[cols].head(5))


Loaded rows=32183, cols=7

Columns: ['object_id', 'complex_id', 'vote_reason_id', 'reason', 'date_created', 'date_updated', 'review_text']
Detected text column: review_text

Null counts:


Unnamed: 0,nulls
object_id,0
complex_id,0
vote_reason_id,0
reason,0
date_created,0
date_updated,0
review_text,0
_text_raw,0
_len_words,0
_punct_ratio,0



Sample rows:


Unnamed: 0,object_id,complex_id,vote_reason_id,reason,review_text
0,112885551,972359147075013,6,"It contains threats, lewdness or hate speech",Initially I was excited about moving to Settle...
1,112795026,9199332346275186185,1,It's for the wrong community,Love my apartment and the staff here. I like t...
2,112795026,9199332346275186185,1,It's for the wrong community,Love my apartment and the staff here. I like t...
3,112889848,9199332346275166581,3,It contains false information,My experience with Century was outstanding whi...
4,112932850,9199332346275193048,3,It contains false information,Tongue flank swine shank shankle capicola jowl...


In [11]:
df["_is_test_like_preview"] = df["_text_raw"].apply(is_test_like)
df["_gibberish_score_preview"] = df["_text_raw"].apply(gibberish_score)

print("Top suspected TEST-like entries:")
display(df.loc[df["_is_test_like_preview"]].head(10)[[TEXT_COL]])

print("Top suspected gibberish (score>=2):")
display(df.loc[df["_gibberish_score_preview"]>=2].head(10)[[TEXT_COL, "_gibberish_score_preview"]])


Top suspected TEST-like entries:


Unnamed: 0,review_text
7,Help your fellow renters get the most out of y...
8,Help your fellow renters get the most out of y...
9,Help your fellow renters get the most out of y...
10,Help your fellow renters get the most out of y...
11,Help your fellow renters get the most out of y...
12,Help your fellow renters get the most out of y...
13,Help your fellow renters get the most out of y...
14,Help your fellow renters get the most out of y...
35,Updating Le Villa Review...Describe your exper...
36,Updating Le Villa Review...Describe your exper...


Top suspected gibberish (score>=2):


Unnamed: 0,review_text,_gibberish_score_preview
41,"orem ipsum dolor sit amet, consectetur adipisc...",2
42,I don&apos;t even know where to start on how g...,2
150,"To the Fellow Luminary Tenants: For starters, ...",2
156,I am a current resident at this complex and le...,2
166,To those who have decided to write negative th...,2
173,Not recommended!!!! They just try to lease out...,2
199,"To the Fellow Luminary Tenants: For starters, ...",2
200,"To the Fellow Luminary Tenants: For starters, ...",2
239,I have lived in Greystone for almost 2 years. ...,2
259,I have lived here for a year now and am moving...,2


In [12]:
# 檢查第 156 與第 173 筆的完整內容
pd.set_option('display.max_colwidth', None)
df.loc[[156, 173], ["object_id", "review_text", "_gibberish_score_preview"]]


Unnamed: 0,object_id,review_text,_gibberish_score_preview
156,112773850,"I am a current resident at this complex and let me just say, it has been the worst apartment living situation I have ever lived in. We moved in at the end of March 2019 and loved the apartment and the community. After 3 months, we began smelling marijuana in our apartment. Since this is a &quot;non smoking&quot; community, it was a bit of a shock to have our master bedroom and bathroom stinking from marijuana smell. So, I called and complained. The complex did what it always does which is to send a letter to each apartment saying not to smoke. Every month after that, we complained about the smell filling our apartment. In September of 2019 we were still having the issue and the complex would never come to actually check out the problem. Finally it got so bad that after numerous complaints, the complex decided to evict our neighbors months later. The neighbors somehow found out we complained and vandalized our vehicles. They broke my windshield on my car and also the side mirror of my partner&apos;s car. This was allowed to occur because the complex gave the tenants a week to move instead of just taking care of it right away. We had to call the police and file a report. At this point, I had had enough and went into the office to complain. The manager at the time offered no help whatsoever so I had to go to the District Manager to finally get results. After the neighbors got evicted, that apartment was empty for a month or 2 and we did not have the smoke problem. Then after getting new neighbors, we had the same problem. Our bathroom and bedroom began smelling of marijuana. We complained and got the usual response of &quot;we&apos;ll send a letter and call&quot;. One day while leaving my apartment, I found a baggie of marijuana laying on the stairs. I took a picture and emailed it to the office and their response was that they found it hard to believe that there was a bag of weed left on the steps. So now, the complex started downplaying our complaints and making us feel like we were lying about the issue. We had this issue with the smoke most of this year and NOT A SINGLE person had come to our apartment to check out the issue until we finally just went over the property&apos;s head to the District Manager yet again. The new DM finally pushed for our apartment to get looked at 1.5 years later. The construction guy for our property checked out our apartment and the smoking neighbor&apos;s apartment and acknowledged that the marijuana was from the neighbors. He sealed around the pipes which was a 5 minute fix and it has actually helped now. FIVE MINUTES!!!!! That is all the time tit took to fix the issue after 1.5 years. This could have been taken care of months and months ago if only this complex cared about it&apos;s tenants to actually come inspect the problem. So, I stated I wanted compensation for the year and a half of neglect, the copious amounts of cleaners and air fresheners we bought, the electricity and water used to constantly re-wash our bedding to get rid of the smell, and for the overall horrible experience. They offered us $500. $500 for 1.5 years of nonsense. What a slap in the face. They sent people to shampoo our carpets and one of the guys entered into our apartment without wearing a mask during this time of Covid-19 which potentially puts us at risk for infection. I have so many pictures of trash being strewn about, pictures of people smoking in this community, and pictures of how disgusting the breezeway gets from the valet trash guys. I would never recommend this community to anyone until they enforce the non-smoking policy in the lease and get people in the office who actually try to fix the problems of their tenants instead of making them feel insignificant.",2
173,112929672,"Not recommended!!!! They just try to lease out the apartments WITHOUT any help or service, after you signed the lease, they just try to away from you!!! Since we already signed the lease, they just like &quot;oh , you signed can&apos;t change anything&quot;. It just like making you gotta pay with no way! I have flied my compliment to BBB and building department. They are NOT helping you AT ALL. This kind of service is bad. I signed the lease on 04/19/2021, before I signing the lease, they are not clafiy about charge on the move-in date, which made us got confused that the there is no fee on the movin-in date. End up, they send a bill of $634.84 of it ( attachment). If we know there is a charge, we won&apos;t move in so fast since our currently lease is NOT end yet! I&apos;ve been lived in Long Island City for more than 4 years, I lived couple buildings around the area, which they will give one to two FREE days for moving in. But Hayden don&apos;t have this deal and both agents are not telling us before it. I tried to talked with their agent, to ask if I can get one day free to move in or I can pay for one extra day for moving, but not extra week, and I got refused by the reason of the lease had been finalized.That is super confusion part! We changed the move in day earlier AFTER we signed the lease,they allowed it, they are happy that we are paying money with we can move in earlier, so they allow us to changed without saying there is a fee. But we didn&apos;t know it has a fee on it. After we know it, we tried to changed the move-in date, they are not allowed us to do it. I don&apos;t understand if they are has race on us because we are Asian or what kind of reason. There is many Asian live in LIC!! In the station of Covid-19, no one has extra money to offer, we tried to ask for their help to change for us, but they keep refused.",2


In [13]:
# === 五種調整一次到位：gibberish_score_v2 + is_low_quality_v2 (refined) ===
import re

# 參數（可再微調）
MAX_PUNCT_RATIO = 0.30   # 提高標點容忍度
LONG_RUN_WORDS  = 120    # 無句號長句的門檻（用新判句規則）
NONASCII_THR    = 0.50

# 情緒/關鍵字（負評 -> 降分；短句保留詞）
NEGATIVE_KEYWORDS = {
    "bad","terrible","awful","horrible","hate","angry","disgusting",
    "mold","roach","roaches","rat","rats","leak","leaking","broken",
    "maintenance","rude","unsafe","noisy","dirty","refund","scam",
    "avoid","complaint","nightmare","fraud","harassment","threat",
    "bedbug","bedbugs","infestation"
}
SHORT_ALLOWED = {
    # 2~4 詞短句若含這些字，傾向保留（正負面都列一些常見詞）
    "bad","terrible","awful","horrible","love","hate","great","good","nice",
    "clean","dirty","rude","scam","avoid","unsafe","noisy","broken","refund",
    "recommend","amazing","awful","disgusting","bedbug","bedbugs","infestation"
}

def gibberish_score_v2(s: str) -> int:
    """
    - unique_ratio 極低才加分
    - 無句號/驚嘆/問號的超長單段才加分（用 [.!?] 判句）
    - 連續重複詞加分
    - 若含負評關鍵字，減 1 分（最低 0）
    """
    if not isinstance(s, str) or not s.strip():
        return 0

    s_lower = s.lower()
    words = re.findall(r"\b\w+\b", s_lower)
    if not words:
        return 0

    # 1) 詞彙多樣性極低才視為可疑
    unique_ratio = len(set(words)) / len(words)
    uniq_flag = (unique_ratio < 0.35)

    # 2) 用 [.!?] 判句；完全沒有句末符且字數超長
    has_sentence_end = bool(re.search(r"[.!?]", s))
    long_run = (not has_sentence_end) and (len(words) > LONG_RUN_WORDS)

    # 3) 連續重複詞
    repeat_pattern = bool(re.search(r"(\b\w+\b)(?:\s+\1){1,}", s_lower))

    score = int(uniq_flag) + int(long_run) + int(repeat_pattern)

    # 4) 負評關鍵字 → 降分（偏真實抱怨）
    if any(kw in s_lower for kw in NEGATIVE_KEYWORDS):
        score = max(0, score - 1)

    return score

def is_low_quality_v2(row) -> bool:
    """
    最終規則：
    - 只擋 1 詞以下（wc < 2）；不設上限（極端>10000 仍擋）
    - 標點/非ASCII 比例過高仍擋
    - 情緒保護：字數>=10 且標點<0.35 且含 ! 或 ? → 保留
    - gibberish_score_v2 >= 3 才當低品質
    - 2~4 詞短句若含 SHORT_ALLOWED 任一詞 → 保留
    """
    s  = row["_text_raw"]
    wc = row["_len_words"]
    pr = row["_punct_ratio"]
    nr = row["_nonascii_ratio"]

    # 0) 超短直接擋：僅 0 或 1 詞（如 "ok", "good"）
    if wc < 2:
        return True

    # 短句白名單：2~4 詞且含常見評價/情緒詞 → 保留
    if 2 <= wc <= 4:
        words = set(re.findall(r"\b\w+\b", (s or "").lower()))
        if not words.isdisjoint(SHORT_ALLOWED):
            return False  # 直接視為可用短評

    # 1) 極端超長文本（通常為貼整段文章/合約/網頁）
    if wc > 10000:
        return True

    # 2) 符號與亂碼比例
    if pr > MAX_PUNCT_RATIO:
        return True
    if nr > NONASCII_THR:
        return True

    # 3) 情緒保護：長句 + 合理標點 + 有情緒標點 → 多半是真實評論
    if wc >= 10 and pr < 0.35 and re.search(r"[!?]", s or ""):
        return False

    # 4) 語意亂度
    gib = gibberish_score_v2(s)
    return gib >= 3

# 重新計算新版分數與標籤
df["_gibberish_score_v2"] = df["_text_raw"].apply(gibberish_score_v2)
df["_is_low_quality_v2"]  = df.apply(is_low_quality_v2, axis=1)

# 統計與對比（可視需要保留/移除）
n = len(df)
old_hits = int((df.get("_gibberish_score_preview", 0) >= 2).sum()) if "_gibberish_score_preview" in df.columns else None
new_hits = int(df["_is_low_quality_v2"].sum())

print(f"Total rows: {n}")
if old_hits is not None:
    print(f"OLD suspected (gib>=2): {old_hits} ({old_hits/n:.2%})")
print(f"NEW low_quality_v2 hits: {new_hits} ({new_hits/n:.2%})")

# 檢視前幾筆（不截斷）
import pandas as pd
pd.set_option('display.max_colwidth', None)
display(df.loc[df["_is_low_quality_v2"], ["object_id", "review_text", "_gibberish_score_v2"]].head(10))


Total rows: 32183
OLD suspected (gib>=2): 2068 (6.43%)
NEW low_quality_v2 hits: 101 (0.31%)


Unnamed: 0,object_id,review_text,_gibberish_score_v2
174,112929642,BbbbbbbbbbbbbbbbbbbbbbbbbbnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnMmmmmmmmmkmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmkkmmmmmmmmmmkmkkmmm,0
821,112904373,........................................................................................................................................................................................................,0
980,112897087,Jdhdhdhdhdbdjdbdhdjdbdhdhudududhdhdududhdhdudujddbdjdieiehdhdjdbdjhdhdududhdhdhdhududucjcndjsisoiwhebdjdhdjuddhdidhuehebxjkchdbejdudbdjdbdhdjjdhdhdbdjjdjdhdhcudjdjejdbdbjdjdjeidjdjjdjdjdjdjdbdhdjdbjdjdjd,0
2943,112711184,CxxxxxxxxxxxxxxxxxxxxxxxxxxxxixifgighSkdjdjjdjdhddhhdhxhdhxjslsjdbdjdjdjdnvsbssbbdbsbdbdbsbxbxbxbxbxbxbbxbxbxbxbxxbxbxbxbxnnzndnxdhdhvxbxbxjckdbdbnxnxbxbxnxbbxbxbxnxnckjcnxnxnxbxjjxnxnxnxbxbbxbxbxbxbxbxnznnz,0
3449,112133787,************************************************** ************************************************************************************************************************************************************,0
3450,112133787,************************************************** ************************************************************************************************************************************************************,0
3923,112778499,oiejwiofajeiowjfoiewjaoifjweoiajfewoiajoifejoiwajjjjjjjjjfioasjeoijwoifjoiwejoijfoiwjeoifjoiawjiojfoiwejfoiwjfoiejoifejiofejwoifjiowejfiowejfiojwoifjoiwjoiwjeoifjoiwfejoiefwjoiwefjoiwjfiojaiojewfoijaoiejfoi,0
3988,112755207,My experience with Briana Shaw was really great!! I got approved for my apartment in no time and moved in in less the a week! Would recommend ????????????????????????????????????????????????????????????,1
5477,112456987,Describe my experience at The Vineyards at Hammock Ridge Apartments. I rather not say.......................................................................................................................,0
5835,112711744,fhhfiueprfhrfhuirepihrhuhfurhfrhuifhufrhurfhurfhufrhufhurHbuicebucuhedhiueduhuoejidejiowjfsojdjeidjjiodjiojuihiugtfhjdjdksowowpskskdkdkdkdkdksowososkdmdmdkskwksjdjdjskdkdmdnejwkwowowkwkwkdjdmejwjwjwjwkekwkwk,0


In [14]:
def is_low_quality_v4(row) -> bool:
    """
    版本 v4：移除標點符號比檢查，完全不依照標點比例判斷。
    """
    s  = row["_text_raw"]
    wc = row["_len_words"]
    nr = row["_nonascii_ratio"]   # 保留非 ASCII 比例檢查
    gib = gibberish_score_v2(s)   # 或 gibberish_score_v3，看你目前使用哪一個版本

    # --- 規則 ---
    if wc < 2:       # 太短（0 或 1 詞）
        return True
    if wc > 10000:   # 太長（整段文章）
        return True
    if nr > 0.50:    # 非 ASCII 比例過高
        return True

    # 語意亂度（Gibberish）判斷：需 ≥3 才視為低品質
    return gib >= 3


In [15]:
print("Current MIN_WORDS =", MIN_WORDS)


Current MIN_WORDS = 1


In [16]:
# 重新套用新規則
df["_is_low_quality_v4"] = df.apply(is_low_quality_v4, axis=1)

# 檢視統計
total = len(df)
lowq = df["_is_low_quality_v4"].sum()
print(f"Total rows: {total}")
print(f"Low-quality (v4): {lowq} ({lowq/total:.2%})")

# 檢視樣本
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
display(df.loc[df["_is_low_quality_v4"], ["object_id", "review_text", "_gibberish_score_v2", "_len_words", "_nonascii_ratio"]])



Total rows: 32183
Low-quality (v4): 70 (0.22%)


Unnamed: 0,object_id,review_text,_gibberish_score_v2,_len_words,_nonascii_ratio
174,112929642,BbbbbbbbbbbbbbbbbbbbbbbbbbnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnMmmmmmmmmkmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmkkmmmmmmmmmmkmkkmmm,0,1,0.0
821,112904373,........................................................................................................................................................................................................,0,0,0.0
980,112897087,Jdhdhdhdhdbdjdbdhdjdbdhdhudududhdhdududhdhdudujddbdjdieiehdhdjdbdjhdhdududhdhdhdhududucjcndjsisoiwhebdjdhdjuddhdidhuehebxjkchdbejdudbdjdbdhdjjdhdhdbdjjdjdhdhcudjdjejdbdbjdjdjeidjdjjdjdjdjdjdbdhdjdbjdjdjd,0,1,0.0
2943,112711184,CxxxxxxxxxxxxxxxxxxxxxxxxxxxxixifgighSkdjdjjdjdhddhhdhxhdhxjslsjdbdjdjdjdnvsbssbbdbsbdbdbsbxbxbxbxbxbxbbxbxbxbxbxxbxbxbxbxnnzndnxdhdhvxbxbxjckdbdbnxnxbxbxnxbbxbxbxnxnckjcnxnxnxbxjjxnxnxnxbxbbxbxbxbxbxbxnznnz,0,1,0.0
3449,112133787,************************************************** ************************************************************************************************************************************************************,0,0,0.0
3450,112133787,************************************************** ************************************************************************************************************************************************************,0,0,0.0
3923,112778499,oiejwiofajeiowjfoiewjaoifjweoiajfewoiajoifejoiwajjjjjjjjjfioasjeoijwoifjoiwejoijfoiwjeoifjoiawjiojfoiwejfoiwjfoiejoifejiofejwoifjiowejfiowejfiojwoifjoiwjoiwjeoifjoiwfejoiefwjoiwefjoiwjfiojaiojewfoijaoiejfoi,0,1,0.0
5835,112711744,fhhfiueprfhrfhuirepihrhuhfurhfrhuifhufrhurfhurfhufrhufhurHbuicebucuhedhiueduhuoejidejiowjfsojdjeidjjiodjiojuihiugtfhjdjdksowowpskskdkdkdkdkdksowososkdmdmdkskwksjdjdjskdkdmdnejwkwowowkwkwkdjdmejwjwjwjwkekwkwk,0,1,0.0
5836,112711744,fhhfiueprfhrfhuirepihrhuhfurhfrhuifhufrhurfhurfhufrhufhurHbuicebucuhedhiueduhuoejidejiowjfsojdjeidjjiodjiojuihiugtfhjdjdksowowpskskdkdkdkdkdksowososkdmdmdkskwksjdjdjskdkdmdnejwkwowowkwkwkdjdmejwjwjwjwkekwkwk,0,1,0.0
5837,112711744,fhhfiueprfhrfhuirepihrhuhfurhfrhuifhufrhurfhurfhufrhufhurHbuicebucuhedhiueduhuoejidejiowjfsojdjeidjjiodjiojuihiugtfhjdjdksowowpskskdkdkdkdkdksowososkdmdmdkskwksjdjdjskdkdmdnejwkwowowkwkwkdjdmejwjwjwjwkekwkwk,0,1,0.0


In [17]:
# 先選乾淨候選：每個 object_id 只留一筆（最早/最晚都可，這裡取最早）
# 你也可以先排除 _is_test_like / _is_low_quality_v* 再做重複，視需求決定
cand = (
    df.sort_values(["object_id","date_created"], ascending=[True, True])
      .drop_duplicates(subset="object_id", keep="first")
      .loc[:, ["object_id", "complex_id", "review_text"]]
      .copy()
)

# 標準化：大小寫/HTML/標點/空白
import re, html, string

def normalize_for_exact(s: str) -> str:
    if not isinstance(s, str): 
        return ""
    s = html.unescape(s)
    s = s.casefold()
    s = s.replace("\u200b", "")              # 零寬字
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_for_near(s: str) -> str:
    if not isinstance(s, str): 
        return ""
    s = html.unescape(s)
    s = s.casefold()
    s = re.sub(r"\s+", " ", s)
    # 去掉大部分標點（保留句號/驚嘆/問號對語氣不重要時也可移除）
    s = s.translate(str.maketrans("", "", string.punctuation))
    s = re.sub(r"\s+", " ", s).strip()
    return s

cand["_norm_exact"] = cand["review_text"].map(normalize_for_exact)
cand["_norm_near"]  = cand["review_text"].map(normalize_for_near)

print("候選評論數（以 object_id 去重）:", len(cand))


候選評論數（以 object_id 去重）: 19093


In [18]:
# === FINAL DEDUP PIPELINE (exact + near), ignore multi-flags for same object_id ===
import re, html, string, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# ---------- 0) 準備：以 object_id 聚合，避免把同一評論的多次檢舉當成重複 ----------
# 取每個 object_id 的「代表評論」（可選最早/最晚；這裡取最早）
cand = (
    df.sort_values(["object_id","date_created"], ascending=[True, True])
      .drop_duplicates(subset="object_id", keep="first")
      .loc[:, ["object_id", "complex_id", "review_text"]]
      .copy()
)

# ---------- 1) 標準化工具 ----------
def normalize_for_exact(s: str) -> str:
    if not isinstance(s, str): return ""
    s = html.unescape(s)
    s = s.casefold()
    s = s.replace("\u200b", "")        # 去掉零寬字
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_for_near(s: str) -> str:
    if not isinstance(s, str): return ""
    s = html.unescape(s)
    s = s.casefold()
    s = re.sub(r"\s+", " ", s)
    # 去標點（讓小差異不影響）
    s = s.translate(str.maketrans("", "", string.punctuation))
    s = re.sub(r"\s+", " ", s).strip()
    return s

cand["_norm_exact"] = cand["review_text"].map(normalize_for_exact)
cand["_norm_near"]  = cand["review_text"].map(normalize_for_near)

# ---------- 2) 嚴格重複（Exact dup, 跨 object_id） ----------
# 將每個 _norm_exact 對應到「唯一 object_id 集合」（用 set() 避免同一評論多列）
grp_exact = (
    cand.groupby("_norm_exact")["object_id"]
        .agg(lambda x: sorted(set(x)))
        .reset_index()
)
# 只挑「跨 object_id」的鍵（集合大小 >1）
dup_exact_keys = set(grp_exact.loc[grp_exact["object_id"].map(len) > 1, "_norm_exact"])

# 標記 exact 重複：對於同鍵的 object_id 列表，第一個當「代表」，其餘標為重複
cand["_is_dup_exact"] = False
cand["_dup_rep_object"] = pd.NA  # 代表 object_id
for _, row in grp_exact.loc[grp_exact["_norm_exact"].isin(dup_exact_keys)].iterrows():
    objs = row["object_id"]
    if len(objs) > 1:
        rep = objs[0]
        cand.loc[cand["object_id"] == rep, "_dup_rep_object"] = rep
        cand.loc[cand["object_id"].isin(objs[1:]), "_is_dup_exact"] = True

# 建 group id（只有 exact dup 的群組）
cand["_dup_group_id"] = pd.NA
if dup_exact_keys:
    m = cand["_norm_exact"].isin(dup_exact_keys)
    cand.loc[m, "_dup_group_id"] = pd.factorize(cand.loc[m, "_norm_exact"])[0]

# ---------- 3) 近似重複（Near dup, 跨 object_id） ----------
# 只對「未被 exact 判重」的候選做近似搜尋
cand2 = cand.loc[~cand["_is_dup_exact"]].copy().reset_index(drop=True)

# 若資料量很大，可調整 ngram / max_features
vec = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2, max_features=200000)
X = vec.fit_transform(cand2["_norm_near"])

nn = NearestNeighbors(metric="cosine", algorithm="brute")
nn.fit(X)

k = 6            # 每筆取前 k 個近鄰
SIM_THR = 0.92   # 相似度門檻（0.90~0.95 可調）
distances, indices = nn.kneighbors(X, n_neighbors=k, return_distance=True)

is_near_dup = np.zeros(len(cand2), dtype=bool)
near_pairs = []  # (a_oid, b_oid, sim)

# 注意：僅在「不同 object_id」之間標記近似重複
for i in range(len(cand2)):
    a_oid = cand2.loc[i, "object_id"]
    for j_idx, d in zip(indices[i], distances[i]):
        if i == j_idx: 
            continue
        b_oid = cand2.loc[j_idx, "object_id"]
        if a_oid == b_oid:
            continue
        sim = 1 - d
        if sim >= SIM_THR:
            is_near_dup[i] = True
            near_pairs.append((a_oid, b_oid, sim))

cand2["_is_dup_near"] = is_near_dup

# 回填近似結果回 cand，再帶回 df
cand = cand.merge(cand2[["object_id","_is_dup_near"]], on="object_id", how="left")
cand["_is_dup_near"] = cand["_is_dup_near"].fillna(False)

# ---------- 4) 回填回原 df（依 object_id 對應），同一評論的多次檢舉會繼承標記 ----------
df = df.merge(cand[["object_id","_is_dup_exact","_is_dup_near","_dup_group_id"]], on="object_id", how="left")
df["_is_dup_exact"] = df["_is_dup_exact"].fillna(False)
df["_is_dup_near"]  = df["_is_dup_near"].fillna(False)
df["_is_duplicate"] = df["_is_dup_exact"] | df["_is_dup_near"]

# ---------- 5) 總結與幾個示例 ----------
n_all = len(df)
n_exact = int(df["_is_dup_exact"].sum())
n_near  = int(df["_is_dup_near"].sum())
n_any   = int(df["_is_duplicate"].sum())

print(f"Exact duplicates (跨 object_id): {n_exact}")
print(f"Near  duplicates (跨 object_id): {n_near}")
print(f"Any   duplicates:               {n_any} / {n_all} ({n_any/n_all:.2%})")

pd.set_option('display.max_colwidth', None)
print("\n[Examples] Exact duplicate rows:")
display(df.loc[df["_is_dup_exact"], ["object_id","complex_id","review_text"]].head(10))

print("\n[Examples] Near duplicate rows:")
display(df.loc[df["_is_dup_near"], ["object_id","complex_id","review_text"]].head(10))


Exact duplicates (跨 object_id): 16
Near  duplicates (跨 object_id): 97
Any   duplicates:               113 / 32183 (0.35%)

[Examples] Exact duplicate rows:


  cand["_is_dup_near"] = cand["_is_dup_near"].fillna(False)


Unnamed: 0,object_id,complex_id,review_text
306,976982,615356222037221,... oh let&#39;s see...\nI love taking a walk through the serene landscape while happening upon building 8 facing the main drive and get the honor and viewing of 5 or 6 African-Americans&#44; a dog&#44; and two children on the balcony&#44; sharing their musings very loudly&#44; all the whilst sipping their obvious bottle of Hennessy. Occasionally you can see the children playing with the geese&#44; hoping that they&#39;ll fetch the large stick they hurl towards them. Not only do I get the honor of seeing and hearing this comradery on a weekend afternoon or evening&#44; but if I&#39;m lucky enough I get to see and hear it on a weekend morning!!!!\n\nAnd let me not forget the emaciated looking man in front of building six that sits (and sometimes) sleeps in his truck. Occasionally you can see him sitting in there talking to himself&#44; but if he gets the urge to enjoy the landscape he can be seen taking a smoke outside of his truck having a wonderful conversation with no other than himself!\n\nSigh... there&#39;s no place like home....
4067,112698525,513831889945150,"This community was an extremely beautiful, well kept and peaceful community. Everything was so beautiful that I felt like I was at a resort. The office staff was extremely accommodating and friendly also. It could not be better located and was amidst a lot of shopping, restaurants, transportation and schools. It offered amenities that most communities did not."
4178,112698525,513831889945150,"This community was an extremely beautiful, well kept and peaceful community. Everything was so beautiful that I felt like I was at a resort. The office staff was extremely accommodating and friendly also. It could not be better located and was amidst a lot of shopping, restaurants, transportation and schools. It offered amenities that most communities did not."
10319,111800243,508872170001701,Great place to live.
10454,111800243,508872170001701,Great place to live.
13775,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."
13776,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."
13777,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."
13780,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."
13781,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."



[Examples] Near duplicate rows:


Unnamed: 0,object_id,complex_id,review_text
74,112932847,5053324300871231,"elp your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don&apos;t allow reviews with names, phone numbers, email or other contact info, so please avoid using it."
75,112932847,5053324300871231,"elp your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don&apos;t allow reviews with names, phone numbers, email or other contact info, so please avoid using it."
128,112664492,9199332346275143753,"Hello, this is my 1st time posting a review here. It is such a good experience to live here. Everything goooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooood"
648,112910664,336992222827284,mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm\nmmmmm\nmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm mmmmmmmmmokkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm
891,112744705,9199332346275193048,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. Add Photos A picture tells a thousand words, or so they say. No Personal Info We don&apos;t allow reviews with names, phone numbers, email or other contact info, so please avoid using it."
1249,112111945,301470347420707,Good there should not be a minimum\nAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaas
1674,112817629,248471362548335,Reasonably good attention to numerous maintenance requests. Was able to get covered parking when requested. I do not have more feedback. nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
1880,112832233,9199332346275185419,Consistently great response and results. To many characters required. So here goes...xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx. This should be a comment not a novel.
2083,112838393,9199332346275159328,"IVE Living markets affordable living in Fairfield County, offering discounts to public servants such as teachers and police. Don&apos;t be fooled. They make up fake charges to profit off of your security deposit and fail to properly maintain their property. They stole hundreds of dollars of my money and made charges with no viable explanation or evidence. Outdoors, there were rats in the garbage, mold and grime covering the building; inside pictures were broken and falling off the wall, hallways were not cleaned, and the laundry facilities were not maintained and not able to use. This doesn&apos;t include that in the years living there, I dealt with bed bugs, a ceiling that fell in the bathroom, and mice. And there&apos;s no one to ask for help except for &quot;customer service.&quot; You&apos;ll never get a real response from a real person when there&apos;s an issue. There are multiple, similar bad reviews for a reason - avoid IVE at all costs."
2084,112838392,9199332346275177597,"DO NOT MOVE IN! IVE Living markets affordable living in Fairfield County, offering discounts to public servants such as teachers and police. Don&apos;t be fooled. They make up fake charges to profit off of your security deposit and fail to properly maintain their property. They stole hundreds of dollars of my money and made charges with no viable explanation or evidence. Outdoors, there were rats in the garbage, mold and grime covering the building; inside pictures were broken and falling off the wall, hallways were not cleaned, and the laundry facilities were not maintained and not able to use. This doesn&apos;t include that in the years living there, I dealt with bed bugs, a ceiling that fell in the bathroom, and mice. And there&apos;s no one to ask for help except for &quot;customer service.&quot; You&apos;ll never get a real response from a real person when there&apos;s an issue. There are multiple, similar bad reviews for a reason - avoid IVE at all costs."


In [19]:
import pandas as pd
import html, re

# 標準化文字：去大小寫、HTML符號、空白
def normalize_text(s: str) -> str:
    if not isinstance(s, str): return ""
    s = html.unescape(s)
    s = s.strip().casefold()
    s = re.sub(r"\s+", " ", s)
    return s

df["_norm_text"] = df["review_text"].map(normalize_text)

# 1️⃣ 建立一個表：每個 _norm_text 對應哪些 object_id
grp = (
    df.groupby("_norm_text")["object_id"]
      .agg(lambda x: sorted(set(x)))
      .reset_index()
)

# 2️⃣ 找出「同樣 review_text 但 object_id 不同」的 group
dup_texts = set(grp.loc[grp["object_id"].map(len) > 1, "_norm_text"])

# 3️⃣ 標記為重複
df["_is_duplicate_text"] = df["_norm_text"].isin(dup_texts)

# 4️⃣ 同時確保同 object_id 多筆不算（可選，但保險）
df["_is_duplicate_text"] = df.groupby("object_id")["_is_duplicate_text"].transform("max")

# 統計
print(f"文字完全相同、但 object_id 不同的重複評論數：{df['_is_duplicate_text'].sum()} / {len(df)}")

# 檢視範例
pd.set_option('display.max_colwidth', None)
dups = df.loc[df["_is_duplicate_text"], ["object_id","complex_id","review_text"]]
display(dups.head(20))


文字完全相同、但 object_id 不同的重複評論數：30 / 32183


Unnamed: 0,object_id,complex_id,review_text
305,976815,615356222037221,... oh let&#39;s see...\nI love taking a walk through the serene landscape while happening upon building 8 facing the main drive and get the honor and viewing of 5 or 6 African-Americans&#44; a dog&#44; and two children on the balcony&#44; sharing their musings very loudly&#44; all the whilst sipping their obvious bottle of Hennessy. Occasionally you can see the children playing with the geese&#44; hoping that they&#39;ll fetch the large stick they hurl towards them. Not only do I get the honor of seeing and hearing this comradery on a weekend afternoon or evening&#44; but if I&#39;m lucky enough I get to see and hear it on a weekend morning!!!!\n\nAnd let me not forget the emaciated looking man in front of building six that sits (and sometimes) sleeps in his truck. Occasionally you can see him sitting in there talking to himself&#44; but if he gets the urge to enjoy the landscape he can be seen taking a smoke outside of his truck having a wonderful conversation with no other than himself!\n\nSigh... there&#39;s no place like home....
306,976982,615356222037221,... oh let&#39;s see...\nI love taking a walk through the serene landscape while happening upon building 8 facing the main drive and get the honor and viewing of 5 or 6 African-Americans&#44; a dog&#44; and two children on the balcony&#44; sharing their musings very loudly&#44; all the whilst sipping their obvious bottle of Hennessy. Occasionally you can see the children playing with the geese&#44; hoping that they&#39;ll fetch the large stick they hurl towards them. Not only do I get the honor of seeing and hearing this comradery on a weekend afternoon or evening&#44; but if I&#39;m lucky enough I get to see and hear it on a weekend morning!!!!\n\nAnd let me not forget the emaciated looking man in front of building six that sits (and sometimes) sleeps in his truck. Occasionally you can see him sitting in there talking to himself&#44; but if he gets the urge to enjoy the landscape he can be seen taking a smoke outside of his truck having a wonderful conversation with no other than himself!\n\nSigh... there&#39;s no place like home....
4067,112698525,513831889945150,"This community was an extremely beautiful, well kept and peaceful community. Everything was so beautiful that I felt like I was at a resort. The office staff was extremely accommodating and friendly also. It could not be better located and was amidst a lot of shopping, restaurants, transportation and schools. It offered amenities that most communities did not."
4178,112698525,513831889945150,"This community was an extremely beautiful, well kept and peaceful community. Everything was so beautiful that I felt like I was at a resort. The office staff was extremely accommodating and friendly also. It could not be better located and was amidst a lot of shopping, restaurants, transportation and schools. It offered amenities that most communities did not."
7144,112239510,630357470060532,"This community was an extremely beautiful, well kept and peaceful community. Everything was so beautiful that I felt like I was at a resort. The office staff was extremely accommodating and friendly also. It could not be better located and was amidst a lot of shopping, restaurants, transportation and schools. It offered amenities that most communities did not."
10319,111800243,508872170001701,Great place to live.
10454,111800243,508872170001701,Great place to live.
13775,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."
13776,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."
13777,112429689,713941801577034,"Renter&apos;s, Employee&apos;s, Contractor&apos;s BEWARE of Eureka&apos;s Business Practices As a business owner, and renter myself I felt compelled to share the disturbing things I&apos;ve experienced and uncovered concerning this company and owner. Here is the link to just a few of the news articles I found detailing multiple code violations, failed HUD inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. Rene Campos Jr., President- 46, lives in a $1 million-dollar home in University Park. He owns a $4 million-dollar vacation property Hawaii. He also plays polo, rides a custom motorcycle, and drives a Land Rover. Yet is referred to as a Slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. On top of the failed inspections, code violations, Rene was also fined for failing to file 3 CONCECUTIVE YEARS of audited financial statements on time blaming &quot;inattentiveness&quot; by his accountants. To consider this companies &quot;Code of Ethics&quot; to be lacking would be modest. The multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. Multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. However, Eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring A License Contractor and Permits pulled by the city to approve the work prior to/ in progress/ upon completion. Due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. That&apos;s if the contractor doesn&apos;t walk of the job for non-payment. Rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (Reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to Hurricane Harvey they are seeking funds for.) I was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. The high turnover could be in part to another unsafe attempt on expenses. I think this is what I found to be the most disturbing of all. Eureka has a 30-day probation period for new hires giving them the status of &quot;Contract Labor&quot;. Common practice in Multi-Family Housing right?? WRONG!! &quot;Contract Employee&quot; status DOES NOT REQUIRE A CRIMINAL BACKGROUND CHECK!! Meaning your friendly maintenance team may or may not be a convicted felon. Eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying &quot;Contract Employee&quot; straight-time for anything exceeding 80 hours avoiding costly overtime expense. Bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain &quot;safe, decent, sanitary living conditions&quot;, and continued failure to file required financial statements on time each year. It&apos;s no surprise Eureka has come under relentless attacks by critics alleging fraud when it&apos;s clear they continue turning a profit by short-changing residents, employees, contractors, etc. The facts are in black in white! Think twice before you engage with this owner or company."


In [20]:
dup_groups = (
    df.loc[df["_is_duplicate_text"]]
      .groupby("_norm_text")["object_id"]
      .apply(list)
      .reset_index(name="object_id_list")
)

print("重複群組數量：", len(dup_groups))
display(dup_groups.head(10))


重複群組數量： 8


Unnamed: 0,_norm_text,object_id_list
0,"... oh let's see... i love taking a walk through the serene landscape while happening upon building 8 facing the main drive and get the honor and viewing of 5 or 6 african-americans, a dog, and two children on the balcony, sharing their musings very loudly, all the whilst sipping their obvious bottle of hennessy. occasionally you can see the children playing with the geese, hoping that they'll fetch the large stick they hurl towards them. not only do i get the honor of seeing and hearing this comradery on a weekend afternoon or evening, but if i'm lucky enough i get to see and hear it on a weekend morning!!!! and let me not forget the emaciated looking man in front of building six that sits (and sometimes) sleeps in his truck. occasionally you can see him sitting in there talking to himself, but if he gets the urge to enjoy the landscape he can be seen taking a smoke outside of his truck having a wonderful conversation with no other than himself! sigh... there's no place like home....","[976815, 976982]"
1,.........................................................................................................................................................................................................,"[112197720, 112197757]"
2,great place to live.,"[111800243, 111800243, 1552359, 1552359, 112079128]"
3,"i and my family currently live in this community. in my opinion this is the best community to live in pompano beach. people are happy and at peace, has no problem because we always have high quality timely service, the system of garbage collection and compaction is seamless and efficient. the security in public areas is equal to the best places in the world during the day and night. we have an excellent car collection system for offenders in the parking area which allows us to monitor unauthorized to park in the common area strangers. my family and i highly recommend this community to visit and live in the heart of pompano beach. the office staff is excellent, friendly and respectful, and very professional, along with the property manager. thanks to the staff for their high quality of service and dedication.","[111738863, 111443707]"
4,i moved into the boulevard view in june 2014 when i began my residency at university hospitals. ideal location. walking distance to hospital and cedar fairmount shopping district. cannot beat the underground garage parking which was included in the rent. free wifi saves me approximately $75.00 per month. older building with updated kitchens and bathrooms. new electrical wiring is very important so there is plenty of capacity for modern appliances and devices without popping fuses. in suite washer and dryer saves time and cost savings. great apartment and location for doctors doing their training and trying to save money to repay student loans.,"[111962459, 111966147, 111966147]"
5,"never ever recommended !!!!!!! there is no option for a negative rating, had to give one. if you want to be completely miserable and absolutely hate where you live, deal with miscommunication, and live in a n---- camp, this is the place for you! i rather sleep in a homeless hiv tuberculosis ridden dump than do this again. this is not the place you get for what you are paying. we signed up for 6 month lease and vacated the place. after vacating, we were surprised to see huge fines for which they give blabber reasons. they don't mention few things at the time of inspection which is done at the time of vacating the place even though every thing looks great and clean. they come up with some damage and charge you for that. we were hugely charged for roach treatment and carpet replacement. there are no proofs for any of those. in the first place, the treatment cost wouldn't have been huge. if asked for invoice, they charge more for producing invoice. charging for carpet replacement is something which blown our mind. they charged for carpet replacement as 9 month lease. even, there was nothing mentioned at the time of inspection regarding that. as far as maintenance issues considered, it takes lots of weeks to get solved which is just pain in the ass. leasing consultants doesn't respond in helpful manner. when ever something comes up they are not ready to take it on their side or get it solved instead of that arguing takes place and simply say we are not responsible or contact some xyz. we felt like we were helpless in those situations and hands were tied. generally, people bear fines if things got worse or that would have been done by them. no one will be hesitating to clear the fines for that type of things. even though charges are made when there is no resident's fault, that doesn't look fair. it's managements responsibility to take care of things for next new resident instead of blindly charging the existing resident. the overall experience was a nightmare and would suggest others to be cautious before you take any move to step in or else ""get set ready for some huge fines"". there are no pros as far as i experienced. cons: management is very lazy, issues doesn't get solve, huge fines","[112453054, 112452966, 112452966, 112452966, 112452966]"
6,"renter's, employee's, contractor's beware of eureka's business practices as a business owner, and renter myself i felt compelled to share the disturbing things i've experienced and uncovered concerning this company and owner. here is the link to just a few of the news articles i found detailing multiple code violations, failed hud inspections, and numerous lawsuits alleging dangerous living conditions. https:// https:// https://dallasprogress. rene campos jr., president- 46, lives in a $1 million-dollar home in university park. he owns a $4 million-dollar vacation property hawaii. he also plays polo, rides a custom motorcycle, and drives a land rover. yet is referred to as a slumlord more concerned with maximizing his income by holding down expenses selecting the least expensive bids often using unlicensed contractors and necessary permits rather than fulfilling tenants living commitments. on top of the failed inspections, code violations, rene was also fined for failing to file 3 concecutive years of audited financial statements on time blaming ""inattentiveness"" by his accountants. to consider this companies ""code of ethics"" to be lacking would be modest. the multifamily industry depends on professional staff who routinely enter apartment homes, handle confidential information and carry out financial transactions. multifamily owners and operators check criminal backgrounds to help protect the safety and security of residents and staff and to reduce the risk of violence, theft and fraud. however, eureka hires unlicensed contractors to complete construction, renovations, electrical repairs requiring a license contractor and permits pulled by the city to approve the work prior to/ in progress/ upon completion. due to the short cuts and shady business dealings more often than not the project goes incomplete or not meeting standards. that's if the contractor doesn't walk of the job for non-payment. rene blames delayed repairs, incomplete projects, and non-payment on reluctant insurance carriers for his lack of financial resources. (reluctant is a mild term… the insurance carriers are fighting fraudulent claims on repairs/damages persisting prior to hurricane harvey they are seeking funds for.) i was even able to find reviews from previous employees describing the unprofessional & unsafe work environment. the high turnover could be in part to another unsafe attempt on expenses. i think this is what i found to be the most disturbing of all. eureka has a 30-day probation period for new hires giving them the status of ""contract labor"". common practice in multi-family housing right?? wrong!! ""contract employee"" status does not require a criminal background check!! meaning your friendly maintenance team may or may not be a convicted felon. eureka picked up the unethical practice to cure the constant staff turnover, cost of screening multiples, and only paying ""contract employee"" straight-time for anything exceeding 80 hours avoiding costly overtime expense. bottom line is this owner continues to make millions despite repeated contract violations, failure to maintain ""safe, decent, sanitary living conditions"", and continued failure to file required financial statements on time each year. it's no surprise eureka has come under relentless attacks by critics alleging fraud when it's clear they continue turning a profit by short-changing residents, employees, contractors, etc. the facts are in black in white! think twice before you engage with this owner or company.","[112429689, 112429689, 112429689, 112429689, 112429689, 112429688, 112429688, 112429688]"
7,"this community was an extremely beautiful, well kept and peaceful community. everything was so beautiful that i felt like i was at a resort. the office staff was extremely accommodating and friendly also. it could not be better located and was amidst a lot of shopping, restaurants, transportation and schools. it offered amenities that most communities did not.","[112698525, 112698525, 112239510]"


In [21]:
import re, html

# 單一詞彙或弱訊號（需要集滿或與測試語境靠近）
SOFT_TOKENS = [
    r"\btest\b",
    r"\bsample\b",
    r"\bdummy\b",
    r"\bqa\b",
    r"\bautomation\b",
]

# 先放「例外」條款（會優先於硬模板）
EXCEPTIONS = [
    r"\btest (these|this|the) (spot|spots)\b",            # 你的案例
    r"\btest drive\b",
    r"\bdrug test\b",
    r"\bcovid test\b",
    r"\btest score(s)?\b",
    r"\bfire alarm test\b",
    # ---- 真實檢測語境（健康/設備/住家）----
    r"\btest(ing)? (the )?apartment\b",
    r"\bmold (test|testing)\b",
    r"\b(air|air\s*quality|a/?c|hvac|water|lead|radon) (test|testing)\b",
    r"\bspecialist to test\b",
    r"\bshould test this\b",
    # ---- 「更新評論」真實語境排除 ----
    r"\bupdat(e|ing)\s+(my|our)\s+review(s)?\b",
    r"\bupdat(e|ing).+review sites?\b",                    # e.g., updating all review sites
    r"\bupdat(e|ing)\s+all\s+review(s)?\b",
]

# 收斂「Updating X Review…」為 UI/模板語氣（行首 + 專有名詞 + Review）
HARD_PATTERNS = [
    r"(?mi)^(?:updating|update)\s+[A-Z][\w'-]*(?:\s+[A-Z][\w'-]*)*\s+review\b(?:\s*(?:\.{3}|…))?",  # e.g., "Updating Le Villa Review..."
    r"describe your experience",
    r"help your fellow renters",
    r"we (don'?t|do not) allow reviews",
    r"lorem\s+ipsum",
    r"review\s+(content|text)\s+(goes|here)",
    r"^this is a test",
    r"for testing purpose",
    r"please ignore",
    r"\bautomation test\b",
    r"\bqa testing\b",
    r"^123+$",
    r"^abc+$",
]

# test 與測試語境詞彙的近鄰（彼此 0~3 個詞以內）
NEAR_TEST_CONTEXT = r"\btest\b(?:\W+\w+){0,3}\W+\b(review|please|ignore|data|only|post|comment|entry)\b"
NEAR_TEST_CONTEXT_REV = r"\b(review|please|ignore|data|only|post|comment|entry)\b(?:\W+\w+){0,3}\W+\btest\b"

def is_test_like(s: str) -> bool:
    if not isinstance(s, str):
        return False
    s_norm = html.unescape(s).lower().strip()
    if len(s_norm) < 5:
        return False

    # 例外語境先行排除（除非同時命中硬模板）
    if any(re.search(p, s_norm) for p in EXCEPTIONS):
        # 若同時命中硬模板仍視為測試
        if any(re.search(p, s_norm) for p in HARD_PATTERNS):
            return True
        else:
            return False

    # 硬模板：只要一條命中就判定為測試
    if any(re.search(p, s_norm) for p in HARD_PATTERNS):
        return True

    # 近鄰條件：test 與測試語境詞在 3 詞距內
    if re.search(NEAR_TEST_CONTEXT, s_norm) or re.search(NEAR_TEST_CONTEXT_REV, s_norm):
        return True

    # 軟指標需要累積（至少 2 個）
    soft_hits = sum(1 for p in SOFT_TOKENS if re.search(p, s_norm))
    if soft_hits >= 2:
        return True

    return False

# --- 套用偵測 ---
df["_is_test_like"] = df["review_text"].apply(is_test_like)
print(f"偵測到測試樣本數: {df['_is_test_like'].sum()} / {len(df)} ({df['_is_test_like'].mean():.2%})")

import pandas as pd
pd.set_option('display.max_colwidth', None)
display(df.loc[df["_is_test_like"], ["object_id","complex_id","review_text"]].head(20))


偵測到測試樣本數: 41 / 32183 (0.13%)


Unnamed: 0,object_id,complex_id,review_text
7,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
8,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
9,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
10,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
11,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
12,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
13,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
14,112932810,9199332346275191876,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
35,112932845,9199332346275143753,"Updating Le Villa Review...Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don&apos;t allow reviews with names, phone numbers, email or other contact info, so please avoid using it."
36,112932845,9199332346275143753,"Updating Le Villa Review...Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don&apos;t allow reviews with names, phone numbers, email or other contact info, so please avoid using it."


In [22]:
# --- 第二階段：語意檢測（只跑在沒被 regex 命中的評論上） ---
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# 1) 準備 QA/測試樣板語料（可自行擴充）
semantic_templates = [
    # 典型 UI/樣板
    "Updating <Property> Review... Describe your experience...",
    "Describe your experience",
    "Help your fellow renters get the most out of your review",
    "We don’t allow reviews with names or contact info",
    "This is a test review, please ignore.",
    "Sample text for testing purposes only.",
    "Review text goes here.",
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    "12345",
    "abcde",
    # 你提供的工程師模板
    "Tongue flank swine shank shankle capicola jowl meatloaf venison prosciutto ribeye.",
    "Rump bacon corned beef doner porchetta.",
    "Tail sirloin spare ribs beef turkey chuck hamburger short loin cupim shankle porchetta frankfurter meatball.",
    "Pork loin corned beef biltong fatback cupim kevin rump ground round ham ham hock capicola.",
    "Kielbasa andouille corned beef, boudin pork chop porchetta tongue t-bone fatback drumstick short loin picanha.",
    # 你提供的 Occidental / Cambridge friend 文本
    "The new common language will be more simple and regular than the existing European languages. It will be as simple as Occidental; in fact, it will be Occidental.",
    "To an English person, it will seem like simplified English, as a skeptical Cambridge friend of mine told me what Occidental is.",
    "The European languages are members of the same family.",
]

# 2) 載入輕量語意模型（第一次會自動下載）
model = SentenceTransformer("all-MiniLM-L6-v2")

# 3) 對樣板建立向量（L2 正規化後可用點積作 cosine）
tmpl_emb = model.encode(semantic_templates, normalize_embeddings=True)

# 4) 僅針對「未被 regex 命中」的評論做語意比對
if "_is_test_like" not in df.columns:
    raise ValueError("找不到 _is_test_like 欄位。請先跑『第一階段：Regex 偵測』cell。")

mask_candidates = ~df["_is_test_like"].fillna(False)
texts = df.loc[mask_candidates, "review_text"].fillna("").astype(str).tolist()

# 若候選集為空，建立空欄位後結束
if len(texts) == 0:
    df["_semantic_score"] = df.get("_semantic_score", np.nan)
    df["_semantic_test_like"] = df.get("_semantic_test_like", False)
else:
    emb = model.encode(texts, normalize_embeddings=True)
    # 5) 計算每筆與樣板的最大相似度（cosine = dot 因已 normalize）
    max_sims = (emb @ tmpl_emb.T).max(axis=1)

    # 寫回 DataFrame
    df.loc[mask_candidates, "_semantic_score"] = max_sims

    # 6) 設定語意門檻（可調整：高→更嚴格；低→抓更多）
    THRESH = 0.65
    df["_semantic_test_like"] = df.get("_semantic_test_like", False)
    df.loc[mask_candidates, "_semantic_test_like"] = df.loc[mask_candidates, "_semantic_score"] > THRESH

# 7) 合併最終判定（不覆蓋你原本的 _is_test_like；產生新欄位）
df["_is_test_combined"] = df["_is_test_like"].fillna(False) | df["_semantic_test_like"].fillna(False)
df["_why"] = np.where(
    df["_is_test_like"].fillna(False), "regex",
    np.where(df["_semantic_test_like"].fillna(False), "semantic", "")
)

# 8) 檢視結果（前 20 筆）
cols = ["_is_test_combined","_why","_is_test_like","_semantic_test_like","_semantic_score","review_text"]
display(df.loc[df["_is_test_combined"], cols].sort_values(
    ["_is_test_like","_semantic_test_like","_semantic_score"], ascending=[False, False, False]
).head(20))

print(
    f"Regex 命中: {int(df['_is_test_like'].sum())} | "
    f"Semantic 新增命中: {int((df['_semantic_test_like'] & ~df['_is_test_like']).sum())} | "
    f"合併總計: {int(df['_is_test_combined'].sum())}"
)


Unnamed: 0,_is_test_combined,_why,_is_test_like,_semantic_test_like,_semantic_score,review_text
7,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
8,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
9,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
10,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
11,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
12,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
13,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
14,True,regex,True,False,,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
35,True,regex,True,False,,"Updating Le Villa Review...Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don&apos;t allow reviews with names, phone numbers, email or other contact info, so please avoid using it."
36,True,regex,True,False,,"Updating Le Villa Review...Describe your experience, be it good or bad, but don&apos;t go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don&apos;t allow reviews with names, phone numbers, email or other contact info, so please avoid using it."


Regex 命中: 41 | Semantic 新增命中: 20 | 合併總計: 61


In [23]:
import pandas as pd

df = pd.read_csv("../data/raw/exploratory_output.csv")
print(f"共有 {len(df):,} 筆資料")
df.head()



共有 32,183 筆資料


Unnamed: 0,object_id,complex_id,vote_reason_id,reason,date_created,date_updated,review_text,_text_raw,_len_words,_punct_ratio,...,_is_dup_near,_dup_group_id,_norm_text,_is_duplicate_text,_is_dup_near_m,_is_test_like,_semantic_score,_semantic_test_like,_is_test_combined,_why
0,112885551,972359147075013,6,"It contains threats, lewdness or hate speech",2021-09-27 13:52:34,2021-09-27 13:52:34,"Initially I was excited about moving to Settlers Gate but the experience quickly soured. Shortly after being approved for the apartment and had paid all of the required fees to reserve the space, I was told that the date that was originally given to me to move was no longer available. They asked me to move a week away from the original date. They did not try to accommodate me with an alternative option since it was their error in the first place. Seeing that I am not from the area I asked for a refund to find a place that could accommodate the original move in date. I was told that I would have to wait 2-4 weeks to receive a full refund because the refund comes from corporate. Now I have to spend extra money on admin, app, and deposit fees because of their lack of professionalism with their scheduling move in dates. Also, one of the male staff members hung up in my face when I was expressing my concern about the situation and how it was handled. The price for this place is awesome but the lack of professionalism is not worth the hassle.","Initially I was excited about moving to Settlers Gate but the experience quickly soured. Shortly after being approved for the apartment and had paid all of the required fees to reserve the space, I was told that the date that was originally given to me to move was no longer available. They asked me to move a week away from the original date. They did not try to accommodate me with an alternative option since it was their error in the first place. Seeing that I am not from the area I asked for a refund to find a place that could accommodate the original move in date. I was told that I would have to wait 2-4 weeks to receive a full refund because the refund comes from corporate. Now I have to spend extra money on admin, app, and deposit fees because of their lack of professionalism with their scheduling move in dates. Also, one of the male staff members hung up in my face when I was expressing my concern about the situation and how it was handled. The price for this place is awesome but the lack of professionalism is not worth the hassle.",199,0.013308,...,False,,"initially i was excited about moving to settlers gate but the experience quickly soured. shortly after being approved for the apartment and had paid all of the required fees to reserve the space, i was told that the date that was originally given to me to move was no longer available. they asked me to move a week away from the original date. they did not try to accommodate me with an alternative option since it was their error in the first place. seeing that i am not from the area i asked for a refund to find a place that could accommodate the original move in date. i was told that i would have to wait 2-4 weeks to receive a full refund because the refund comes from corporate. now i have to spend extra money on admin, app, and deposit fees because of their lack of professionalism with their scheduling move in dates. also, one of the male staff members hung up in my face when i was expressing my concern about the situation and how it was handled. the price for this place is awesome but the lack of professionalism is not worth the hassle.",False,False,False,0.25174,False,False,
1,112795026,9199332346275186185,1,It's for the wrong community,2021-08-23 17:11:15,2021-08-23 17:11:15,Love my apartment and the staff here. I like the landscape and the mature trees.Great location to everything and good rental rates. Overall satisfied and would recommend the apartments to friends or coworkers.,Love my apartment and the staff here. I like the landscape and the mature trees.Great location to everything and good rental rates. Overall satisfied and would recommend the apartments to friends or coworkers.,34,0.019139,...,False,,love my apartment and the staff here. i like the landscape and the mature trees.great location to everything and good rental rates. overall satisfied and would recommend the apartments to friends or coworkers.,False,False,False,0.437073,False,False,
2,112795026,9199332346275186185,1,It's for the wrong community,2021-08-23 16:55:42,2021-08-23 16:55:42,Love my apartment and the staff here. I like the landscape and the mature trees.Great location to everything and good rental rates. Overall satisfied and would recommend the apartments to friends or coworkers.,Love my apartment and the staff here. I like the landscape and the mature trees.Great location to everything and good rental rates. Overall satisfied and would recommend the apartments to friends or coworkers.,34,0.019139,...,False,,love my apartment and the staff here. i like the landscape and the mature trees.great location to everything and good rental rates. overall satisfied and would recommend the apartments to friends or coworkers.,False,False,False,0.437073,False,False,
3,112889848,9199332346275166581,3,It contains false information,2021-08-17 09:41:23,2021-08-17 09:41:23,My experience with Century was outstanding while I lived there. Nice neighbors everyone kept to themselves and very safe with all of the surveillance and only key access doors. The staff were excellent lead by an exceptional Manager Tina who was very easy to work with always available for my living needs and very nice and professional at all times. I enjoyed my stay while living here in Anchorage. Thank You...,My experience with Century was outstanding while I lived there. Nice neighbors everyone kept to themselves and very safe with all of the surveillance and only key access doors. The staff were excellent lead by an exceptional Manager Tina who was very easy to work with always available for my living needs and very nice and professional at all times. I enjoyed my stay while living here in Anchorage. Thank You...,71,0.016949,...,False,,my experience with century was outstanding while i lived there. nice neighbors everyone kept to themselves and very safe with all of the surveillance and only key access doors. the staff were excellent lead by an exceptional manager tina who was very easy to work with always available for my living needs and very nice and professional at all times. i enjoyed my stay while living here in anchorage. thank you...,False,False,False,0.252121,False,False,
4,112932850,9199332346275193048,3,It contains false information,2021-08-16 14:25:58,2021-08-16 14:25:58,"Tongue flank swine shank shankle capicola jowl meatloaf venison prosciutto ribeye. Rump bacon corned beef doner porchetta. Tail sirloin spare ribs beef turkey chuck hamburger short loin cupim shankle porchetta frankfurter meatball. Pork loin corned beef biltong fatback cupim kevin rump ground round ham ham hock capicola. Venison cow doner leberkas capicola tongue chuck turkey ground round jowl pork loin beef. Kielbasa andouille corned beef, boudin pork chop porchetta tongue t-bone fatback drumstick short loin picanha.","Tongue flank swine shank shankle capicola jowl meatloaf venison prosciutto ribeye. Rump bacon corned beef doner porchetta. Tail sirloin spare ribs beef turkey chuck hamburger short loin cupim shankle porchetta frankfurter meatball. Pork loin corned beef biltong fatback cupim kevin rump ground round ham ham hock capicola. Venison cow doner leberkas capicola tongue chuck turkey ground round jowl pork loin beef. Kielbasa andouille corned beef, boudin pork chop porchetta tongue t-bone fatback drumstick short loin picanha.",77,0.015296,...,False,,"tongue flank swine shank shankle capicola jowl meatloaf venison prosciutto ribeye. rump bacon corned beef doner porchetta. tail sirloin spare ribs beef turkey chuck hamburger short loin cupim shankle porchetta frankfurter meatball. pork loin corned beef biltong fatback cupim kevin rump ground round ham ham hock capicola. venison cow doner leberkas capicola tongue chuck turkey ground round jowl pork loin beef. kielbasa andouille corned beef, boudin pork chop porchetta tongue t-bone fatback drumstick short loin picanha.",False,False,False,0.790424,True,True,semantic


In [24]:
import re, math
from collections import Counter
from typing import Iterable

# -----------------------------
# Tunables (adjust per corpus)
# -----------------------------
LONG_RUN_WORDS      = 50
MAX_PUNCT_RATIO     = 0.55
NONASCII_THR        = 0.20
SHORT_ALLOWED       = {"ok", "good", "great", "nice", "love", "bad", "meh", "mid", "fine"}
NEGATIVE_KEYWORDS   = {"terrible","awful","scam","fraud","hate","worst","sucks","filthy","disgusting"}
FIRST_PERSON_HINTS  = {"i ","i'm ","i’ve ","i've ","my ","me ","we ","our "}
TIME_PLACE_HINTS    = {
    "today","yesterday","last week","monday","tuesday","wednesday","thursday","friday","saturday","sunday",
    "202"," ny","new york","boston","la ","los angeles","chicago","seattle","sf ","san francisco","miami",
}
KEYBOARD_SMASH_SEQS = {"asdf","qwer","zxcv","hjkl","asdfgh","qwerty","zxcvbn"}

EMOJI_PATTERN = re.compile("[\U0001F300-\U0001FAFF\U00002700-\U000027BF]")
RE_SENT_END   = re.compile(r"[.!?…]+")                       # English punctuation only
RE_WORD       = re.compile(r"\b[\w’']+\b", re.UNICODE)
RE_REPEAT_TOK = re.compile(r"(\b[\w’']+\b)(?:\s+\1){1,}", re.UNICODE)
RE_REPEAT_CHAR= re.compile(r"(.)\1{3,}", re.UNICODE)         # >=4 repeated chars
RE_CONS_RUN   = re.compile(r"(?i)[bcdfghjklmnpqrstvwxyz]{5,}")
RE_URL        = re.compile(r"https?://|www\.", re.IGNORECASE)
RE_EMAIL      = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)

def _safe_str(s) -> str:
    return (s or "").strip()

def _words(s: str) -> list[str]:
    return RE_WORD.findall(s.lower())

def _char_trigram_entropy(s: str) -> float:
    s = _safe_str(s).lower()
    if len(s) < 10:
        return 99.0
    trigrams = [s[i:i+3] for i in range(len(s)-2)]
    if not trigrams:
        return 99.0
    c = Counter(trigrams)
    total = sum(c.values())
    return -sum((n/total) * math.log(n/total + 1e-12, 2) for n in c.values())

def _vowel_ratio(s: str) -> float:
    s = _safe_str(s).lower()
    if not s:
        return 0.0
    vowels = sum(ch in "aeiou" for ch in s)
    letters= sum(ch.isalpha() for ch in s)
    return vowels / max(1, letters)

def _emoji_ratio(s: str) -> float:
    s = _safe_str(s)
    if not s:
        return 0.0
    em = len(EMOJI_PATTERN.findall(s))
    return em / max(1, len(s))

def _url_like_ratio(s: str) -> float:
    s = _safe_str(s)
    if not s:
        return 0.0
    url_hits = sum(1 for _ in RE_URL.finditer(s))
    return url_hits / max(1, len(s.split()))

# 取文字欄位：容忍多種常見欄位名
def _get_text(row) -> str:
    return _safe_str(
        row.get("_text_raw")
        or row.get("review_text")
        or row.get("text")
        or row.get("content")
        or ""
    )

# ---- 強化版 gibberish_score_v3 ----
def gibberish_score_v3(s: str) -> int:
    """
    Signals (add points = more likely garbage):
    +1 very low lexical diversity (unique_ratio < 0.35)
    +1 very long single run without sentence-ending punctuation ([.!?…]) and word count > LONG_RUN_WORDS
    +1 consecutive repeated tokens (e.g., 'great great great')
    +1 repeated chars >= 4 (aaaa, !!!!, ~~~~)
    +1 low vowel ratio (<0.25) OR long consonant run (>=5)
    +1 low char trigram entropy (< 2.3)
    +1 keyboard smash sequences (asdf / qwerty / zxcvbn)
    +1 URL-dominant (url_like_ratio > 0.4) OR contains email with short text (<12 words)

    Guards (subtract points = more likely genuine):
    -1 contains NEGATIVE_KEYWORDS
    -1 first-person hints (I/my/me/we/our)
    -1 time/place hints (today, Monday, New York, etc.)

    Extra heuristics:
    +2 letters-only long run (>=30) without sentence ends
    +2 high-entropy (>5.0) & mid-low vowel ratio (<0.45) when len(s) >= 60
    """
    if not isinstance(s, str) or not s.strip():
        return 0

    s_lower = s.lower()
    ws = _words(s_lower)
    if not ws:
        return 0

    score = 0

    unique_ratio = len(set(ws)) / max(1, len(ws))
    if unique_ratio < 0.35:
        score += 1

    has_sentence_end = bool(RE_SENT_END.search(s))
    if (not has_sentence_end) and (len(ws) > LONG_RUN_WORDS):
        score += 1

    if RE_REPEAT_TOK.search(s_lower):
        score += 1
    if RE_REPEAT_CHAR.search(s):
        score += 1

    vowel_ratio = _vowel_ratio(s)
    if vowel_ratio < 0.25 or RE_CONS_RUN.search(s_lower):
        score += 1

    entropy = _char_trigram_entropy(s)
    if entropy < 2.3:
        score += 1

    if any(k in s_lower for k in KEYBOARD_SMASH_SEQS):
        score += 1

    url_like = _url_like_ratio(s)
    if url_like > 0.4 or (RE_EMAIL.search(s) and len(ws) < 12):
        score += 1

    if any(kw in s_lower for kw in NEGATIVE_KEYWORDS):
        score -= 1
    if any(h in s_lower for h in FIRST_PERSON_HINTS):
        score -= 1
    if any(h in s_lower for h in TIME_PLACE_HINTS):
        score -= 1

    # 〈新增〉長串字母湯：拿掉非 a-z 後仍很長，且沒有句點/驚嘆號等結尾符號
    letters_only = re.sub(r"[^a-z]", "", s_lower)
    if len(letters_only) >= 30 and not has_sentence_end:
        score += 2

    # 〈新增〉高熵 + 偏低母音比例 + 長字串
    if len(s) >= 60 and entropy > 5.0 and vowel_ratio < 0.45:
        score += 2

    return max(0, score)

# ---- 容錯＋補特徵版 is_low_quality_v3 ----
# 允許多種欄位名；哪個有值就用哪個
def _get_text(row) -> str:
    return _safe_str(
        row.get("_text_raw")
        or row.get("review_text")
        or row.get("text")
        or row.get("content")
        or ""
    )

def is_low_quality_v3(row) -> bool:
    """
    Final rules (同你原本的規則)：
    0) wc < 2 → drop
    1) wc ∈ [2,4] and contains SHORT_ALLOWED → keep
    2) wc > 10000 → drop
    3) punct_ratio > MAX_PUNCT_RATIO or nonascii_ratio > NONASCII_THR → drop，
       但若為長文且含負面或第一人稱字樣，則不丟
    4) Emotion guard: wc >= 10 and punct_ratio < 0.35 and has [!?] → keep
    5) Emoji-heavy (>0.25) and short (wc < 8) → drop
    6) URL-only / link farm (url_like_ratio > 0.5) → drop
    7) gibberish_score_v3 >= 3 → drop；若 ==2，除非有負面/第一人稱/時間地點提示，否則丟
    """
    s = _get_text(row)

    # 若前處理欄位缺漏或為 0/NaN，這裡動態計算
    try:
        wc = int(row.get("_len_words", 0))
    except Exception:
        wc = 0
    if wc <= 0:
        wc = len(_words(s))

    try:
        pr = float(row.get("_punct_ratio", 0.0))
    except Exception:
        pr = 0.0
    if pr == 0.0 and s:
        punct_chars = set(".,!?;:…—-()[]{}'\"")
        pr = sum(ch in punct_chars for ch in s) / max(1, len(s))

    try:
        nr = float(row.get("_nonascii_ratio", 0.0))
    except Exception:
        nr = 0.0
    if nr == 0.0 and s:
        nr = sum(ord(ch) > 127 for ch in s) / max(1, len(s))

    # ---- 以下規則 ----
    if wc < 2:
        return True

    if 2 <= wc <= 4:
        ws = set(_words(s))
        if not ws.isdisjoint(SHORT_ALLOWED):
            return False

    if wc > 10000:
        return True

    # --- 修正版：高標點/非ASCII 保留「真實抱怨文」 ---
    has_first_person = any(h in s.lower() for h in FIRST_PERSON_HINTS)
    has_negative_kw  = any(kw in s.lower() for kw in NEGATIVE_KEYWORDS)

    if pr > MAX_PUNCT_RATIO or nr > NONASCII_THR:
        if wc >= 20 and (has_first_person or has_negative_kw):
            pass  # 視為真實評論，繼續往下判斷
        else:
            return True
    # -----------------------------------------------------

    if wc >= 10 and pr < 0.35 and re.search(r"[!?]", s):
        return False

    if _emoji_ratio(s) > 0.25 and wc < 8:
        return True

    if _url_like_ratio(s) > 0.5:
        return True

    gib = gibberish_score_v3(s)

    if gib >= 3:
        return True
    if gib == 2:
        s_lower = s.lower()
        has_guard = (
            any(kw in s_lower for kw in NEGATIVE_KEYWORDS)
            or any(h in s_lower for h in FIRST_PERSON_HINTS)
            or any(h in s_lower for h in TIME_PLACE_HINTS)
        )
        return not has_guard

    return False




In [25]:
needle = "hgtghjjvfesdfhjkoouygrsdcvbnkopiytrewqasddfvcxzsdfvnjjhgdserfgjkoppkmnnbggthbvff...,,imbue,,lunggujjgfdswefgyytrsdfhmnbvcxsd,..mjnbfgjoitesdghk..jhvvsweghioopkhgfvhj,.ljhvdrfdsweryuopknvxawwfgil..jhfddvh"
print(is_low_quality_v3({"review_text": needle}))


True


In [26]:
import os
import pandas as pd
import numpy as np

# 讀檔（若 df 尚未存在）
csv_path = "../data/raw/exploratory_output.csv"
if "df" not in globals():
    df = pd.read_csv(csv_path)
    print(f"[info] 讀入 {csv_path} ｜ 共 {len(df):,} 列")
else:
    print(f"[info] 已存在 df（共 {len(df):,} 列），直接使用）")

# 對齊 _text_raw（若缺則從常見欄位鏡像）
if "_text_raw" not in df.columns:
    for cand in ["review_text", "text", "content"]:
        if cand in df.columns:
            df["_text_raw"] = df[cand].astype(str)
            print(f"[info] 使用欄位 {cand} → 映射為 _text_raw")
            break
    if "_text_raw" not in df.columns:
        raise KeyError("找不到文字欄位（_text_raw / review_text / text / content）")

# 補齊必要特徵（若缺）
if "_len_words" not in df.columns:
    df["_len_words"] = df["_text_raw"].fillna("").str.findall(r"\b[\w’']+\b").apply(len)
if "_punct_ratio" not in df.columns:
    punct_chars = set(".,!?;:…—-()[]{}'\"")
    df["_punct_ratio"] = df["_text_raw"].fillna("").apply(lambda s: sum(ch in punct_chars for ch in s) / max(1, len(s)))
if "_nonascii_ratio" not in df.columns:
    df["_nonascii_ratio"] = df["_text_raw"].fillna("").apply(lambda s: sum(ord(ch) > 127 for ch in s) / max(1, len(s)))

# ✅ 一律重算 is_low_quality（避免舊版結果）
print("[info] 正在重新計算 is_low_quality 欄位 ...")
df["is_low_quality"] = df.apply(is_low_quality_v3, axis=1)
print("[info] 已完成重算。")

# 篩出低品質評論
low_mask = df["is_low_quality"] == True
cols = [c for c in ["object_id", "_text_raw", "review_text", "_len_words", "_punct_ratio", "_nonascii_ratio", "is_low_quality"] if c in df.columns]
low_quality_df = df.loc[low_mask, cols]

# 統計
n_total = len(df)
n_low = len(low_quality_df)
pct = n_low / max(1, n_total) * 100
print(f"[summary] 被判定為低品質的共有 {n_low:,} 筆（{pct:.2f}%）")

# 預覽前 10 筆（避免整張表塞滿輸出）
if n_low > 0:
    print("\n=== 預覽前 10 筆低品質評論 ===")
    display(low_quality_df.head(10))
else:
    print("[info] 沒有低品質評論。")

# 匯出
output_path = "../data/debug/low_quality_rows.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
low_quality_df.to_csv(output_path, index=False)
print(f"[info] 已匯出 {n_low:,} 筆至 {output_path}")


[info] 已存在 df（共 32,183 列），直接使用）
[info] 正在重新計算 is_low_quality 欄位 ...
[info] 已完成重算。
[summary] 被判定為低品質的共有 2,385 筆（7.41%）

=== 預覽前 10 筆低品質評論 ===


Unnamed: 0,object_id,_text_raw,review_text,_len_words,_punct_ratio,_nonascii_ratio,is_low_quality
38,112932831,The only reason why they would like me is because they have to wait for a the time to go get on their food so I can go nuggets lol lol but I’ll let you know if you have any questions or questions I need to help you out with your thoughts about,The only reason why they would like me is because they have to wait for a the time to go get on their food so I can go nuggets lol lol but I’ll let you know if you have any questions or questions I need to help you out with your thoughts about,54,0.0,0.004115,True
39,112932827,The same time for me to see if I could have the same thing to be able with my phone lol but I’ll let you know when I’m ready for a meeting and I can go back on the time to see if you guys are interested and if I could,The same time for me to see if I could have the same thing to be able with my phone lol but I’ll let you know when I’m ready for a meeting and I can go back on the time to see if you guys are interested and if I could,53,0.0,0.009217,True
40,112932831,The only reason why they would like me is because they have to wait for a the time to go get on their food so I can go nuggets lol lol but I’ll let you know if you have any questions or questions I need to help you out with your thoughts about,The only reason why they would like me is because they have to wait for a the time to go get on their food so I can go nuggets lol lol but I’ll let you know if you have any questions or questions I need to help you out with your thoughts about,54,0.0,0.004115,True
122,112846703,"e Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest","e Specific Don&apos;t just complain about something; rather, describe specific things you do or don&apos;t like. Explain what happened. We&apos;ve found users lend more weight to a review with details. Be Courteous and Honest",37,0.038095,0.0,True
129,112718022,Final case to check out the first day of that day I was gonna we had yyyy our day and we had our last game for the first week of our game so I think I would be a great team but if I do I'll let y'all go and I will be there tomorrow I'll be happy Birthday I,Final case to check out the first day of that day I was gonna we had yyyy our day and we had our last game for the first week of our game so I think I would be a great team but if I do I&apos;ll let y&apos;all go and I will be there tomorrow I&apos;ll be happy Birthday I,63,0.011719,0.0,True
130,112821726,review post test test test ndhei w icbsbbwhsj e ejehebe ueuebe ejebe ejeje e eieueie isieje jeieiiee ueueueu jdjdjdj jejeje uwjwjw ieueue eiusheeh iqowiwir liqueur iwiwhebr wiwoejhr iwieiir wiejrhrhbe att QA,review post test test test ndhei w icbsbbwhsj e ejehebe ueuebe ejebe ejeje e eieueie isieje jeieiiee ueueueu jdjdjdj jejeje uwjwjw ieueue eiusheeh iqowiwir liqueur iwiwhebr wiwoejhr iwieiir wiejrhrhbe att QA,31,0.0,0.0,True
131,112889749,"The Big Oxmox advised her not to do so, because there were thousands of bad Commas, wild Question Marks and devious Semikoli, but the Little Blind Text didn’t listen. She packed her seven versalia, put her initial into the belt and made herself on the way.","The Big Oxmox advised her not to do so, because there were thousands of bad Commas, wild Question Marks and devious Semikoli, but the Little Blind Text didn’t listen. She packed her seven versalia, put her initial into the belt and made herself on the way.",47,0.023438,0.003906,True
174,112929642,BbbbbbbbbbbbbbbbbbbbbbbbbbnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnMmmmmmmmmkmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmkkmmmmmmmmmmkmkkmmm,BbbbbbbbbbbbbbbbbbbbbbbbbbnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnMmmmmmmmmkmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmkkmmmmmmmmmmkmkkmmm,1,0.0,0.0,True
183,112853620,"The Quin apartments are a beautiful addition to Plainfield and surrounding community. There are various layouts all featuring nice open space, large windows and high end appliances. The Quin has an impressive range of amenities including beautiful common space, outdoor grills, gym and business center. It is about a 10 minute walk to the Netherwood train station.","The Quin apartments are a beautiful addition to Plainfield and surrounding community. There are various layouts all featuring nice open space, large windows and high end appliances. The Quin has an impressive range of amenities including beautiful common space, outdoor grills, gym and business center. It is about a 10 minute walk to the Netherwood train station.",57,0.019231,0.0,True
185,112926966,"There is a loud train, but other than that its spectacular. There is a loud train, but other than that its spectacular. There is a loud train, but other than that its spectacular. There is a loud train, but other than that its spectacular.","There is a loud train, but other than that its spectacular. There is a loud train, but other than that its spectacular. There is a loud train, but other than that its spectacular. There is a loud train, but other than that its spectacular.",44,0.033473,0.0,True


[info] 已匯出 2,385 筆至 ../data/debug/low_quality_rows.csv


In [27]:
!python find_duplicate_defs.py 01_exploratory_iteration.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


=== Function definitions report ===
Total defs found: 31
Unique function names: 26
Functions with duplicates: 5

--- Duplicates ---

_get_text  (count=2)
  - 01_exploratory_iteration.ipynb:816  [cell #23 line 69]  signature: _get_text(row)
  - 01_exploratory_iteration.ipynb:905  [cell #23 line 158]  signature: _get_text(row)

is_test_like  (count=2)
  - 01_exploratory_iteration.ipynb:143  [cell #8 line 49]  signature: is_test_like(s)
  - 01_exploratory_iteration.ipynb:624  [cell #20 line 53]  signature: is_test_like(s)

normalize_for_exact  (count=2)
  - 01_exploratory_iteration.ipynb:376  [cell #16 line 13]  signature: normalize_for_exact(s)
  - 01_exploratory_iteration.ipynb:417  [cell #17 line 17]  signature: normalize_for_exact(s)

normalize_for_near  (count=2)
  - 01_exploratory_iteration.ipynb:385  [cell #16 line 22]  signature: normalize_for_near(s)
  - 01_exploratory_iteration.ipynb:425  [cell #17 line 25]  signature: normalize_for_near(s)

normalize_text  (count=2)
  - 01_expl

In [28]:
import inspect, textwrap

def show_active(func):
    src, start = inspect.getsourcelines(func)
    print(f"Starts at line {start}")
    print(textwrap.dedent(src[0].rstrip()))

# 範例（逐一檢）
# show_active(normalize_text)
# show_active(is_test_like)
# show_active(normalize_for_exact)
# show_active(normalize_for_near)
# show_active(_get_text)


In [29]:
# === Extract latest (last-in-file) version of target functions and write a clean module ===
from pathlib import Path
import json, os, datetime
from typing import List, Dict, Tuple
import ast

# ===== 你可能需要修改的參數 =====
NOTEBOOK_PATH = "01_exploratory_iteration.ipynb"  # 若不在當前資料夾，改成絕對路徑
TARGETS = ["normalize_text", "is_test_like", "normalize_for_exact", "normalize_for_near", "_get_text"]
OUT_PATH = Path("utils/text_normalize.py")
# =================================

# 小工具：讀取 ipynb 內的所有 code cell
def load_ipynb_code(path: Path) -> List[Tuple[int, str]]:
    txt = path.read_text(encoding="utf-8")
    data = json.loads(txt)
    cells = []
    for i, cell in enumerate(data.get("cells", [])):
        if cell.get("cell_type") == "code":
            src = cell.get("source") or []
            if isinstance(src, list):
                src = "".join(src)
            cells.append((i, src))
    return cells

# 小工具：AST 掃描某段原始碼的 function defs（含 end_lineno；若不能就只取起始）
def scan_defs_in_source(src: str, file: str, cell_index: int, base_line: int):
    out = []
    try:
        tree = ast.parse(src)
    except SyntaxError:
        return out
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            name = node.name
            start = getattr(node, "lineno", 1)
            end = getattr(node, "end_lineno", None)
            out.append({
                "name": name,
                "file": file,
                "cell_index": cell_index,
                "line_in_cell": start,
                "abs_line": base_line + start - 1,
                "end_in_cell": end,
            })
    return out

# 小工具：切出該函式完整原始碼（使用 end_lineno；若無則從 def 開始往下抓到下一個 def/class 或結尾）
def extract_func_source(src: str, start_line: int, end_line: int | None):
    lines = src.splitlines()
    if end_line is None:
        # fallback：尋找下一個 def/class
        i = start_line - 1
        end = len(lines)
        for j in range(i + 1, len(lines)):
            L = lines[j].lstrip()
            if L.startswith("def ") or L.startswith("class "):
                end = j
                break
        return "\n".join(lines[i:end]) + "\n"
    else:
        return "\n".join(lines[start_line-1:end_line]) + "\n"

# 嘗試尋找 notebook 檔案
nb_path = Path(NOTEBOOK_PATH)
if not nb_path.exists():
    # 在目前目錄搜尋名字相符的檔案
    cand = list(Path(".").glob("**/01_exploratory_iteration.ipynb"))
    if cand:
        nb_path = cand[0]
    else:
        raise FileNotFoundError(f"找不到 notebook：{NOTEBOOK_PATH}")

# 讀取所有 code cell
cells = load_ipynb_code(nb_path)

# 逐 cell 掃描所有 defs，並記錄（依檔案物理順序建立「全檔行號」）
defs = []
running_line = 1
for idx, src in cells:
    defs.extend(scan_defs_in_source(src, str(nb_path), idx, running_line))
    running_line += src.count("\n") + 1

# 依函式名分組，找出「最後一次定義」
from collections import defaultdict
group = defaultdict(list)
for d in defs:
    group[d["name"]].append(d)

latest = {}
for name, arr in group.items():
    # 以 (cell_index, abs_line) 最大者為最後一次定義
    arr_sorted = sorted(arr, key=lambda x: (x["cell_index"], x["abs_line"]))
    latest[name] = arr_sorted[-1]  # last one

# 報告 + 準備匯出
print("=== Latest function versions (by file order) ===")
export_items = []  # (name, code)
for name in TARGETS:
    if name not in group:
        print(f"[MISS] {name}  ← 檔案中找不到")
        continue
    last = latest[name]
    # 取得該 cell 原始碼
    cell_index = last["cell_index"]
    src = cells[cell_index][1]
    start = last["line_in_cell"]
    end = last.get("end_in_cell")
    code = extract_func_source(src, start, end)
    export_items.append((name, code))
    # 舊版（若有）
    dups = group[name]
    print(f"\n{name}: 取用最後定義 → cell #{cell_index}, line {start}")
    older = [d for d in dups if not (d["cell_index"] == cell_index and d["line_in_cell"] == start)]
    if older:
        print("  其他版本（較早）:")
        for o in sorted(older, key=lambda x:(x["cell_index"], x["abs_line"])):
            print(f"   - cell #{o['cell_index']}, line {o['line_in_cell']}")

# 寫出 utils/text_normalize.py
if export_items:
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    header = [
        '# Auto-generated from notebook scan',
        f'# Source notebook: {nb_path}',
        f'# Generated at: {ts}',
        '# DO NOT edit in notebook again—treat this as canonical module.\n'
    ]
    with OUT_PATH.open("w", encoding="utf-8") as f:
        f.write("\n".join(header))
        for name, code in export_items:
            # 在每個函式前加上簡短註記
            f.write(f"# --- canonical: {name} ---\n")
            f.write(code if code.endswith("\n") else code + "\n")
    print(f"\n✅ 已輸出 {len(export_items)} 個函式到: {OUT_PATH.resolve()}")
    print("接下來請在 notebook 開頭加入：\n")
    print("%load_ext autoreload\n%autoreload 2\nfrom utils.text_normalize import (\n    " + ",\n    ".join([n for n,_ in export_items]) + "\n)")
else:
    print("\n⚠️ 沒有可輸出的目標函式（TARGETS 可能不在這份 notebook 內）。")


=== Latest function versions (by file order) ===

normalize_text: 取用最後定義 → cell #18, line 5
  其他版本（較早）:
   - cell #8, line 12

is_test_like: 取用最後定義 → cell #20, line 53
  其他版本（較早）:
   - cell #8, line 49

normalize_for_exact: 取用最後定義 → cell #17, line 17
  其他版本（較早）:
   - cell #16, line 13

normalize_for_near: 取用最後定義 → cell #17, line 25
  其他版本（較早）:
   - cell #16, line 22

_get_text: 取用最後定義 → cell #23, line 158
  其他版本（較早）:
   - cell #23, line 69

✅ 已輸出 5 個函式到: /Users/tiffanytseng/Documents/ai-review-moderation-2/notebooks/utils/text_normalize.py
接下來請在 notebook 開頭加入：

%load_ext autoreload
%autoreload 2
from utils.text_normalize import (
    normalize_text,
    is_test_like,
    normalize_for_exact,
    normalize_for_near,
    _get_text
)


In [30]:
import inspect
for f in [normalize_text, is_test_like, normalize_for_exact, normalize_for_near, _get_text]:
    print(f"{f.__name__} →", inspect.getfile(f))


normalize_text → /var/folders/gz/bhmz7vj94s7207h9k2_rgndc0000gn/T/ipykernel_37974/2834907427.py
is_test_like → /var/folders/gz/bhmz7vj94s7207h9k2_rgndc0000gn/T/ipykernel_37974/1805633953.py
normalize_for_exact → /var/folders/gz/bhmz7vj94s7207h9k2_rgndc0000gn/T/ipykernel_37974/3273057046.py
normalize_for_near → /var/folders/gz/bhmz7vj94s7207h9k2_rgndc0000gn/T/ipykernel_37974/3273057046.py
_get_text → /var/folders/gz/bhmz7vj94s7207h9k2_rgndc0000gn/T/ipykernel_37974/240333897.py


In [31]:
!ls utils
!cat utils/text_normalize.py | head -n 20


text_normalize.py


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Auto-generated from notebook scan
# Source notebook: 01_exploratory_iteration.ipynb
# Generated at: 2025-10-24 14:19:38
# DO NOT edit in notebook again—treat this as canonical module.
# --- canonical: normalize_text ---
def normalize_text(s: str) -> str:
    if not isinstance(s, str): return ""
    s = html.unescape(s)
    s = s.strip().casefold()
    s = re.sub(r"\s+", " ", s)
    return s
# --- canonical: is_test_like ---
def is_test_like(s: str) -> bool:
    if not isinstance(s, str):
        return False
    s_norm = html.unescape(s).lower().strip()
    if len(s_norm) < 5:
        return False

    # 例外語境先行排除（除非同時命中硬模板）


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
import inspect
for name in ["normalize_text","is_test_like","normalize_for_exact","normalize_for_near"]:
    f = getattr(textnorm, name)
    print(f"{name} →", inspect.getfile(f))



normalize_text → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
is_test_like → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
normalize_for_exact → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py
normalize_for_near → /Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py


In [33]:
from importlib import reload
reload(textnorm)


<module 'utils.text_normalize' from '/Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py'>

In [34]:
!grep -n "import" /Users/tiffanytseng/Documents/ai-review-moderation-2/notebooks/utils/text_normalize.py


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
import inspect
inspect.getfile(textnorm)


'/Users/tiffanytseng/Documents/ai-review-moderation-2/utils/text_normalize.py'

In [36]:
textnorm.normalize_text("Hello&nbsp;WORLD!!!   ")
# 預期：'hello world!!!'


'hello world!!!'

In [37]:
assert textnorm.normalize_text("  A  ") == "a"
assert textnorm.normalize_for_exact("Hi\u200b  ! ") == "hi !"
assert textnorm.normalize_for_near("Hi, world!!!") == "hi world"
print("smoke tests passed ✓")


smoke tests passed ✓


In [38]:
# === 測試 is_test_like 標記數量 ===
print("[info] 正在套用 is_test_like ...")
df["is_test_like"] = df["_text_raw"].apply(textnorm.is_test_like)

# 統計結果
n_total = len(df)
n_testlike = df["is_test_like"].sum()
pct = n_testlike / max(1, n_total) * 100
print(f"共有 {n_testlike:,} 筆被判定為 is_test_like（占 {pct:.2f}%）")

# 預覽前 10 筆
if n_testlike > 0:
    print("\n=== 預覽前 10 筆 is_test_like ===")
    display(df.loc[df["is_test_like"], ["object_id", "_text_raw"]].head(10))
else:
    print("[info] 沒有符合 is_test_like 的資料。")


[info] 正在套用 is_test_like ...
共有 225 筆被判定為 is_test_like（占 0.70%）

=== 預覽前 10 筆 is_test_like ===


Unnamed: 0,object_id,_text_raw
7,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
8,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
9,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
10,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
11,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
12,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
13,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
14,112932810,"Help your fellow renters get the most out of your review with the following tips: Be Specific Don't just complain about something; rather, describe specific things you do or don't like. Explain what happened. We've found users lend more weight to a review with details. Be Courteous and Honest Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site."
35,112932845,"Updating Le Villa Review...Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don't allow reviews with names, phone numbers, email or other contact info, so please avoid using it."
36,112932845,"Updating Le Villa Review...Describe your experience, be it good or bad, but don't go overboard. Keep your review precise and to the point. We value open expression, but profanity and hateful speech are not allowed on the site. No Personal Info We don't allow reviews with names, phone numbers, email or other contact info, so please avoid using it."


In [39]:
from importlib import reload
reload(textnorm)

sample = "hgtghjjvfesdfhjkoouygrsdcvbnkopiytrewqasddfvcxzsdfvnjjhgdserfgjkoppkmnnbggthbvff...,,imbue,,lunggujjgfdswefgyytrsdfhmnbvcxsd,..mjnbfgjoitesdghk..jhvvsweghioopkhgfvhj,.ljhvdrfdsweryuopknvxawwfgil..jhfddvh"
print("gib_score =", textnorm.gibberish_score_v3(sample))
print("low_quality =", textnorm.is_low_quality_v3({"_text_raw": sample, "_len_words": len(sample.split())}))


gib_score = 2
low_quality = True


In [43]:
# === Run low_quality v3 over full dataset (simple & reliable) ===
from importlib import reload
import time
import pandas as pd

reload(textnorm)

t0 = time.time()

# 1) 準備必要特徵（向量化，避免 axis=1 慢計算）
if "_text_raw" not in df.columns:
    for cand in ["review_text", "text", "content"]:
        if cand in df.columns:
            df["_text_raw"] = df[cand].astype(str)
            break
    if "_text_raw" not in df.columns:
        raise KeyError("缺少文字欄位：_text_raw / review_text / text / content")

if "_len_words" not in df.columns:
    df["_len_words"] = (
        df["_text_raw"].fillna("")
        .str.findall(r"\b[\w’']+\b")
        .apply(len)
    )

# 2) 計算 gibberish 分數（每列一次函式呼叫，但很快）
print("[info] computing gibberish_score_v3 ...")
df["_gibberish_score_v3"] = df["_text_raw"].apply(textnorm.gibberish_score_v3)

# 3) 判定低品質（函式只用 _text_raw + _len_words）
print("[info] computing is_low_quality_v3 ...")
df["_is_low_quality_v3"] = df.apply(textnorm.is_low_quality_v3, axis=1)

elapsed = time.time() - t0
n = len(df)
n_low = int(df["_is_low_quality_v3"].sum())
print(f"[summary] low_quality_v3: {n_low:,} / {n:,}  ({n_low/max(1,n):.2%})  | time: {elapsed:.1f}s")

# 4) 取重點欄位，小預覽，不整表全印
preview_cols = [c for c in ["object_id","_text_raw","_gibberish_score_v3","_len_words"] if c in df.columns]
display(df.loc[df["_is_low_quality_v3"], preview_cols].head(10))

# 5) 匯出（可選）
outp = "../data/debug/low_quality_v3_rows.csv"
import os
os.makedirs(os.path.dirname(outp), exist_ok=True)
df.loc[df["_is_low_quality_v3"], preview_cols].to_csv(outp, index=False)
print(f"[info] exported: {outp}")


[info] computing gibberish_score_v3 ...
[info] computing is_low_quality_v3 ...
[summary] low_quality_v3: 612 / 32,183  (1.90%)  | time: 22.5s


Unnamed: 0,object_id,_text_raw,_gibberish_score_v3,_len_words
38,112932831,The only reason why they would like me is because they have to wait for a the time to go get on their food so I can go nuggets lol lol but I’ll let you know if you have any questions or questions I need to help you out with your thoughts about,3,54
40,112932831,The only reason why they would like me is because they have to wait for a the time to go get on their food so I can go nuggets lol lol but I’ll let you know if you have any questions or questions I need to help you out with your thoughts about,3,54
76,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41
77,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41
83,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41
84,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41
85,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41
86,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41
87,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41
88,112923937,"Best all around experience you can ask for in a apartment home. From safety, security, events, well keep, you being the top priority of the community, to an community wall where you and your neighbors can connect in stay in touch",3,41


[info] exported: ../data/debug/low_quality_v3_rows.csv
