In [24]:
import pandas as pd
import os
import re

# Path (relative to the notebook's working directory) of the cleaned microblog CSV
# that the loading cell reads.
DATA_FILE = '../processed/microblogs_clean.csv'


In [25]:
def make_pat(phrases, whole_words=False):
    """Compile a case-insensitive regex that matches any of the given literal phrases.

    Args:
        phrases: Iterable of literal strings. Each one is passed through
            ``re.escape`` so regex metacharacters match literally. Empty
            strings are ignored.
        whole_words: When True, a phrase only matches if it is not directly
            preceded or followed by a word character (so "ricin" no longer
            matches inside "pricing"). Defaults to False, which keeps the
            plain substring behaviour existing callers rely on
            (e.g. the stem "vaccin" matching "vaccine").

    Returns:
        A compiled ``re.Pattern`` using ``re.IGNORECASE``. If there are no
        non-empty phrases the pattern never matches anything.
    """
    escaped = [re.escape(p) for p in phrases if p]
    if not escaped:
        # "|".join([]) is "", and an empty regex matches every string, which
        # would make any filter built on it select all rows. Use an
        # always-failing lookahead instead.
        return re.compile(r"(?!)", flags=re.IGNORECASE)

    body = "|".join(escaped)
    if whole_words:
        # Lookarounds rather than \b so phrases that start or end with
        # punctuation (e.g. "film;") still get a sensible boundary check.
        body = rf"(?<!\w)(?:{body})(?!\w)"
    return re.compile(body, flags=re.IGNORECASE)


In [26]:
# Abort early with an explicit error when the cleaned CSV is not where DATA_FILE points.
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(f"‚ùå Kh√¥ng t√¨m th·∫•y file {DATA_FILE}")

# Read only the four columns used by the rest of the notebook; timestamps that
# cannot be parsed are coerced to NaT instead of raising.
wanted_cols = ['ID','timestamp','text_clean','Location']
df = pd.read_csv(DATA_FILE, usecols=wanted_cols)
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Keep posts dated 12-15 May. The year is not checked, and NaT rows fall out
# because both comparisons evaluate to False for them.
stamp = df['timestamp'].dt
is_may = stamp.month == 5
in_day_window = stamp.day.isin([12, 13, 14, 15])
df = df.loc[in_day_window & is_may].copy()

# Lower-cased text with missing values replaced by '' for the keyword filters.
df['text_lower'] = df['text_clean'].fillna('').str.lower()

print(f"üìä T·ªïng s·ªë b·∫£n ghi sau l·ªçc th·ªùi gian: {len(df)}")


üìä T·ªïng s·ªë b·∫£n ghi sau l·ªçc th·ªùi gian: 182422


In [27]:
# Trigger phrases per incident category. Keys are the labels printed by the scan
# loop, and their insertion order is the order in which categories are reported.
COVERT_KEYWORDS = dict([
    ('1. Street Crime', [
        "robbed", "breaking into", "burglar", "mugged",
        "gunshot", "stabbed", "shooting", "murdered", "homicide",
        "police chase", "arrested", "handcuffed", "swat",
    ]),
    ('2. Terror & Explosives', [
        "bombing", "suicide bomber", "grenade", "hostage",
        "terrorist attack", "explosive",
    ]),
    ('3. Bio-Chemical', [
        "anthrax", "ricin", "sarin", "nerve gas", "biohazard",
        "gas leak", "chemical smell",
    ]),
    ('4. Suspicious Ops', [
        "unmarked car", "black van", "white van",
        "suspicious package", "unattended bag", "stalking me",
    ]),
    ('5. Outbreak / Pandemic', [
        "outbreak", "epidemic", "pandemic", "quarantine", "lockdown",
    ]),
    ('6. Conspiracy / Bioterror', [
        "hoax", "cover up", "false flag", "plandemic",
        "bioweapon", "engineered virus", "lab-made virus",
    ]),
])
# Words describing an actual criminal or violent act; the street-crime branch of
# the scan loop requires at least one of them.
_crime_incident_terms = [
    "robbed", "mugged", "burglar", "break in", "breaking into",
    "stole", "theft", "kidnapped",
    "stabbed", "shooting", "gunshot", "homicide", "murder",
]
CRIME_INCIDENT = make_pat(_crime_incident_terms)

# Evidence terms specific to outbreaks (deliberately does not use "police").
# NOTE(review): make_pat matches bare substrings, so short entries such as
# "who" or "icu" also hit inside ordinary words - confirm that is acceptable.
_health_evidence_terms = [
    "cdc", "who", "health department", "hospital", "icu", "tested positive",
    "confirmed case", "confirmed cases", "outbreak", "epidemic", "pandemic",
    "quarantine", "lockdown", "vaccin", "infection", "infected", "virus",
]
HEALTH_EVIDENCE = make_pat(_health_evidence_terms)



In [28]:
# Entertainment / pop-culture vocabulary; posts containing any of these are
# discarded before category-specific filtering.
NOISE = make_pat([
    "movie", "tv show", "episode", "season", "game", "music",
    "twitter", "gta", "jailbreak", "zombie", "vampire",
])

# Outbreak category: a post must contain both a medical term (CORE) ...
PANDEMIC_CORE = make_pat([
    "virus", "flu", "fever", "infection", "infected",
    "hospital", "icu", "vaccine", "tested positive",
])

# ... and an outbreak-scale term (SIGNAL).
PANDEMIC_SIGNAL = make_pat([
    "outbreak", "epidemic", "pandemic", "quarantine", "lockdown",
])

# Conspiracy category: a conspiracy term is required, combined with a
# bio/pandemic term.
CONSP = make_pat([
    "hoax", "cover up", "false flag", "plandemic",
])

BIO = make_pat([
    "bioweapon", "engineered virus", "lab-made", "anthrax", "ricin",
])

# On-scene evidence: emergency services or reporting vocabulary.
EVIDENCE = make_pat([
    "police", "cop", "ambulance", "fire truck", "swat",
    "arrested", "investigating", "reported", "via reuters",
])

# Known false positives for the street-crime keywords "swat" and "arrested".
EXTRA_CRIME_NOISE = make_pat([
    "swatting", "swatted", "fly swatter", "swatch", "swatch books",
    "arrested development",  # title of a TV show
    "film;", "finished arrested development",
])
# Phrases typical of wire-service / newspaper copy. The street-crime branch of
# the scan loop drops every post that matches this pattern.
#
# The pattern is compiled here with whole-phrase boundaries instead of going
# through make_pat's plain substring matching: as a bare substring, "ap"
# (Associated Press) also matches inside "kidnapped", "happened", "apartment",
# ... so posts containing the CRIME_INCIDENT keyword "kidnapped" were always
# thrown away as "news". The same applies to "charged" inside "recharged".
_NEWS_STYLE_PHRASES = [
    "world briefing", "sports briefing", "ap", "reuters", "via reuters",
    "police say", "suspected", "charged", "is arrested in", "are arrested",
]
NEWS_STYLE = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(p) for p in _NEWS_STYLE_PHRASES) + r")(?!\w)",
    flags=re.IGNORECASE,
)



In [29]:
def _text_matches(frame, pattern):
    """Return a boolean mask of rows in `frame` whose 'text_lower' matches `pattern`."""
    return frame['text_lower'].str.contains(pattern, na=False)


# Scan every keyword category, apply its category-specific filters and print
# the surviving posts in chronological order.
for group, keywords in COVERT_KEYWORDS.items():
    print(f"\n--- üîç {group.upper()} ---")

    # Candidate posts: contain a category keyword and no entertainment noise.
    target = make_pat(keywords)
    subset = df[_text_matches(df, target)]
    subset = subset[~_text_matches(subset, NOISE)]

    # evidence_score is added with .assign (which returns a new frame) rather
    # than by item assignment on a boolean-indexed slice, which made pandas
    # emit SettingWithCopyWarning.

    # --- AND FILTER ---
    if "Outbreak" in group:
        # Require a medical term together with an outbreak-scale term.
        subset = subset[
            _text_matches(subset, PANDEMIC_CORE) &
            _text_matches(subset, PANDEMIC_SIGNAL)
        ]
        subset = subset.assign(
            evidence_score=subset['text_lower'].str.count(HEALTH_EVIDENCE)
        )

    elif "Conspiracy" in group:
        # Relaxed to also accept pandemic vocabulary (so the result is not
        # always empty): conspiracy term AND (bio OR medical OR outbreak term).
        subset = subset[
            _text_matches(subset, CONSP) &
            (
                _text_matches(subset, BIO) |
                _text_matches(subset, PANDEMIC_CORE) |
                _text_matches(subset, PANDEMIC_SIGNAL)
            )
        ]
        subset = subset.assign(evidence_score=0)

    else:
        # On-scene evidence: keep posts with at least one evidence term.
        subset = subset.assign(
            evidence_score=subset['text_lower'].str.count(EVIDENCE)
        )
        subset = subset[subset['evidence_score'] >= 1]

        # --- EXTRA TIGHTENING FOR STREET CRIME ONLY ---
        if "STREET CRIME" in group.upper():
            # 1) drop swat/swatch/swatting and "arrested development" noise
            subset = subset[~_text_matches(subset, EXTRA_CRIME_NOISE)]
            # 2) drop news-style posts
            subset = subset[~_text_matches(subset, NEWS_STYLE)]
            # 3) require an actual criminal / violent act
            subset = subset[_text_matches(subset, CRIME_INCIDENT)]

    subset = subset.sort_values('timestamp')

    if subset.empty:
        print("   (Kh√¥ng c√≥ k·∫øt qu·∫£)")
        continue

    print(f"   ‚ö†Ô∏è {len(subset)} b·∫£n ghi:")
    for _, r in subset.iterrows():
        t = r['timestamp'].strftime('%d/%m %H:%M') if pd.notna(r['timestamp']) else "NA"
        # Strip the "POINT(...)" wrapper so only the coordinates are printed.
        loc = str(r.get('Location', '')).replace('POINT','').replace('(','').replace(')','').strip()
        print(f"{r.get('ID','')}|{t} | {loc} | {r.get('text_clean','')}")


--- üîç 1. STREET CRIME ---
   ‚ö†Ô∏è 8 b·∫£n ghi:
175920|12/05 09:58 | 42.21859 93.23061 | ching sing attacked the police the police took evasive action and return fire. he was later found in bushes suffering from gunshot woun
21497|13/05 12:14 | 42.20444 93.37919 | the police took evasive action and return fire. the man was found suffering from gunshot wounds. he was taken to hospital dead !!
76126|13/05 15:15 | 42.20385 93.38957 | driver who was robbed showed up. told suspect arrested is a juvenile.
169912|14/05 01:08 | 42.29083 93.2326 | when a black man is murdered it is 'regrettable.' when a cop is murdered it is a national travesty #oscargrant
29238|14/05 17:37 | 42.24218 93.33805 | so... someone got stabbed right outside of target! i'm a bit confused as to why the ambulance is in the middle of the street though!
99809|14/05 22:06 | 42.20208 93.33885 | why 2 findings conflict in fatal police shooting
115568|15/05 06:04 | 42.21771 93.35522 | 23 div person found by police on the

In [30]:
# Biological / chemical agents a post must mention to be considered at all.
# NOTE(review): make_pat matches bare substrings, so e.g. "ricin" also hits
# inside "pricing" - confirm whether that is intended.
_bio_agent_terms = [
    "anthrax", "ricin", "sarin", "smallpox",
    "virus", "bacteria", "pathogen", "flu strain",
    "engineered virus", "lab strain",
]
BIO_AGENT = make_pat(_bio_agent_terms)

# Preparation / handling vocabulary (loose version).
_bio_prep_terms = [
    "test sample", "testing", "experiment",
    "release", "spread", "contaminate",
    "weaponize", "modify", "culture",
    "lab leak", "leaked sample",
]
BIO_PREP = make_pat(_bio_prep_terms)

# Laboratory / facility context vocabulary.
_bio_context_terms = [
    "lab", "laboratory", "facility",
    "biohazard", "containment", "clean room",
    "sample storage", "transport",
]
BIO_CONTEXT = make_pat(_bio_context_terms)


In [31]:
print("\n--- üîç BIO-TERROR PRE-INCIDENT DETECTION ---")

# A post qualifies when it names an agent AND carries either preparation or
# laboratory-context vocabulary.
lowered = df['text_lower']
names_agent = lowered.str.contains(BIO_AGENT, na=False)
has_prep = lowered.str.contains(BIO_PREP, na=False)
has_context = lowered.str.contains(BIO_CONTEXT, na=False)
candidates = df[names_agent & (has_prep | has_context)]

# Exclude news reports.
news_pat = make_pat(["reuters","associated press","world briefing","news"])
looks_like_news = candidates['text_lower'].str.contains(news_pat, na=False)
subset = candidates[~looks_like_news]

if not subset.empty:
    print(f"‚ö†Ô∏è Ph√°t hi·ªán {len(subset)} blog ƒê√ÅNG NGHI:")
    for post_id, stamp, body in zip(subset['ID'], subset['timestamp'], subset['text_clean']):
        t = stamp.strftime('%d/%m %H:%M')
        print(f"{post_id} | {t} | {body}")
else:
    print("‚úÖ Kh√¥ng ph√°t hi·ªán blog m·ªù √°m li√™n quan chu·∫©n b·ªã kh·ªßng b·ªë sinh h·ªçc.")



--- üîç BIO-TERROR PRE-INCIDENT DETECTION ---
‚ö†Ô∏è Ph√°t hi·ªán 1 blog ƒê√ÅNG NGHI:
80521 | 14/05 03:26 | my love affair with #apple is on the rocks. 94 hours days out from release still no iphone 4 pricing - how arrogant? annoying?


In [32]:
# Stricter preparation vocabulary: multi-word phrases that tie the action to an
# agent. This rebinds BIO_PREP, replacing the looser single-word list defined
# in an earlier cell.
_strict_prep_phrases = [
    "release virus", "release bacteria", "release pathogen",
    "intentional release", "deliberate release",
    "aerosolized",
    "contaminate water", "contaminate food",
    "spread infection",
    "weaponize virus", "weaponised pathogen",
    "engineered strain",
    "modify virus",
    "culture bacteria", "grow bacteria",
    "lab leak", "leaked pathogen",
]
BIO_PREP = make_pat(_strict_prep_phrases)

# Technology / entertainment vocabulary used to reject product-release chatter.
_negative_context_terms = [
    "iphone", "apple", "pricing", "price", "release date",
    "product release", "launch event",
    "album", "song", "movie", "film", "season",
    "software", "update", "version", "beta",
]
NEGATIVE_CONTEXT = make_pat(_negative_context_terms)


In [33]:
print("\n--- üîç BIO-TERROR PRE-INCIDENT DETECTION (STRICT) ---")

# A post qualifies when it names an agent AND carries either preparation or
# laboratory-context vocabulary.
lowered = df['text_lower']
names_agent = lowered.str.contains(BIO_AGENT, na=False)
has_prep = lowered.str.contains(BIO_PREP, na=False)
has_context = lowered.str.contains(BIO_CONTEXT, na=False)
candidates = df[names_agent & (has_prep | has_context)]

# Exclude technology / entertainment context.
off_topic = candidates['text_lower'].str.contains(NEGATIVE_CONTEXT, na=False)
candidates = candidates[~off_topic]

# Exclude news reports.
news_pat = make_pat(["reuters","associated press","world briefing","news"])
looks_like_news = candidates['text_lower'].str.contains(news_pat, na=False)
subset = candidates[~looks_like_news]

if not subset.empty:
    print(f"‚ö†Ô∏è Ph√°t hi·ªán {len(subset)} blog ƒê√ÅNG NGHI:")
    for post_id, stamp, body in zip(subset['ID'], subset['timestamp'], subset['text_clean']):
        t = stamp.strftime('%d/%m %H:%M')
        print(f"{post_id} | {t} | {body}")
else:
    print("‚úÖ Kh√¥ng ph√°t hi·ªán blog m·ªù √°m li√™n quan chu·∫©n b·ªã kh·ªßng b·ªë sinh h·ªçc.")



--- üîç BIO-TERROR PRE-INCIDENT DETECTION (STRICT) ---
‚ö†Ô∏è Ph√°t hi·ªán 1 blog ƒê√ÅNG NGHI:
33319 | 13/05 02:36 | is it bad to air dry hair? some book i read say that your scalp can grow bacteria or wtv if you air dry your hair. seriously?! 0_0
