In [2]:
import pandas as pd
import os
import re

DATA_FILE = '../processed/microblogs_clean.csv'

def analyze_crime_blogs():
    if not os.path.exists(DATA_FILE):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y file {DATA_FILE}")
        return

    print(f"üëÆ ƒêang r√† so√°t AN NINH & T·ªòI PH·∫†M (Phi√™n b·∫£n High-Precision)...")
    print(f"M·ª•c ti√™u: Lo·∫°i b·ªè tin th·ªÉ thao/gi·∫£i tr√≠, t·∫≠p trung v√†o h√†nh vi th·ª±c t·∫ø.")

    # 1) ƒê·ªçc d·ªØ li·ªáu (S·ª¨A: th√™m 'ID')
    try:
        df = pd.read_csv(DATA_FILE, usecols=['ID', 'timestamp', 'text_clean', 'Location'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    except Exception as e:
        print(f"‚ùå L·ªói ƒë·ªçc file: {e}")
        return

    # 2) L·ªçc th·ªùi gian (Ng√†y 12,13,14,15 th√°ng 5)
    mask_days = (df['timestamp'].dt.day.isin([12, 13, 14, 15])) & (df['timestamp'].dt.month == 5)
    df_subset = df.loc[mask_days].copy()
    if df_subset.empty:
        print("‚úÖ Kh√¥ng c√≥ b·∫£n ghi trong kho·∫£ng th·ªùi gian y√™u c·∫ßu.")
        return

    df_subset['text_lower'] = df_subset['text_clean'].fillna('').str.lower()

    COVERT_KEYWORDS = {
        '1. T·ªôi ph·∫°m ƒê∆∞·ªùng ph·ªë (Street Crime)': [
            "stole my", "robbed at", "thief", "breaking into", "break in",
            "burglar", "mugged", "pickpocket", "shoplift",
            "gunshot", "stabbed", "shooting at", "murdered", "homicide",
            "dead body", "corpse", "beaten up",
            "police chase", "arrested for", "handcuffed", "sirens wailing",
            "cop car", "police raid", "swat team"
        ],
        '2. Kh·ªßng b·ªë & N·ªï (Terror & Explosives)': [
            "bomb threat", "bombing", "suicide bomber", "improvised explosive",
            "detonated", "explosion heard", "loud blast", "shrapnel",
            "hostage situation", "terrorist attack", "hijacked",
            "suspicious device", "found a bomb", "grenade"
        ],
        '3. V≈© kh√≠ Sinh/H√≥a h·ªçc (Bio-Chemical)': [
            "gas leak", "nerve gas", "sarin", "anthrax", "ricin",
            "mustard gas", "biohazard", "hazmat suit", "decontamination",
            "chemical smell", "smell of gas", "strange fog", "yellow smoke",
            "eyes burning", "skin blistering", "choking on smoke"
        ],
        '4. Ho·∫°t ƒë·ªông Kh·∫£ nghi (Suspicious Ops)': [
            "black van", "white van", "unmarked car", "suspicious truck",
            "getaway car", "speeding away", "license plate",
            "unattended bag", "suspicious package", "left a bag", "dropped a bag",
            "stalking me", "following me", "spying on", "looking in windows",
            "wearing a mask"
        ]
    }

    NOISE_KEYWORDS = [
        'movie', 'film', 'cinema', 'watch', 'episode', 'series', 'season', 'tv show',
        'trailer', 'spoiler', 'actor', 'actress', 'director', 'hollywood', 'dvd',
        'masterchef', 'true blood', 'vampire', 'zombie', 'fiction', 'novel',
        'game', 'play', 'xbox', 'ps3', 'wii', 'nintendo', 'console', 'download',
        'app', 'iphone', 'android', 'update', 'level', 'score', 'high score',
        'sport', 'match', 'win', 'won', 'lost', 'lose', 'defeat', 'championship',
        'league', 'tournament', 'cup', 'golf', 'shoot', 'shot', 'par', 'birdie',
        'basketball', 'football', 'soccer', 'hockey', 'ufc', 'fight', 'round',
        'song', 'music', 'band', 'concert', 'album', 'track', 'listen', 'radio',
        'dj', 'remix', 'singer', 'ticket', 'tour', 'video', 'youtube',
        'twitter jail', 'kill time', 'killing me', 'stole my heart', 'shoot me',
        'photo shoot', 'snapshot', 'bomb food', 'da bomb', 'blow up my phone',
        'burn calories', 'sun burn', 'hair cut', 'bag of', 'shopping bag'
    ]

    # 3) Compile regex an to√†n (S·ª¨A: escape keyword)
    def make_pat(phrases):
        escaped = [re.escape(p) for p in phrases]
        return re.compile("|".join(escaped), flags=re.IGNORECASE)

    noise_re = make_pat(NOISE_KEYWORDS)

    for group_name, keywords in COVERT_KEYWORDS.items():
        print(f"\n--- üîç R√† so√°t nh√≥m: {group_name.upper()} ---")

        target_re = make_pat(keywords)

        matched = df_subset[df_subset['text_lower'].str.contains(target_re, na=False)]
        clean_matched = matched[~matched['text_lower'].str.contains(noise_re, na=False)]
        clean_matched = clean_matched.sort_values('timestamp')

        if clean_matched.empty:
            print("   (Kh√¥ng t√¨m th·∫•y d·∫•u hi·ªáu ƒë√°ng ng·ªù sau khi l·ªçc k·ªπ)")
            continue

        print(f"   ‚ö†Ô∏è T√¨m th·∫•y {len(clean_matched)} tin ti·ªÅm nƒÉng:")

        for _, row in clean_matched.iterrows():
            id_str = str(row.get('ID', '')).strip()
            time_str = row['timestamp'].strftime('%d/%m %H:%M') if pd.notna(row['timestamp']) else "NA"
            loc_str = str(row.get('Location', '')).replace('POINT', '').replace('(', '').replace(')', '').strip()
            text = row.get('text_clean', '')
            print(f"{id_str}|{time_str:<10} | {loc_str:<20} | {text}")

    print("‚úÖ HO√ÄN T·∫§T R√Ä SO√ÅT.")

if __name__ == "__main__":
    analyze_crime_blogs()


üëÆ ƒêang r√† so√°t AN NINH & T·ªòI PH·∫†M (Phi√™n b·∫£n High-Precision)...
M·ª•c ti√™u: Lo·∫°i b·ªè tin th·ªÉ thao/gi·∫£i tr√≠, t·∫≠p trung v√†o h√†nh vi th·ª±c t·∫ø.

--- üîç R√† so√°t nh√≥m: 1. T·ªòI PH·∫†M ƒê∆Ø·ªúNG PH·ªê (STREET CRIME) ---
   ‚ö†Ô∏è T√¨m th·∫•y 108 tin ti·ªÅm nƒÉng:
11799|12/05 01:14 | 42.24071 93.36002    | my last tweet fails cause it made no sense what so ever. all i know is that i want to go to go to the beach for spring break in a yea
8577|12/05 02:12 | 42.16818 93.46546    | why does it take 3 cop cars to pull someone over for speeding on speer?
4850|12/05 03:55 | 42.23334 93.33166    | walking to god know's where?! good thing alex's and mark's dad is with us. don't feel like getting mugged again.
118971|12/05 04:08 | 42.26105 93.46027    | hugh hefner is 84 has a 24 year old girlfriend she says age don't matter sixty year difference like sleeping with a corpse? ugh.
165475|12/05 05:07 | 42.23098 93.35003    | haha.. i remember when twitter stole my pictur