In [None]:
!pip install langdetect

import pandas as pd
import numpy as np
import re
from langdetect import detect, DetectorFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Pin langdetect's random seed before the first detect() call.
# NOTE(review): per the langdetect README, detection is otherwise
# non-deterministic for short/ambiguous text - confirm against the installed version.
DetectorFactory.seed = 0

# ==============================================================================
# 1. FILTER CONFIGURATION
# ==============================================================================
MIN_CHARS = 70      # Minimum text length, in characters
MIN_WORDS = 15      # Minimum number of whitespace-separated words
DUPLICATE_THRESHOLD = 0.85  # Cosine similarity above which two texts count as duplicates

# Junk keywords: any text containing one of these is rejected immediately.
# Matching is done against the lower-cased text, so only lower-case entries
# can ever match; the capitalised channel name is kept for reference only.
# Every entry MUST be followed by a comma: adjacent string literals are
# silently concatenated by Python, which previously fused the capitalised
# channel name with "copyright" and disabled the copyright filter.
SPAM_KEYWORDS = [
    "subscribe", "ƒëƒÉng k√Ω k√™nh", "b·∫•m chu√¥ng", "follow k√™nh",
    "ghi·ªÅn m√¨ g√µ", "ghien mi go", "ghienmigo", "faptv", "Ghi·ªÅn M√¨ G√µ",
    "copyright", "b·∫£n quy·ªÅn", "link trong bio",
    "http://", "https://", "www.",
]

# ==============================================================================
# 2. PROCESSING FUNCTIONS
# ==============================================================================

def clean_text_basic(text):
    """Strip unusual characters from *text* and normalise its whitespace.

    Non-string input yields an empty string. Word characters, whitespace and
    the punctuation set ``. , ! ? ; : " ' - % / ( )`` are kept; every other
    character is replaced by a space. Whitespace runs are then collapsed to
    a single space and the result is trimmed.
    """
    if isinstance(text, str):
        # Anything outside the allowed character class becomes a space.
        without_symbols = re.sub(r'[^\w\s.,!?;:"\'\-%/()]', ' ', text)
        # Collapse the whitespace runs left behind, then trim both ends.
        collapsed = re.sub(r'\s+', ' ', without_symbols)
        return collapsed.strip()
    return ""

def is_spam_content(text):
    """Return True when *text* should be treated as spam.

    A text is spam when its lower-cased form contains any entry of
    ``SPAM_KEYWORDS``, or when the whole text consists only of ASCII
    letters, digits, dots, hyphens and underscores (a single bare token
    such as "vtv.vn" or "tiktok.com").
    """
    lowered = text.lower()

    # 1. Banned-keyword check (substring match on the lower-cased text).
    if any(banned in lowered for banned in SPAM_KEYWORDS):
        return True

    # 2. Whole text looks like a bare domain / URL-ish token.
    looks_like_bare_token = re.match(r'^[a-zA-Z0-9\.\-_]+$', text) is not None
    return looks_like_bare_token

def is_valid_quality(text):
    """Return True when *text* is long enough and not dominated by digits.

    Rejects text shorter than ``MIN_CHARS`` characters or with fewer than
    ``MIN_WORDS`` whitespace-separated words, and text in which more than
    half of the characters are digits (rows that are mostly raw numbers).
    """
    n_chars = len(text)

    # 1. Length: character count first, word count only if that passes.
    if n_chars < MIN_CHARS or len(text.split()) < MIN_WORDS:
        return False

    # 2. Digit ratio: more than 50% digits means the row is mostly figures.
    n_digits = sum(1 for ch in text if ch.isdigit())
    return not (n_digits / n_chars > 0.5)

def is_vietnamese(text):
    """Return True when langdetect classifies *text* as Vietnamese ('vi').

    Non-string input and text with fewer than three whitespace-separated
    words return False without calling the detector. Any error raised by
    the detector is treated as "not Vietnamese" (best effort), but
    KeyboardInterrupt / SystemExit are allowed to propagate so a long
    filtering run can still be interrupted.
    """
    # Too little text to classify; skip the detector entirely.
    if not isinstance(text, str) or len(text.split()) < 3:
        return False
    try:
        return detect(text) == 'vi'
    except Exception:
        # Detector failure on this text: treat it as not Vietnamese.
        return False

# ==============================================================================
# 3. MAIN FUNCTION: CLEAN THE REAL FILE
# ==============================================================================

def clean_real_dataset_specifically(
    input_csv="/content/drive/MyDrive/TikTok_Project/dataset_fake.csv",
    output_csv="/content/drive/MyDrive/TikTok_Project/dataset_fake_final_clean.csv",
    category="real",
):
    """Clean one transcript CSV and write the filtered rows to *output_csv*.

    Steps, in order:
      1. drop rows with a missing ``text`` and apply ``clean_text_basic``;
      2. remove rows flagged by ``is_spam_content``;
      3. remove rows failing ``is_valid_quality``;
      4. keep only rows for which ``is_vietnamese`` is True;
      5. drop exact duplicate texts, then near-duplicates whose TF-IDF
         cosine similarity exceeds ``DUPLICATE_THRESHOLD`` (the earliest
         row of each group is kept);
      6. overwrite the ``category`` column with *category* and save.

    Args:
        input_csv: Path of the CSV to clean; it must have a ``text`` column.
        output_csv: Path the cleaned CSV is written to (UTF-8 with BOM).
        category: Label written into the ``category`` column of every row.

    Returns:
        None. Progress is printed; a read failure is printed and the
        function returns without writing anything.

    NOTE(review): the defaults are kept exactly as before for backward
    compatibility: they read the *fake* dataset while labelling rows
    'real' and printing "REAL" banners. Pass all three arguments explicitly,
    e.g. for the real dataset
    ".../tiktok_videos_all_keywords_real_done.csv" ->
    ".../dataset_real_final_clean.csv".
    """
    print("üöÄ B·∫ÆT ƒê·∫¶U L√ÄM S·∫†CH DATASET REAL")

    try:
        df = pd.read_csv(input_csv)
        print(f"üìä T·ªïng s·ªë d√≤ng ban ƒë·∫ßu: {len(df)}")
    except Exception as e:
        print(f"‚ùå L·ªói ƒë·ªçc file: {e}")
        return

    # 1. Drop empty rows, then normalise the text.
    df = df.dropna(subset=['text'])
    df['text'] = df['text'].astype(str).apply(clean_text_basic)

    # 2. Spam filter (subscribe prompts, channel intros, URLs...).
    #    Boolean masks are used instead of temporary columns so that input
    #    columns that happen to share a helper name are never overwritten.
    print("üóëÔ∏è ƒêang qu√©t v√† x√≥a c√°c c√¢u Spam/Intro r√°c...")
    spam_mask = df['text'].apply(is_spam_content).astype(bool)
    df = df[~spam_mask]
    print(f"   -> Sau khi l·ªçc Spam: c√≤n {len(df)} d√≤ng")

    # 3. Quality filter (length and digit ratio).
    print(f"‚úÇÔ∏è ƒêang l·ªçc c√¢u ng·∫Øn (<{MIN_WORDS} t·ª´)...")
    quality_mask = df['text'].apply(is_valid_quality).astype(bool)
    df = df[quality_mask]
    print(f"   -> Sau khi l·ªçc ng·∫Øn: c√≤n {len(df)} d√≤ng")

    # 4. Language filter.
    print("üåç ƒêang ki·ªÉm tra ng√¥n ng·ªØ...")
    vietnamese_mask = df['text'].apply(is_vietnamese).astype(bool)
    df = df[vietnamese_mask]

    # 5. Duplicate removal: exact first, then fuzzy.
    print("ü§ñ ƒêang x√≥a tr√πng l·∫∑p n·ªôi dung (>85%)...")
    # reset_index makes index labels equal to row positions, which the
    # positional drop below relies on.
    df = df.drop_duplicates(subset=['text'], keep='first').reset_index(drop=True)

    if len(df) > 1:
        tfidf = TfidfVectorizer(min_df=1).fit_transform(df['text'])
        cosine_sim = cosine_similarity(tfidf, tfidf)
        to_drop = _near_duplicate_positions(cosine_sim, DUPLICATE_THRESHOLD)
        df_final = df.drop(index=sorted(to_drop))
    else:
        df_final = df

    # 6. Normalise the label column.
    df_final['category'] = category

    # Save first so the success banner is only printed once the file exists.
    df_final.to_csv(output_csv, index=False, encoding='utf-8-sig')

    print("\n" + "="*40)
    print("‚úÖ HO√ÄN T·∫§T FILE REAL!")
    print(f"üëâ C√≤n l·∫°i: {len(df_final)} d√≤ng ch·∫•t l∆∞·ª£ng cao.")
    print(f"üëâ File l∆∞u t·∫°i: {output_csv}")
    print("="*40)


def _near_duplicate_positions(similarity, threshold):
    """Return the row positions to drop from a square similarity matrix.

    Rows are visited in order; each row that has not itself been dropped
    marks every later row whose similarity to it is strictly greater than
    *threshold*. This yields the same set as a pairwise Python loop over
    (i, j > i), with the inner loop done by NumPy.
    """
    dropped = set()
    for i, row in enumerate(similarity):
        if i in dropped:
            continue
        later_hits = np.flatnonzero(row[i + 1:] > threshold) + i + 1
        dropped.update(later_hits.tolist())
    return dropped

# Run the function
clean_real_dataset_specifically()

üöÄ B·∫ÆT ƒê·∫¶U L√ÄM S·∫†CH DATASET REAL
üìä T·ªïng s·ªë d√≤ng ban ƒë·∫ßu: 1590
üóëÔ∏è ƒêang qu√©t v√† x√≥a c√°c c√¢u Spam/Intro r√°c...
   -> Sau khi l·ªçc Spam: c√≤n 1415 d√≤ng
‚úÇÔ∏è ƒêang l·ªçc c√¢u ng·∫Øn (<15 t·ª´)...
   -> Sau khi l·ªçc ng·∫Øn: c√≤n 1415 d√≤ng
üåç ƒêang ki·ªÉm tra ng√¥n ng·ªØ...
ü§ñ ƒêang x√≥a tr√πng l·∫∑p n·ªôi dung (>85%)...

‚úÖ HO√ÄN T·∫§T FILE REAL!
üëâ C√≤n l·∫°i: 1415 d√≤ng ch·∫•t l∆∞·ª£ng cao.
üëâ File l∆∞u t·∫°i: /content/drive/MyDrive/TikTok_Project/dataset_fake_final_clean.csv


In [None]:
import pandas as pd
import os

def merge_fake_real_final(
    path_fake="/content/drive/MyDrive/TikTok_Project/dataset_fake_final_clean.csv",
    path_real="/content/drive/MyDrive/TikTok_Project/dataset_real_final_clean.csv",
    path_final="/content/drive/MyDrive/TikTok_Project/dataset_TIKTOK_FAKE_NEWS_FINAL.csv",
):
    """Merge the cleaned fake and real CSVs into one shuffled, labelled CSV.

    Every row from *path_fake* gets ``category == 'fake'`` and every row
    from *path_real* gets ``category == 'real'``, overwriting any label
    already in the files. The frames are concatenated, columns are
    reordered so the preferred ones come first, rows are shuffled with a
    fixed seed, and the result is written to *path_final*.

    Args:
        path_fake: CSV holding the cleaned fake-news rows.
        path_real: CSV holding the cleaned real-news rows.
        path_final: Destination CSV (UTF-8 with BOM, no index column).

    Returns:
        None. Progress and the final label distribution are printed; if
        either input cannot be read, the error is printed and nothing is
        written.
    """
    print("üöÄ B·∫ÆT ƒê·∫¶U G·ªòP DATASET FAKE V√Ä REAL...")

    # --- Read both inputs and force the correct label on each ---
    try:
        print(f"üìÇ ƒêang ƒë·ªçc Fake: {path_fake}")
        df_fake = pd.read_csv(path_fake)
        df_fake['category'] = 'fake'
        print(f"   -> S·ªë l∆∞·ª£ng Fake: {len(df_fake)} d√≤ng")

        print(f"üìÇ ƒêang ƒë·ªçc Real: {path_real}")
        df_real = pd.read_csv(path_real)
        df_real['category'] = 'real'
        print(f"   -> S·ªë l∆∞·ª£ng Real: {len(df_real)} d√≤ng")
    except Exception as e:
        print(f"‚ùå L·ªói ƒë·ªçc file: {e}")
        return

    # --- Concatenate; ignore_index renumbers the rows from 0 ---
    print("üîó ƒêang gh√©p 2 file l·∫°i v·ªõi nhau...")
    df_merged = pd.concat([df_fake, df_real], ignore_index=True)
    print(f"üìä T·ªïng s·ªë d·ªØ li·ªáu sau khi g·ªôp: {len(df_merged)} d√≤ng")

    # --- Reorder columns: preferred ones first, anything else appended ---
    # NOTE(review): 'thumnail_url' is spelled as in the original list;
    # presumably it mirrors the column name in the data - verify before renaming.
    desired_order = [
        'url', 'text', 'category',          # the three columns the model needs
        'desc', 'keyword', 'createTime',    # content information
        'shareCount', 'commentCount', 'playCount', 'diggCount', 'collectCount',  # engagement
        'author_nickname', 'author_unique_id', 'author_id', 'video_id', 'thumnail_url',  # author metadata
    ]
    # Only keep preferred columns that actually exist in the merged data.
    existing_cols = [c for c in desired_order if c in df_merged.columns]
    already_placed = set(existing_cols)
    remaining_cols = [c for c in df_merged.columns if c not in already_placed]
    df_merged = df_merged[existing_cols + remaining_cols]

    # --- Shuffle so the file is not one block of fake followed by real ---
    print("üîÄ ƒêang x√°o tr·ªôn ng·∫´u nhi√™n th·ª© t·ª± d√≤ng...")
    df_merged = df_merged.sample(frac=1, random_state=42).reset_index(drop=True)

    # --- Save and report ---
    df_merged.to_csv(path_final, index=False, encoding='utf-8-sig')

    print("\n" + "="*50)
    print("üéâ CH√öC M·ª™NG! B·∫†N ƒê√É C√ì DATASET HO√ÄN CH·ªàNH!")
    print(f"üëâ ƒê∆∞·ªùng d·∫´n: {path_final}")
    print(f"üëâ T·ªïng s·ªë d√≤ng: {len(df_merged)}")
    print("-" * 30)
    print("Ph√¢n b·ªë nh√£n:")
    print(df_merged['category'].value_counts())
    print("="*50)

# Run the function
merge_fake_real_final()

üöÄ B·∫ÆT ƒê·∫¶U G·ªòP DATASET FAKE V√Ä REAL...
üìÇ ƒêang ƒë·ªçc Fake: /content/drive/MyDrive/TikTok_Project/dataset_fake_final_clean.csv
   -> S·ªë l∆∞·ª£ng Fake: 1415 d√≤ng
üìÇ ƒêang ƒë·ªçc Real: /content/drive/MyDrive/TikTok_Project/dataset_real_final_clean.csv
   -> S·ªë l∆∞·ª£ng Real: 1069 d√≤ng
üîó ƒêang gh√©p 2 file l·∫°i v·ªõi nhau...
üìä T·ªïng s·ªë d·ªØ li·ªáu sau khi g·ªôp: 2484 d√≤ng
üîÄ ƒêang x√°o tr·ªôn ng·∫´u nhi√™n th·ª© t·ª± d√≤ng...

üéâ CH√öC M·ª™NG! B·∫†N ƒê√É C√ì DATASET HO√ÄN CH·ªàNH!
üëâ ƒê∆∞·ªùng d·∫´n: /content/drive/MyDrive/TikTok_Project/dataset_TIKTOK_FAKE_NEWS_FINAL.csv
üëâ T·ªïng s·ªë d√≤ng: 2484
------------------------------
Ph√¢n b·ªë nh√£n:
category
fake    1415
real    1069
Name: count, dtype: int64
