In [14]:
import os
import re
import pandas as pd
from tqdm import tqdm

# === SETUP ===
year = 2025
output_folder = f'output/{year}'
keyword_file = 'keywords.txt'
os.makedirs(output_folder, exist_ok=True)

# === Load daftar keyword dari file txt ===
def load_theme_keywords_from_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]
    return keywords

theme_keywords = load_theme_keywords_from_txt(keyword_file)

# === Load daftar negara asing ===
foreign_countries = [
    "Amerika", "AS", "Amerika Serikat", "USA", "United States", "global",
    "China", "Tiongkok", "Trump", "US", "Vietnam", "Federal Reserve", "ECB",
    "Jepang", "Korea", "Eropa", "Uni Eropa", "UE", "UK", "Inggris", "RBA",
    "India", "Australia", "Jerman", "Perancis", "Brazil", "Rusia", "Ringgit", 
    "euro", "Georgia", "Beijing", "Iran", "Biden", "Fed", "Middle Eastern",
    "eurozone", "Nasdaq", "PricewaterhouseCoopers", "Kwantas", "Kuala Lumpur",
    "chinese", "european", "Japan", "Jerome Powell", "Canada", "America", "OPEC",
    "Czech", "Hungary", "Poland", "BoE", "lira", "Wall Street", "rupee", "Russian",
    "IMF", "British", "United Kingdom", "Saudi", "saham", "emiten", "FOMC",
    "Michigan","Daghlian","New Zealand","Filipina","Bonds","Federal", "Americans",
    "Euros", "American", "BOJ"
]

# === Fungsi filter regex (keyword Bahasa Indonesia) ===
def filter_sentences_regex(df, keywords):
    pattern = r'\b(' + '|'.join(re.escape(k.lower()) for k in keywords) + r')\b'
    mask = df['Sentences'].astype(str).str.lower().str.contains(pattern, regex=True)
    return df[mask].reset_index(drop=True)

# === Fungsi filter Bahasa Inggris (hard rule) ===
def has_inflation_and_expectation(text):
    if pd.isna(text): return False
    text = text.lower()
    return ('inflation' in text and ('expectation' in text or 'forecast' in text or 'projection' in text))

# === Fungsi filter domestik (berbasis negara asing) ===
def should_keep_sentence(sentence, pattern):
    if pd.isna(sentence): return True
    sentence_lower = sentence.lower()
    has_foreign = re.search(pattern, sentence_lower) is not None
    has_indonesia = 'indonesia' in sentence_lower
    return not has_foreign or (has_foreign and has_indonesia)

# === Main loop untuk 1 tahun ===
for month in range(1, 13):
    month_str = f"{month:02d}"
    file_date = f"{year}_{month_str}"

    print(f"\n📁 Memproses bulan: {file_date}")

    #tokenized_file = os.path.join(output_folder, f'tokenized_{file_date}.xlsx')
    tokenized_file = os.path.join(output_folder, f'tokenized_{file_date}.csv')
    filtered_file = os.path.join(output_folder, f'filtered_regex_{file_date}_v2.xlsx')
    domestic_file = os.path.join(output_folder, f'domestic_inflation_{file_date}_v2.xlsx')

    if not os.path.exists(tokenized_file):
        print(f"❌ File tidak ditemukan: {tokenized_file}")
        continue

    # === Step 1: Filter regex Indonesia ===
    #df = pd.read_excel(tokenized_file)
    df = pd.read_csv(tokenized_file)
    print(f"✅ Memuat {len(df)} kalimat dari {tokenized_file}")

    print("🔍 Filter regex untuk Bahasa Indonesia...")
    filtered_df = filter_sentences_regex(df, theme_keywords)

    # === Step 2: Filter rule Bahasa Inggris ===
    print("🔍 Filter hard-rule untuk Bahasa Inggris...")
    df['keep_en'] = df['Sentences'].apply(has_inflation_and_expectation)
    filtered_en = df[df['keep_en'] == True].drop(columns=['keep_en']).reset_index(drop=True)
    print(f"✅ Dapat {len(filtered_en)} kalimat Bahasa Inggris")

    # === Step 3: Gabungkan hasil filter regex + Inggris
    combined_df = pd.concat([filtered_df, filtered_en], ignore_index=True)
    combined_df.to_excel(filtered_file, index=False)
    print(f"💾 Disimpan hasil gabungan: {len(combined_df)} kalimat → {filtered_file}")

    # === Step 4: Filter kalimat domestik (buang asing yang tidak ada "Indonesia")
    foreign_pattern = r'\b(' + '|'.join(re.escape(fc.lower()) for fc in foreign_countries) + r')\b'

    print("🇮🇩 Filtering kalimat domestik...")
    tqdm.pandas(desc="Domestic Filter")
    combined_df['keep'] = combined_df['Sentences'].progress_apply(lambda x: should_keep_sentence(x, foreign_pattern))
    domestic_df = combined_df[combined_df['keep'] == True].drop(columns=['keep']).reset_index(drop=True)
    print(f"✅ Filter domestik: dari {len(combined_df)} → {len(domestic_df)} kalimat")

    domestic_df.to_excel(domestic_file, index=False)
    print(f"✅ Disimpan ke: {domestic_file}")



📁 Memproses bulan: 2025_01
✅ Memuat 1293954 kalimat dari output/2025\tokenized_2025_01.csv
🔍 Filter regex untuk Bahasa Indonesia...


  mask = df['Sentences'].astype(str).str.lower().str.contains(pattern, regex=True)


🔍 Filter hard-rule untuk Bahasa Inggris...
✅ Dapat 39 kalimat Bahasa Inggris
💾 Disimpan hasil gabungan: 326 kalimat → output/2025\filtered_regex_2025_01_v2.xlsx
🇮🇩 Filtering kalimat domestik...


Domestic Filter: 100%|████████████████████████████████████████████████████████████| 326/326 [00:00<00:00, 54464.97it/s]

✅ Filter domestik: dari 326 → 196 kalimat
✅ Disimpan ke: output/2025\domestic_inflation_2025_01_v2.xlsx

📁 Memproses bulan: 2025_02





✅ Memuat 1343941 kalimat dari output/2025\tokenized_2025_02.csv
🔍 Filter regex untuk Bahasa Indonesia...


  mask = df['Sentences'].astype(str).str.lower().str.contains(pattern, regex=True)


🔍 Filter hard-rule untuk Bahasa Inggris...
✅ Dapat 66 kalimat Bahasa Inggris
💾 Disimpan hasil gabungan: 316 kalimat → output/2025\filtered_regex_2025_02_v2.xlsx
🇮🇩 Filtering kalimat domestik...


Domestic Filter: 100%|████████████████████████████████████████████████████████████| 316/316 [00:00<00:00, 70771.04it/s]


✅ Filter domestik: dari 316 → 185 kalimat
✅ Disimpan ke: output/2025\domestic_inflation_2025_02_v2.xlsx

📁 Memproses bulan: 2025_03
✅ Memuat 951923 kalimat dari output/2025\tokenized_2025_03.csv
🔍 Filter regex untuk Bahasa Indonesia...


  mask = df['Sentences'].astype(str).str.lower().str.contains(pattern, regex=True)


🔍 Filter hard-rule untuk Bahasa Inggris...
✅ Dapat 58 kalimat Bahasa Inggris
💾 Disimpan hasil gabungan: 234 kalimat → output/2025\filtered_regex_2025_03_v2.xlsx
🇮🇩 Filtering kalimat domestik...


Domestic Filter: 100%|████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 78235.72it/s]

✅ Filter domestik: dari 234 → 129 kalimat
✅ Disimpan ke: output/2025\domestic_inflation_2025_03_v2.xlsx

📁 Memproses bulan: 2025_04
❌ File tidak ditemukan: output/2025\tokenized_2025_04.csv

📁 Memproses bulan: 2025_05
❌ File tidak ditemukan: output/2025\tokenized_2025_05.csv

📁 Memproses bulan: 2025_06
❌ File tidak ditemukan: output/2025\tokenized_2025_06.csv

📁 Memproses bulan: 2025_07
❌ File tidak ditemukan: output/2025\tokenized_2025_07.csv

📁 Memproses bulan: 2025_08
❌ File tidak ditemukan: output/2025\tokenized_2025_08.csv

📁 Memproses bulan: 2025_09
❌ File tidak ditemukan: output/2025\tokenized_2025_09.csv

📁 Memproses bulan: 2025_10
❌ File tidak ditemukan: output/2025\tokenized_2025_10.csv

📁 Memproses bulan: 2025_11
❌ File tidak ditemukan: output/2025\tokenized_2025_11.csv

📁 Memproses bulan: 2025_12
❌ File tidak ditemukan: output/2025\tokenized_2025_12.csv





In [12]:
import os
import re
import pandas as pd
from tqdm import tqdm

# === SETUP ===
year = 2024
output_folder = f'output/{year}'
keyword_file = 'keywords.txt'
os.makedirs(output_folder, exist_ok=True)

# === Load daftar keyword dari file txt ===
def load_theme_keywords_from_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]
    return keywords

theme_keywords = load_theme_keywords_from_txt(keyword_file)

# === Load daftar negara asing ===
foreign_countries = [
    "Amerika", "AS", "Amerika Serikat", "USA", "United States", "global",
    "China", "Tiongkok", "Trump", "US", "Vietnam", "Federal Reserve", "ECB",
    "Jepang", "Korea", "Eropa", "Uni Eropa", "UE", "UK", "Inggris", "RBA",
    "India", "Australia", "Jerman", "Perancis", "Brazil", "Rusia", "Ringgit", 
    "euro", "Georgia", "Beijing", "Iran", "Biden", "Fed", "Middle Eastern",
    "eurozone", "Nasdaq", "PricewaterhouseCoopers", "Kwantas", "Kuala Lumpur",
    "chinese", "european", "Japan", "Jerome Powell", "Canada", "America", "OPEC",
    "Czech", "Hungary", "Poland", "BoE", "lira", "Wall Street", "rupee", "Russian",
    "IMF", "British", "United Kingdom", "Saudi", "saham", "emiten", "FOMC",
    "Michigan","Daghlian","New Zealand","Filipina","Bonds","Federal", "Americans",
    "Euros", "American", "BOJ"
]

# === Fungsi filter regex (keyword Bahasa Indonesia) ===
def filter_sentences_regex(df, keywords):
    pattern = r'\b(' + '|'.join(re.escape(k.lower()) for k in keywords) + r')\b'
    mask = df['Sentences'].astype(str).str.lower().str.contains(pattern, regex=True)
    return df[mask].reset_index(drop=True)

# === Fungsi filter Bahasa Inggris (hard rule) ===
def has_inflation_and_expectation(text):
    if pd.isna(text): return False
    text = text.lower()
    return ('inflation' in text and ('expectation' in text or 'forecast' in text or 'projection' in text))

# === Fungsi filter domestik (berbasis negara asing) ===
def should_keep_sentence(sentence, pattern):
    if pd.isna(sentence): return True
    sentence_lower = sentence.lower()
    has_foreign = re.search(pattern, sentence_lower) is not None
    has_indonesia = 'indonesia' in sentence_lower
    return not has_foreign or (has_foreign and has_indonesia)

# === Main loop untuk 1 tahun ===
for month in range(1, 13):
    month_str = f"{month:02d}"
    file_date = f"{year}_{month_str}"

    print(f"\n📁 Memproses bulan: {file_date}")

    #tokenized_file = os.path.join(output_folder, f'tokenized_{file_date}.xlsx')
    tokenized_file = os.path.join(output_folder, f'tokenized_{file_date}.csv')
    filtered_file = os.path.join(output_folder, f'filtered_regex_{file_date}_v2.xlsx')
    domestic_file = os.path.join(output_folder, f'domestic_inflation_{file_date}_v2.xlsx')

    if not os.path.exists(tokenized_file):
        print(f"❌ File tidak ditemukan: {tokenized_file}")
        continue

    # === Step 1: Filter regex Indonesia ===
    #df = pd.read_excel(tokenized_file)
    df = pd.read_csv(tokenized_file)
    print(f"✅ Memuat {len(df)} kalimat dari {tokenized_file}")

    print("🔍 Filter regex untuk Bahasa Indonesia...")
    filtered_df = filter_sentences_regex(df, theme_keywords)

    # === Step 2: Filter rule Bahasa Inggris ===
    print("🔍 Filter hard-rule untuk Bahasa Inggris...")
    df['keep_en'] = df['Sentences'].apply(has_inflation_and_expectation)
    filtered_en = df[df['keep_en'] == True].drop(columns=['keep_en']).reset_index(drop=True)
    print(f"✅ Dapat {len(filtered_en)} kalimat Bahasa Inggris")

    # === Step 3: Gabungkan hasil filter regex + Inggris
    combined_df = pd.concat([filtered_df, filtered_en], ignore_index=True)
    combined_df.to_excel(filtered_file, index=False)
    print(f"💾 Disimpan hasil gabungan: {len(combined_df)} kalimat → {filtered_file}")

    # === Step 4: Filter kalimat domestik (buang asing yang tidak ada "Indonesia")
    foreign_pattern = r'\b(' + '|'.join(re.escape(fc.lower()) for fc in foreign_countries) + r')\b'

    print("🇮🇩 Filtering kalimat domestik...")
    tqdm.pandas(desc="Domestic Filter")
    combined_df['keep'] = combined_df['Sentences'].progress_apply(lambda x: should_keep_sentence(x, foreign_pattern))
    domestic_df = combined_df[combined_df['keep'] == True].drop(columns=['keep']).reset_index(drop=True)
    print(f"✅ Filter domestik: dari {len(combined_df)} → {len(domestic_df)} kalimat")

    domestic_df.to_excel(domestic_file, index=False)
    print(f"✅ Disimpan ke: {domestic_file}")


📁 Memproses bulan: 2024_01
❌ File tidak ditemukan: output/2024\tokenized_2024_01.csv

📁 Memproses bulan: 2024_02
❌ File tidak ditemukan: output/2024\tokenized_2024_02.csv

📁 Memproses bulan: 2024_03
❌ File tidak ditemukan: output/2024\tokenized_2024_03.csv

📁 Memproses bulan: 2024_04
❌ File tidak ditemukan: output/2024\tokenized_2024_04.csv

📁 Memproses bulan: 2024_05
❌ File tidak ditemukan: output/2024\tokenized_2024_05.csv

📁 Memproses bulan: 2024_06
❌ File tidak ditemukan: output/2024\tokenized_2024_06.csv

📁 Memproses bulan: 2024_07
❌ File tidak ditemukan: output/2024\tokenized_2024_07.csv

📁 Memproses bulan: 2024_08
❌ File tidak ditemukan: output/2024\tokenized_2024_08.csv

📁 Memproses bulan: 2024_09
❌ File tidak ditemukan: output/2024\tokenized_2024_09.csv

📁 Memproses bulan: 2024_10
❌ File tidak ditemukan: output/2024\tokenized_2024_10.csv

📁 Memproses bulan: 2024_11
✅ Memuat 1060394 kalimat dari output/2024\tokenized_2024_11.csv
🔍 Filter regex untuk Bahasa Indonesia...


  mask = df['Sentences'].astype(str).str.lower().str.contains(pattern, regex=True)


🔍 Filter hard-rule untuk Bahasa Inggris...
✅ Dapat 14 kalimat Bahasa Inggris
💾 Disimpan hasil gabungan: 230 kalimat → output/2024\filtered_regex_2024_11_v2.xlsx
🇮🇩 Filtering kalimat domestik...


Domestic Filter: 100%|████████████████████████████████████████████████████████████| 230/230 [00:00<00:00, 55441.95it/s]

✅ Filter domestik: dari 230 → 157 kalimat





✅ Disimpan ke: output/2024\domestic_inflation_2024_11_v2.xlsx

📁 Memproses bulan: 2024_12


  df = pd.read_csv(tokenized_file)


✅ Memuat 1059069 kalimat dari output/2024\tokenized_2024_12.csv
🔍 Filter regex untuk Bahasa Indonesia...


  mask = df['Sentences'].astype(str).str.lower().str.contains(pattern, regex=True)


🔍 Filter hard-rule untuk Bahasa Inggris...
✅ Dapat 28 kalimat Bahasa Inggris
💾 Disimpan hasil gabungan: 218 kalimat → output/2024\filtered_regex_2024_12_v2.xlsx
🇮🇩 Filtering kalimat domestik...


Domestic Filter: 100%|████████████████████████████████████████████████████████████| 218/218 [00:00<00:00, 66219.46it/s]

✅ Filter domestik: dari 218 → 124 kalimat
✅ Disimpan ke: output/2024\domestic_inflation_2024_12_v2.xlsx



