In [1]:
import numpy as np
import pandas as pd
import os
import glob
import re

# === CONFIGURATION ===
# All paths are resolved relative to the parent of the working directory.
BASE_DIR = '..'
RAW_DIR = os.path.join(BASE_DIR, 'raw', 'berlin')
PROCESSED_DIR = os.path.join(BASE_DIR, 'processed', 'berlin')
REPORTS_DIR = os.path.join(BASE_DIR, 'reports')

# Berlin bounding box (decimal degrees): latitude range, then longitude range.
CITY_LAT_MIN = 52.30
CITY_LAT_MAX = 52.70
CITY_LON_MIN = 13.00
CITY_LON_MAX = 13.80

# Make sure the output locations exist; already-existing directories are fine.
os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Standard price-cleaning helper
def clean_price(price_value):
    """Convert a raw price value to a float.

    Args:
        price_value: A missing value, a number, or any object whose string
            form contains a price such as ``"$1,234.00"``.

    Returns:
        The price as a float. ``np.nan`` is returned when the value is
        missing, when its text contains no run of digits/commas/dots, or
        when that run is not a valid number (e.g. ``"1.2.3"``).
    """
    if pd.isna(price_value):
        return np.nan
    if isinstance(price_value, (int, float)):
        return float(price_value)

    text = str(price_value).strip()
    # Take the first run of digits, commas and dots (optionally signed) once
    # the dollar signs have been removed.
    match = re.search(r"[-+]?[0-9,.]+", text.replace('$', ''))
    if not match:
        return np.nan

    # Commas are treated as thousands separators and dropped.
    num = match.group(0).replace(',', '')
    try:
        return float(num)
    except ValueError:
        # The pattern also matches non-numbers such as "..." or "1.2.3".
        # Only ValueError is caught so unrelated errors are not hidden.
        return np.nan

# === PROCESSING PIPELINE ===
# For every snapshot folder under RAW_DIR: load the four raw files, derive
# numeric prices, flag suspicious listings, drop duplicate listing ids, and
# write the processed CSVs plus one QA summary row per rule.
#
# glob returns matches in arbitrary order, so the folders are sorted to keep
# the processing order and the QA report row order reproducible.
snapshot_folders = sorted(
    f for f in glob.glob(os.path.join(RAW_DIR, '*')) if os.path.isdir(f)
)
qa_summary_list = []

print(f"--- BẮT ĐẦU XỬ LÝ {len(snapshot_folders)} SNAPSHOTS BERLIN ---")

for folder_path in snapshot_folders:
    snapshot_name = os.path.basename(folder_path)
    print(f"\n>> Đang xử lý: {snapshot_name}")

    try:
        listings_df = pd.read_csv(os.path.join(folder_path, 'listings.csv.gz'), low_memory=False)
        calendar_df = pd.read_csv(os.path.join(folder_path, 'calendar.csv.gz'), low_memory=False)
        reviews_df = pd.read_csv(os.path.join(folder_path, 'reviews.csv.gz'), low_memory=False)
        neigh_df = pd.read_csv(os.path.join(folder_path, 'neighbourhoods.csv'))
    except FileNotFoundError as err:
        # An incomplete snapshot is still skipped, but no longer silently:
        # otherwise a missing file is indistinguishable from a clean run.
        print(f"   !! Bỏ qua {snapshot_name}: thiếu tệp dữ liệu ({err})")
        continue

    # [QA1] Parse the raw price text into a numeric column.
    listings_df['price_numeric'] = listings_df['price'].apply(clean_price)
    calendar_df['price_numeric'] = calendar_df['price'].apply(clean_price)

    # [QA2] Flag listings whose price is missing, zero or negative
    # (missing prices are counted as zero by the fillna).
    listings_df['qa_flag_price_zero'] = listings_df['price_numeric'].fillna(0) <= 0

    # [QA3] Coerce dates and coordinates; unparseable values become NaT/NaN.
    listings_df['host_since'] = pd.to_datetime(listings_df['host_since'], errors='coerce')
    calendar_df['date'] = pd.to_datetime(calendar_df['date'], errors='coerce')
    reviews_df['date'] = pd.to_datetime(reviews_df['date'], errors='coerce')

    listings_df['latitude'] = pd.to_numeric(listings_df['latitude'], errors='coerce')
    listings_df['longitude'] = pd.to_numeric(listings_df['longitude'], errors='coerce')

    # [QA4] Flag coordinates outside the city bounding box. `between` is
    # inclusive on both ends and False for NaN, so listings with missing or
    # unparseable coordinates are flagged too (consistent with the price rule,
    # which also flags missing values) instead of silently passing.
    lat_in_city = listings_df['latitude'].between(CITY_LAT_MIN, CITY_LAT_MAX)
    lon_in_city = listings_df['longitude'].between(CITY_LON_MIN, CITY_LON_MAX)
    listings_df['qa_flag_out_of_city'] = ~(lat_in_city & lon_in_city)

    # [QA5] Drop repeated listing ids, keeping the first occurrence.
    dups = listings_df.duplicated(subset=['id']).sum()
    if dups > 0:
        listings_df = listings_df.drop_duplicates(subset=['id'], keep='first')

    # The flag counts are taken after de-duplication so that they match the
    # rows actually written to listings_processed.csv.
    qa_summary_list.extend([
        {
            'snapshot_date': snapshot_name, 'rule_id': 'QA001_price_zero',
            'records_affected': int(listings_df['qa_flag_price_zero'].sum()),
            'handling_decision': 'Gắn cờ',
        },
        {
            'snapshot_date': snapshot_name, 'rule_id': 'QA002_coords_out_of_bounds',
            'records_affected': int(listings_df['qa_flag_out_of_city'].sum()),
            'handling_decision': 'Gắn cờ',
        },
        {
            'snapshot_date': snapshot_name, 'rule_id': 'QA003_duplicate_ids',
            'records_affected': int(dups),
            'handling_decision': 'Xoá dòng trùng',
        },
    ])

    # Remove columns that are not carried into the processed output.
    if 'adjusted_price' in calendar_df.columns:
        calendar_df.drop(columns=['adjusted_price'], inplace=True)
    if 'neighbourhood_group' in neigh_df.columns:
        neigh_df.drop(columns=['neighbourhood_group'], inplace=True)

    out_dir = os.path.join(PROCESSED_DIR, snapshot_name)
    os.makedirs(out_dir, exist_ok=True)

    listings_df.to_csv(os.path.join(out_dir, 'listings_processed.csv'), index=False)
    calendar_df.to_csv(os.path.join(out_dir, 'calendar_processed.csv'), index=False)
    reviews_df.to_csv(os.path.join(out_dir, 'reviews_processed.csv'), index=False)
    neigh_df.to_csv(os.path.join(out_dir, 'neighbourhoods_processed.csv'), index=False)

    print(f"   -> Đã lưu xong: {out_dir}")

# Explicit columns keep the report header intact even when no snapshot was
# processed and the list is empty.
pd.DataFrame(
    qa_summary_list,
    columns=['snapshot_date', 'rule_id', 'records_affected', 'handling_decision'],
).to_csv(os.path.join(REPORTS_DIR, 'qa_summary_berlin.csv'), index=False)
print("\n--- HOÀN TẤT ---")

--- BẮT ĐẦU XỬ LÝ 4 SNAPSHOTS BERLIN ---

>> Đang xử lý: 15 March, 2025
   -> Đã lưu xong: ..\processed\berlin\15 March, 2025

>> Đang xử lý: 20 June, 2025
   -> Đã lưu xong: ..\processed\berlin\20 June, 2025

>> Đang xử lý: 21 December, 2024
   -> Đã lưu xong: ..\processed\berlin\21 December, 2024

>> Đang xử lý: 23 September, 2025
   -> Đã lưu xong: ..\processed\berlin\23 September, 2025

--- HOÀN TẤT ---
