# Semi-supervised Dataset Preparation

- Mục tiêu: tạo bộ dữ liệu giữ cả phần **chưa có nhãn AQI** (aqi_class = NaN) để dùng cho self-training/co-training.
- Đồng thời **giả lập thiếu nhãn trong TRAIN** (time-aware) để mini project có thể thử nhiều mức thiếu nhãn.

In [14]:
# Input parquet file, relative to the project root.
CLEANED_PATH = "data/processed/cleaned.parquet"
# Output parquet file (dataset with masked labels), relative to the project root.
OUTPUT_SEMI_DATASET_PATH = "data/processed/dataset_for_semi.parquet"
# Cutoff date handed to SemiDataConfig; rows dated before it are reported as TRAIN.
CUTOFF = "2017-01-01"
# Value passed as `missing_fraction` to mask_labels_time_aware.
LABEL_MISSING_FRACTION = 0.98
# Backward-compatible alias: the original constant name was misspelled with a
# zero ("MISS0ING"); keep it so existing references continue to resolve.
LABEL_MISS0ING_FRACTION = LABEL_MISSING_FRACTION
# Seed handed to SemiDataConfig as `random_state`.
RANDOM_STATE = 42

In [15]:
import sys
import pandas as pd
from pathlib import Path

# Locate the project root: when the working directory is the `notebooks`
# folder the root is its parent, otherwise it is the working directory itself.
current_dir = Path('.').resolve()
if current_dir.name == 'notebooks':
    PROJECT_ROOT = current_dir.parent
else:
    PROJECT_ROOT = current_dir

# Register the root on sys.path (once) so project packages can be imported.
_root_str = str(PROJECT_ROOT)
if _root_str not in sys.path:
    sys.path.append(_root_str)

from src.semi_supervised_library import SemiDataConfig, mask_labels_time_aware

# Anchor the configured relative paths at the project root.
in_path = PROJECT_ROOT / CLEANED_PATH
out_path = PROJECT_ROOT / OUTPUT_SEMI_DATASET_PATH

# Load the cleaned dataset, failing early with the full path if it is absent.
if in_path.exists():
    df = pd.read_parquet(in_path)
else:
    raise FileNotFoundError(f"File not found: {in_path}")

# Make sure the frame exposes a parsed `datetime` column, deriving it if needed.
req_col = 'datetime'
if req_col not in df.columns:
    # Prefer an existing timestamp column under a known alternative name;
    # the first match in this order wins.
    alias = next(
        (c for c in ['date', 'Date', 'timestamp', 'Time'] if c in df.columns),
        None,
    )
    if alias is not None:
        df.rename(columns={alias: req_col}, inplace=True)
    elif all(k in df.columns for k in ['year', 'month', 'day']):
        # Assemble the timestamp from its component columns. Include `hour`
        # when the frame has it so sub-daily records keep distinct timestamps
        # instead of all collapsing onto midnight of the same day.
        date_parts = ['year', 'month', 'day']
        if 'hour' in df.columns:
            date_parts.append('hour')
        df[req_col] = pd.to_datetime(df[date_parts])
    else:
        raise KeyError(f"Missing time column '{req_col}'")

# Parse the column to datetime (a renamed column may still hold strings).
df[req_col] = pd.to_datetime(df[req_col])

# Build the masking configuration from the notebook-level settings.
cfg = SemiDataConfig(
    cutoff=CUTOFF,
    random_state=int(RANDOM_STATE),
)

# Apply the project's time-aware label masking with the configured fraction.
# NOTE: the constant's name is spelled with a zero ("MISS0ING") where it is defined.
df_processed = mask_labels_time_aware(
    df,
    cfg,
    missing_fraction=float(LABEL_MISS0ING_FRACTION),
)

# Create the destination folder on demand, then write the result without the index.
_out_dir = out_path.parent
_out_dir.mkdir(parents=True, exist_ok=True)
df_processed.to_parquet(out_path, index=False)

# Short run summary: where the file went and how many rows it holds.
print(f"Saved: {out_path}")
print(f"Total records: {len(df_processed)}")

# When the masking step produced an `is_labeled` column, report its mean over
# the rows dated strictly before the cutoff.
if "is_labeled" in df_processed.columns:
    before_cutoff = df_processed[req_col] < pd.to_datetime(CUTOFF)
    train_ratio = df_processed.loc[before_cutoff, 'is_labeled'].mean()
    print(f"Train Label Ratio: {train_ratio:.4f}")

Saved: D:\3\Data_Mining\Mini_Prj\air_guard\data\processed\dataset_for_semi.parquet
Total records: 420768
Train Label Ratio: 0.0196
