In [None]:
# ==============================================================================
# FAST AI MODULE: XGBoost (No Condo Data) + Export .pkl
# ==============================================================================
# pip install imbalanced-learn xgboost joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
import joblib  # สำหรับ Save Model

# ---------------------------------------------------------
# 1. Data Preparation & Feature Engineering
# ---------------------------------------------------------
# Change this to the path of your own data file.
file_path = '/opt/airflow/data/clean_data2.csv'


def comment_length(comments: pd.Series) -> pd.Series:
    """Return the number of characters in each comment.

    Missing comments (None / NaN) count as length 0. Casting the column to
    ``str`` first would turn them into the text ``'nan'`` and report a
    length of 3, which is indistinguishable from a real 3-character comment.

    Args:
        comments: Raw comment column; non-string values are measured by
            their ``str()`` form.

    Returns:
        A Series aligned with ``comments`` holding the character counts.
    """
    return comments.map(lambda text: 0 if pd.isna(text) else len(str(text)))


try:
    df_ml = pd.read_csv(file_path)
except FileNotFoundError:
    print("⚠️ หาไฟล์ไม่เจอ ใช้ Dummy Data แทนเพื่อทดสอบ Code")
    # Synthetic stand-in data (condo columns already removed).
    # NOTE: this fallback only exists so the code can be smoke-tested without
    # the real file; everything trained further down in this script is then
    # fitted on random data.
    # A fixed seed keeps the fallback reproducible, matching the
    # random_state=42 used by the rest of the pipeline.
    rng = np.random.default_rng(42)
    n_rows = 1000
    df_ml = pd.DataFrame({
        'count_reopen': rng.choice([0, 1, 2], n_rows, p=[0.7, 0.2, 0.1]),
        'timestamp': pd.date_range(start='1/1/2023', periods=n_rows),
        'comment': ['ทดสอบรายละเอียดปัญหา'] * n_rows,
        'district': rng.choice(['เขต A', 'เขต B', 'เขต C'], n_rows),
        'subdistrict': rng.choice(['แขวง 1', 'แขวง 2'], n_rows),
        'type 1': rng.choice(['ถนน', 'ทางเท้า', 'ความสะอาด'], n_rows),
        'organization_1': rng.choice(['สนง.เขต', 'สำนักการโยธา'], n_rows),
    })

# Target: 1 = Reopen (count_reopen > 0), 0 = Done
df_ml['is_reopen'] = (df_ml['count_reopen'] > 0).astype(int)

# Time features. Unparseable timestamps become NaT (errors='coerce'), so the
# derived columns are NaN for those rows.
df_ml['timestamp'] = pd.to_datetime(df_ml['timestamp'], errors='coerce')
df_ml['hour'] = df_ml['timestamp'].dt.hour
df_ml['day_of_week'] = df_ml['timestamp'].dt.dayofweek
df_ml['month'] = df_ml['timestamp'].dt.month

# Text feature: length of the complaint text (missing comment -> 0)
df_ml['comment_len'] = comment_length(df_ml['comment'])

# --- [ENCODING] ---
# Each fitted encoder is kept in this dict, keyed by its source column name,
# so it can be saved and reused later.
encoders_dict = {}

# The organization column is looked up as 'organization_1' first, falling
# back to 'organization'.
if 'organization_1' in df_ml.columns:
    org_col = 'organization_1'
else:
    org_col = 'organization'

cols_to_encode = ['district', 'subdistrict', 'type 1']
if org_col in df_ml.columns:
    cols_to_encode += [org_col]

for col in cols_to_encode:
    # Columns absent from the DataFrame are skipped.
    if col not in df_ml.columns:
        continue

    # Missing values become the literal category 'Unknown'; everything is
    # cast to str before encoding.
    cleaned = df_ml[col].fillna('Unknown').astype(str)
    df_ml[col] = cleaned

    # Fit one encoder per column and store the integer codes in '<col>_enc'.
    encoder = LabelEncoder()
    df_ml[f'{col}_enc'] = encoder.fit_transform(cleaned)

    # Keep the fitted encoder for export.
    encoders_dict[col] = encoder

# Feature selection (condo columns already removed)
# Location & type: the encoded district, subdistrict and issue-type columns.
location_and_type = [f'{name}_enc' for name in cols_to_encode[:3]]
# Organization
organization = [f'{org_col}_enc']
# Complexity
complexity = ['comment_len']
# Time
time_parts = ['hour', 'day_of_week', 'month']

features = location_and_type + organization + complexity + time_parts

# Keep only the features that actually exist in the DataFrame (avoids a
# KeyError when an optional column is absent).
available_columns = set(df_ml.columns)
valid_features = [name for name in features if name in available_columns]

# Drop every row that has a missing value in any selected feature.
rows_complete = df_ml[valid_features].notna().all(axis=1)
df_ready = df_ml[rows_complete]
X = df_ready[valid_features]
y = df_ready['is_reopen']

# Train/test split (80:20), stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f" -> Train Size: {X_train.shape[0]} | Test Size: {X_test.shape[0]}")

# ---------------------------------------------------------
# 2. Handling Imbalance with Undersampling
# ---------------------------------------------------------
# Hold out part of the training partition for threshold tuning. Tuning the
# threshold on the test set and then reporting metrics on that same test set
# would make the reported performance optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print("\n2. Applying Undersampling...")
rus = RandomUnderSampler(random_state=42)
# Only the data the model is fitted on is resampled; the validation and test
# sets keep their original class distribution.
X_train_rus, y_train_rus = rus.fit_resample(X_fit, y_fit)

# ---------------------------------------------------------
# 3. Training XGBoost Model
# ---------------------------------------------------------
print("\n3. Training XGBoost Model...")
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)
xgb_model.fit(X_train_rus, y_train_rus)

# ---------------------------------------------------------
# 4. Threshold Optimization
# ---------------------------------------------------------
print("\n4. Optimizing Decision Threshold...")
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_proba)
# precision_recall_curve returns one more precision/recall value than
# thresholds (a final precision=1, recall=0 point with no threshold). Drop it
# so every F1 index maps to an existing threshold.
f1_scores = (
    2 * (precisions[:-1] * recalls[:-1])
    / (precisions[:-1] + recalls[:-1] + 1e-10)  # epsilon avoids 0/0
)
best_idx = np.argmax(f1_scores)
# Stored as a plain Python float so the exported value does not depend on a
# NumPy scalar type.
best_threshold = float(thresholds[best_idx])

print(f" -> Optimal Threshold: {best_threshold:.4f}")

# Positive-class probabilities on the untouched test set, used by the final
# evaluation step.
y_proba = xgb_model.predict_proba(X_test)[:, 1]

# ---------------------------------------------------------
# 5. Final Evaluation & SAVE MODEL
# ---------------------------------------------------------
# Label a row as positive (1) when its probability reaches the threshold.
y_pred_final = np.where(y_proba >= best_threshold, 1, 0)

banner = "=" * 40
print("\n" + banner)
print("       FINAL MODEL PERFORMANCE")
print(banner)
report = classification_report(y_test, y_pred_final)
print(report)

# --- [SAVE MODEL] ---
print("\nSaving Model to 'traffy_model.pkl'...")

# One bundle holding the model together with the encoders, threshold,
# feature list and organization column name produced earlier in the script.
model_package = dict(
    model=xgb_model,
    encoders=encoders_dict,
    threshold=best_threshold,
    features=valid_features,
    org_col_name=org_col,
)

joblib.dump(model_package, 'traffy_model.pkl')
print("✅ Saved successfully! File: 'traffy_model.pkl'")

 -> Train Size: 209068 | Test Size: 52268

2. Applying Undersampling...

3. Training XGBoost Model...

4. Optimizing Decision Threshold...
 -> Optimal Threshold: 0.5583

       FINAL MODEL PERFORMANCE
              precision    recall  f1-score   support

           0       0.94      0.79      0.86     48679
           1       0.11      0.34      0.17      3589

    accuracy                           0.76     52268
   macro avg       0.53      0.57      0.51     52268
weighted avg       0.89      0.76      0.81     52268


Saving Model to 'traffy_model.pkl'...
✅ Saved successfully! File: 'traffy_model.pkl'
