In [1]:
# ============================================================
# 1. SETUP: INSTALL & IMPORT LIBRARIES
# ============================================================
# Make sure you have already run: !pip install pandas numpy xgboost scikit-learn joblib
# (pickle is part of the Python standard library, so the pickle-mixin package is not required)

import pandas as pd
import numpy as np
import random 
import pickle
import joblib 

from xgboost import XGBRegressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 


# ============================================================
# 2. CUSTOM PREPROCESSING FUNCTIONS (Necessary for saving/loading)
# ============================================================

# --- Custom Label Encoder ---
class CustomLabelEncoder:
    """Integer-encode categorical values in order of first appearance.

    After fitting, ``classes_`` holds the distinct values seen (in the order
    returned by ``Series.unique``) and each value is encoded as its position
    in that list.  Values that were not seen during fitting are encoded as
    ``0``, i.e. they share the code of the first fitted class.
    """

    def __init__(self):
        # Distinct values seen by fit(), in first-appearance order.
        self.classes_ = []
        # Lookup table: value -> integer code.
        self._mapping = {}

    def fit(self, series):
        """Learn the value -> code table from *series* and return ``self``."""
        labels = series.unique().tolist()
        self.classes_ = labels
        self._mapping = dict(zip(labels, range(len(labels))))
        return self

    def transform(self, series):
        """Return *series* with every value replaced by its integer code."""
        lookup = self._mapping

        def _encode(value):
            # Unknown values fall back to code 0.
            return lookup.get(value, 0)

        return series.apply(_encode)

    def fit_transform(self, series):
        """Fit on *series*, then return its encoded form."""
        return self.fit(series).transform(series)

# --- Custom Standard Scaler ---
class CustomStandardScaler:
    """Z-score standardizer built on the input object's own mean()/std().

    ``fit`` stores whatever ``X.mean()`` and ``X.std()`` return; ``transform``
    subtracts the stored mean and divides by the stored standard deviation
    plus a small constant so that a zero deviation does not divide by zero.
    """

    def __init__(self):
        # Both statistics stay None until fit() is called.
        self.mean_ = None
        self.std_ = None

    def fit(self, X):
        """Record the mean and standard deviation of *X*; return ``self``."""
        self.mean_ = X.mean()
        self.std_ = X.std()
        return self

    def transform(self, X):
        """Return ``(X - mean_) / (std_ + 1e-6)`` using the fitted statistics."""
        epsilon = 1e-6  # keeps the denominator non-zero for constant columns
        centered = X - self.mean_
        scale = self.std_ + epsilon
        return centered / scale

    def fit_transform(self, X):
        """Fit on *X*, then return its standardized form."""
        return self.fit(X).transform(X)

# --- Custom Train-Test Split (Simplified) ---
def custom_train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split features and target into train and test partitions.

    Rows are selected by *position*, so the split is correct even when
    ``X.index`` contains repeated labels (label-based selection would return
    every row sharing a label, duplicating rows and leaking them between the
    two partitions).  The target is kept in its own Series, so a feature
    column that happens to be called ``'__target__'`` is left intact.

    For a given ``random_state`` the shuffle is the same permutation the
    label-based implementation produced, so splits of uniquely indexed data
    are unchanged.

    Args:
        X: pandas DataFrame of features.
        y: pandas Series (or any object exposing ``.values``) holding one
            target value per row of ``X``; it is aligned with ``X`` by
            position, not by index label.
        test_size: fraction of rows placed in the test partition; the row
            count is ``int(len(X) * test_size)``.
        random_state: optional seed.  When given, the global ``random`` and
            ``numpy.random`` generators are seeded with it (kept for
            backward compatibility with code that relies on that side effect).

    Returns:
        ``(X_train, X_test, y_train, y_test)``.  The target Series are named
        ``'__target__'`` and carry the index labels of the matching rows of
        ``X``.

    Raises:
        ValueError: if ``y`` does not have exactly one value per row of ``X``.
    """
    if random_state is not None:
        random.seed(random_state)
        np.random.seed(random_state)

    n_rows = len(X)

    # Positional alignment of the target with X, mirroring the use of y.values.
    target = pd.Series(y.values, index=X.index, name='__target__')

    # Shuffle row positions rather than index labels: positions are always
    # unique, labels are not guaranteed to be.
    positions = list(range(n_rows))
    random.shuffle(positions)

    test_count = int(n_rows * test_size)
    test_positions = positions[:test_count]
    train_positions = positions[test_count:]

    X_train = X.iloc[train_positions]
    X_test = X.iloc[test_positions]
    y_train = target.iloc[train_positions]
    y_test = target.iloc[test_positions]

    return X_train, X_test, y_train, y_test


# ============================================================
# 3. DATA LOADING AND PRE-PROCESSING
# ============================================================
# Load the raw data; malformed lines are skipped instead of raising.
df = pd.read_csv("books.csv", engine="python", on_bad_lines="skip")

# Clean the regression target: coerce to numeric, then keep only rows whose
# rating is present and strictly positive.
df['average_rating'] = pd.to_numeric(df['average_rating'], errors='coerce')
df = df[df['average_rating'].notnull() & (df['average_rating'] > 0.0)].copy()

# Replace missing categorical values with an explicit placeholder.
for col in ('publisher', 'language_code', 'authors'):
    df[col] = df[col].fillna("Unknown")

# Publication year; unparseable dates become NaN and are filled with the
# median year.  Filling with the median leaves the median unchanged, so it
# can be captured before the fill.
df['year'] = pd.to_datetime(df['publication_date'], errors='coerce').dt.year
GLOBAL_YEAR_MEDIAN = df['year'].median()  # kept for the UI notebook
df['year'] = df['year'].fillna(GLOBAL_YEAR_MEDIAN)

# log(1 + x) transform of the two count columns.
df = df.assign(
    log_ratings_count=np.log1p(df['ratings_count']),
    log_reviews_count=np.log1p(df['text_reviews_count']),
)

# Integer-encode the categorical columns, keeping one fitted encoder per
# column so the same mapping can be reused later.
label_cols = ['language_code', 'publisher', 'authors']
encoders = {}

for col in label_cols:
    enc = CustomLabelEncoder()
    encoders[col] = enc
    df[col] = enc.fit_transform(df[col].astype(str).str.strip())

# Model inputs and target.  The leading spaces in "  num_pages" are part of
# the column selector and must stay exactly as written.
features = [
    "  num_pages",
    "language_code",
    "publisher",
    "authors",
    "log_ratings_count",
    "log_reviews_count",
    "year",
]
scaling_cols = ["  num_pages", "log_ratings_count", "log_reviews_count", "year"]

X = df[features]
y = df['average_rating']

X_train, X_test, y_train, y_test = custom_train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler is only fitted (on the training rows) so it can be saved as an
# artifact; the model below is trained on the unscaled features.
scaler = CustomStandardScaler().fit(X_train[scaling_cols])


# ============================================================
# 4. TRAIN MODEL (XGBoost Regressor)
# ============================================================
print("\n--- Starting Model Training ---")

# Hyper-parameters of the gradient-boosted regressor, gathered in one place.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'reg:squarederror',
    "random_state": 42,
}
xgb_reg = XGBRegressor(**xgb_params)

# Fit on the unscaled training partition.
xgb_reg.fit(X_train, y_train)
print("Model training complete.")


# ============================================================
# 5. EVALUATION (Optional, but good practice)
# ============================================================
# Predict on the held-out rows and clamp the predictions to [0.0, 5.0].
preds = np.clip(xgb_reg.predict(X_test), 0.0, 5.0)

# Regression metrics on the test partition.
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

# One call, one line per metric.
print(
    "\n--- Model Evaluation (Test Set) ---",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"R²: {r2:.4f}",
    sep="\n",
)


# ============================================================
# 6. SAVE MODEL AND PREPROCESSING ARTIFACTS (CRITICAL STEP)
# ============================================================
print("\n--- Saving Artifacts ---")

# Trained regressor -> joblib file.
joblib.dump(xgb_reg, 'xgb_regressor_model.joblib')
print("Saved: xgb_regressor_model.joblib")

# Dictionary of fitted label encoders (one per categorical column) -> pickle file.
with open('encoders.pkl', 'wb') as encoder_file:
    pickle.dump(encoders, encoder_file)
print("Saved: encoders.pkl")

# Fitted scaler -> joblib file.
joblib.dump(scaler, 'custom_scaler.joblib')
print("Saved: custom_scaler.joblib")

# NOTE(review): GLOBAL_YEAR_MEDIAN is computed during preprocessing "for the
# UI notebook" but is not written to disk here - confirm how the UI notebook
# obtains it.
print("\nTraining notebook finished. Ready to use the UI notebook.")


--- Starting Model Training ---
Model training complete.

--- Model Evaluation (Test Set) ---
MAE: 0.1991
RMSE: 0.2589
R²: 0.1955

--- Saving Artifacts ---
Saved: xgb_regressor_model.joblib
Saved: encoders.pkl
Saved: custom_scaler.joblib

Training notebook finished. Ready to use the UI notebook.
