<a href="https://colab.research.google.com/github/j1berlaka/OTP-Based-Wireless-Locking-System/blob/main/satoru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install only necessary packages
!pip install catboost xgboost scikit-learn tensorflow scikit-optimize bayesian-optimization lightgbm --quiet

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from catboost import CatBoostRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor # Added LightGBM import
from scipy.optimize import differential_evolution # Not used in final proposed code, but kept if you plan to use it for other parts
from skopt import BayesSearchCV
from bayes_opt import BayesianOptimization
from sklearn.multioutput import MultiOutputRegressor # Needed for consistent multi-output models
from sklearn.metrics import mean_absolute_percentage_error as mape_scorer # For evaluation

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def load_and_validate_data(train_path='train.csv', test_path='test.csv'):
    """Load the train/test CSVs, validate their schema and build a train/validation split.

    Parameters
    ----------
    train_path : str
        CSV holding the 5 ``Component{i}_fraction`` columns, the 50
        ``Component{i}_Property{j}`` columns and the 10 ``BlendProperty{k}`` targets.
    test_path : str
        CSV holding the same feature columns and, optionally, an ``ID`` column.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val, test_data_for_pred, test_ids)`` where the
        first four items are an 80/20 split of the training data, ``test_data_for_pred``
        is the test frame without ``ID`` and ``test_ids`` is the ``ID`` column
        (sequential 1..n when the file does not provide one).

    Raises
    ------
    AssertionError
        If a required column is missing or a required column is not numeric.
        Raised explicitly (not via ``assert``) so the checks survive ``python -O``.
    """

    # Load data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # ==== VALIDATION CHECKS ====
    # 1. Column structure
    required_fraction_cols = [f'Component{i}_fraction' for i in range(1, 6)]
    required_property_cols = [f'Component{i}_Property{j}' for i in range(1, 6) for j in range(1, 11)]
    required_target_cols = [f'BlendProperty{i}' for i in range(1, 11)]
    required_feature_cols = required_fraction_cols + required_property_cols

    missing_train = set(required_feature_cols + required_target_cols) - set(train_data.columns)
    missing_test = set(required_feature_cols) - set(test_data.columns)

    if missing_train:
        raise AssertionError(f"Missing in train: {missing_train}")
    if missing_test:
        raise AssertionError(f"Missing in test: {missing_test}")

    # 2. Data types: every required column must be numeric. Targets are only
    #    checked in the test frame if they happen to be present there.
    for col in required_feature_cols + required_target_cols:
        if not pd.api.types.is_numeric_dtype(train_data[col]):
            raise AssertionError(f"Non-numeric column detected in train: {col}")
        if col in test_data.columns and not pd.api.types.is_numeric_dtype(test_data[col]):
            raise AssertionError(f"Non-numeric column detected in test: {col}")

    # 3. Test ID handling: the IDs travel separately from the features.
    if 'ID' not in test_data.columns:
        # If 'ID' column is not in test.csv, create sequential IDs as a fallback
        test_data['ID'] = range(1, len(test_data) + 1)
        print("Warning: 'ID' column not found in test.csv. Generating sequential IDs.")

    test_ids = test_data['ID']
    test_data_for_pred = test_data.drop(columns=['ID'])  # Keep ID out of features for prediction

    # Prepare splits. 'ID' is removed from the training features as well, so the
    # train and test feature sets agree (otherwise a model would be fitted on an
    # identifier that the test features never carry).
    non_feature_cols = required_target_cols + (['ID'] if 'ID' in train_data.columns else [])
    X = train_data.drop(columns=non_feature_cols)
    y = train_data[required_target_cols]

    # Stratify on quantile bins of BlendProperty3 so both splits cover its range.
    # The broad except is deliberate: any binning failure degrades to a plain split.
    try:
        y_stratify = pd.qcut(y['BlendProperty3'], q=5, labels=False, duplicates='drop')
    except Exception as e:
        print(f"Warning: Stratified split failed for BlendProperty3 ({e}). Falling back to simple split.")
        y_stratify = None

    try:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y_stratify
        )
    except ValueError as e:
        if y_stratify is None:
            raise  # Not a stratification problem; nothing to fall back to
        # Binning succeeded but the bins cannot be stratified (e.g. a bin with a
        # single member or NaN labels): honour the documented fallback.
        print(f"Warning: Stratified split failed for BlendProperty3 ({e}). Falling back to simple split.")
        X_train, X_val, y_train, y_val = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42
        )

    return X_train, X_val, y_train, y_val, test_data_for_pred, test_ids

# Run the loader with its default paths and report the size of each split
X_train, X_val, y_train, y_val, X_test_raw, test_ids = load_and_validate_data()
print(
    "✅ Data loaded & validated. Shapes:",
    f"Train: {X_train.shape}, Val: {X_val.shape}, Test raw: {X_test_raw.shape}",
    sep="\n",
)

✅ Data loaded & validated. Shapes:
Train: (1600, 55), Val: (400, 55), Test raw: (500, 55)


In [None]:
# CELL 3: ERROR-PROOF FEATURE ENGINEERING AND SCALING
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

def create_aligned_features(df, ref_cols=None):
    """Return a copy of ``df`` extended with engineered blend features.

    Columns are appended in a fixed order: power transforms of the fractions of
    components 1, 3 and 5; four fraction x Property5 "synergy" products; the
    degree-2 interaction-only polynomial expansion of selected columns (prefixed
    ``poly_inter_``; the expansion also repeats the selected columns themselves);
    and one fraction-weighted property sum per property index 1..10.

    When ``ref_cols`` is given, the result is forced to exactly those columns in
    that order: columns absent from the frame are created as 0.0 and columns not
    listed are discarded. Remaining NaNs are replaced by 0 before returning.
    """

    # Work on a private copy so the caller's frame is never modified
    out = df.copy()

    # 1. Power transforms of selected fractions. Exact zeros are swapped for a
    #    tiny epsilon before the square root is taken.
    for comp in (1, 3, 5):
        fraction = out[f'Component{comp}_fraction']
        out[f'comp{comp}_frac_cubed'] = fraction ** 3
        out[f'comp{comp}_frac_sqrt'] = np.sqrt(fraction.replace(0, 1e-9))

    # 2. Synergy terms: fraction of one component times Property5 of another
    for left, right in ((1, 3), (1, 5), (3, 5), (2, 4)):
        out[f'synergy_{left}_{right}'] = (
            out[f'Component{left}_fraction'] * out[f'Component{right}_Property5']
        )

    # 3. Pairwise interactions over Property1-3 of every component plus all
    #    fractions, restricted to the columns this frame actually has
    candidates = [f'Component{c}_Property{p}' for c in range(1, 6) for p in range(1, 4)]
    candidates += [f'Component{c}_fraction' for c in range(1, 6)]
    present = [name for name in candidates if name in out.columns]

    if len(present) <= 1:
        print("Warning: Not enough columns for polynomial interaction features.")
    else:
        expander = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
        expanded = expander.fit_transform(out[present])
        # Prefix the generated names so they cannot collide with existing columns
        labels = [f'poly_inter_{name}' for name in expander.get_feature_names_out(present)]
        interactions = pd.DataFrame(expanded, columns=labels, index=out.index)
        out = pd.concat([out, interactions], axis=1)

    # 4. Fraction-weighted sum of each property index across the five components
    #    (components lacking either column contribute nothing)
    for prop in range(1, 11):
        contributions = [
            out[f'Component{comp}_fraction'] * out[f'Component{comp}_Property{prop}']
            for comp in range(1, 6)
            if f'Component{comp}_Property{prop}' in out.columns
            and f'Component{comp}_fraction' in out.columns
        ]
        out[f'weighted_avg_BP{prop}'] = sum(contributions)

    # Align to the reference layout: create what is missing, then select exactly
    # the reference columns (which also discards anything extra)
    if ref_cols is not None:
        for name in ref_cols:
            if name not in out.columns:
                out[name] = 0.0
        out = out[ref_cols]

    # Replace any NaNs produced above (e.g. square root of a negative value)
    return out.fillna(0)

# The training split defines the reference feature layout
X_train = create_aligned_features(X_train)
feature_order = list(X_train.columns)

# Validation and test frames are forced into that exact layout
X_val = create_aligned_features(X_val, ref_cols=feature_order)
X_test = create_aligned_features(X_test_raw, ref_cols=feature_order)

print(f"✅ Generated {len(feature_order)} aligned features")
print(f"Example features: {feature_order[:5]}...")
print(f"Train FE shape: {X_train.shape}, Val FE shape: {X_val.shape}, Test FE shape: {X_test.shape}")

# Scaling happens after feature engineering; both scalers learn their
# statistics from the training split only and are then applied everywhere
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_val, X_test)
)
y_train_scaled, y_val_scaled = (
    y_scaler.transform(part) for part in (y_train, y_val)
)

print("✅ Data scaled successfully.")
print(f"Train scaled shape: {X_train_scaled.shape}, Val scaled shape: {X_val_scaled.shape}, Test scaled shape: {X_test_scaled.shape}")

✅ Generated 285 aligned features
Example features: ['Component1_fraction', 'Component2_fraction', 'Component3_fraction', 'Component4_fraction', 'Component5_fraction']...
Train FE shape: (1600, 285), Val FE shape: (400, 285), Test FE shape: (500, 285)
✅ Data scaled successfully.
Train scaled shape: (1600, 285), Val scaled shape: (400, 285), Test scaled shape: (500, 285)


In [None]:
# CELL 4: NEURAL NETWORK MODEL TRAINING
def create_nn_model(input_shape, output_dim):
    """Build and compile the fully connected regression network.

    ``input_shape`` is the number of input features and ``output_dim`` the number
    of regression targets. The network stacks ReLU layers of width 256, 128 and
    64 (He-normal initialised, dropout 0.2 after the first two) followed by a
    linear output layer. It is compiled with Adam (learning rate 0.001), Huber
    loss and a mean-absolute-percentage-error metric.
    """
    net = models.Sequential()
    net.add(layers.Input(shape=(input_shape,)))

    # (units, dropout rate) for each hidden stage; None means no dropout after it
    hidden_stages = ((256, 0.2), (128, 0.2), (64, None))
    for units, drop_rate in hidden_stages:
        net.add(layers.Dense(units, activation='relu', kernel_initializer='he_normal'))
        if drop_rate is not None:
            net.add(layers.Dropout(drop_rate))

    # No activation on the output layer: plain linear regression head
    net.add(layers.Dense(output_dim))

    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.Huber(),
        metrics=[tf.keras.metrics.MeanAbsolutePercentageError()],
    )
    return net

# Network dimensions are taken from the scaled training arrays:
# one input per engineered feature, one output per target column.
input_dim = X_train_scaled.shape[1]
output_dim = y_train_scaled.shape[1] # Should be 10 for 10 BlendProperties

# Build the network and print its layer summary
nn_model = create_nn_model(input_dim, output_dim)
nn_model.summary()

# Both callbacks watch the validation loss:
# - EarlyStopping halts after 50 epochs without improvement and restores the best weights
# - ReduceLROnPlateau halves the learning rate after 20 stagnant epochs (floor 1e-7)
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss', patience=50, restore_best_weights=True
)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=20, min_lr=1e-7
)

print("\n🚀 Training Neural Network...")
# Train on the scaled features AND scaled targets; the validation split drives the callbacks
history = nn_model.fit(
    X_train_scaled, y_train_scaled,
    epochs=500, # Upper bound only; early stopping normally ends training sooner
    batch_size=32,
    validation_data=(X_val_scaled, y_val_scaled),
    callbacks=[early_stopping, reduce_lr],
    verbose=0 # Set to 1 or 2 for progress updates
)
print("✅ Neural Network training complete.")

# Evaluate on the validation set in the ORIGINAL target units:
# predictions come out in scaled space and are mapped back with y_scaler.
nn_val_preds_scaled = nn_model.predict(X_val_scaled)
nn_val_preds = y_scaler.inverse_transform(nn_val_preds_scaled)
# Per-target MAPE, averaged across targets. NOTE: sklearn's MAPE is a ratio,
# not a percentage (1.0 corresponds to 100%).
val_mape_nn = np.mean([mape_scorer(y_val.iloc[:, i], nn_val_preds[:, i]) for i in range(output_dim)])
print(f"Neural Network Validation MAPE: {val_mape_nn:.4f}")

# Test-set predictions are intentionally not produced in this cell;
# per the original notes they belong to the final submission cell.


🚀 Training Neural Network...
✅ Neural Network training complete.
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Neural Network Validation MAPE: 1.8181


In [None]:
# CELL 5: STACKED ENSEMBLE (XGBoost + CatBoost + LightGBM)

# 1. XGBoost: one regressor per blend property, each tuned independently with
#    BayesSearchCV (no MultiOutputRegressor is involved). Tree models are fed the
#    unscaled engineered features (X_train / X_val).

# Search space shared by every target. It does not depend on the loop variable,
# so it is defined once instead of being rebuilt for each target.
xgb_params = {
    'n_estimators': (500, 1500),
    'max_depth': (4, 9),
    'learning_rate': (0.005, 0.05),
    'subsample': (0.7, 1.0),
    'colsample_bytree': (0.7, 1.0)
}

xgb_models = []
print("\n🚀 Training 10 XGBoost Models...")
for i in range(output_dim): # Loop through each BlendProperty
    print(f"  - Optimizing and training XGBoost for BlendProperty{i+1}")

    xgb_opt = BayesSearchCV(
        xgb.XGBRegressor(random_state=42, tree_method='hist', # 'gpu_hist' if GPU is enabled and installed
                         n_jobs=-1, # Use all available cores
                         eval_metric='mape', # Metric reported on eval_set and used for early stopping
                         early_stopping_rounds=50 # Early stopping for individual models
                        ),
        xgb_params,
        n_iter=30, # Lower for faster runs, raise for a more thorough search
        cv=3,
        scoring='neg_mean_absolute_percentage_error', # The search maximizes, so use negative MAPE
        random_state=42,
        verbose=0 # Set to 1 or 2 for progress
    )

    # Extra keyword arguments are forwarded to XGBRegressor.fit for every CV fit
    # and for the final refit:
    # - eval_set supplies the data that early stopping monitors. NOTE(review): this
    #   is the hold-out validation split, so it also influences the tuning.
    # - verbose=False silences XGBoost's per-round metric lines, which otherwise
    #   flood the cell output (one line per boosting round of every fit).
    xgb_opt.fit(X_train, y_train.iloc[:,i],
                eval_set=[(X_val, y_val.iloc[:,i])],
                verbose=False,
                )
    xgb_models.append(xgb_opt.best_estimator_)
    print(f"    Best params for BP{i+1}: {xgb_opt.best_params_}")
    # best_score_ is the mean cross-validated score on the training folds,
    # not a score on the validation split, so it is labelled as CV MAPE.
    print(f"    Best CV MAPE for BP{i+1}: {-xgb_opt.best_score_:.4f}")

print("✅ XGBoost models trained.")
# NEW CELL: Continue Base Model Training (CatBoost and LightGBM - CPU fallback for CatBoost)

# Guard for running this cell without the NN cell having defined output_dim.
# At the top level of a cell/module, locals() is the global namespace.
if 'output_dim' not in locals():
    # Derive the number of targets from y_train (one column per blend property)
    output_dim = y_train.shape[1]
    print(f"Defined output_dim: {output_dim}")

# 2. CatBoost: one regressor per blend property with fixed (untuned) hyperparameters,
#    trained on the unscaled engineered features.
cat_models = []
print("\n🚀 Training 10 CatBoost Models (CPU fallback for stability)...")
for i in range(output_dim):
    print(f"  - Training CatBoost for BlendProperty{i+1}")
    model = CatBoostRegressor(
        iterations=2000,
        depth=8,
        learning_rate=0.025,
        l2_leaf_reg=5,
        grow_policy='Lossguide',
        bootstrap_type='Bayesian',
        random_seed=42,
        verbose=100, # Log progress every 100 iterations
        task_type='CPU',
        early_stopping_rounds=100,
        eval_metric='MAPE',
    )
    # The hold-out validation split is the eval set monitored for early stopping
    model.fit(X_train, y_train.iloc[:,i], eval_set=(X_val, y_val.iloc[:,i]))
    cat_models.append(model)
print("✅ CatBoost models trained.")


# 3. LightGBM: one regressor per blend property with fixed hyperparameters,
#    trained on the unscaled engineered features.

# log_evaluation is a module-level callback factory of the lightgbm package.
# It is not an attribute of LGBMRegressor, so calling it on the estimator
# instance raises AttributeError. Import it explicitly for this cell.
from lightgbm import log_evaluation

lgbm_models = []
print("\n🚀 Training 10 LightGBM Models...")
for i in range(output_dim):
    print(f"  - Training LightGBM for BlendProperty{i+1}")
    lgbm = LGBMRegressor(
        num_leaves=64,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=0.1,
        n_estimators=1000,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1,
        metric='mape',
        early_stopping_round=50
    )
    # The validation split is the eval set monitored for early stopping;
    # period=0 disables the per-iteration evaluation log.
    lgbm.fit(X_train, y_train.iloc[:,i],
             eval_set=[(X_val, y_val.iloc[:,i])],
             callbacks=[log_evaluation(period=0)])
    lgbm_models.append(lgbm)
print("✅ LightGBM models trained.")

print("\n✅ All base models (XGBoost, CatBoost, LightGBM) from your original Cell 5 are now trained.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[3]	validation_0-mape:0.99543
[4]	validation_0-mape:0.98805
[5]	validation_0-mape:0.97297
[6]	validation_0-mape:0.97721
[7]	validation_0-mape:0.97933
[8]	validation_0-mape:0.98205
[9]	validation_0-mape:0.95177
[10]	validation_0-mape:0.96922
[11]	validation_0-mape:0.91594
[12]	validation_0-mape:0.94590
[13]	validation_0-mape:0.96894
[14]	validation_0-mape:0.94239
[15]	validation_0-mape:0.96418
[16]	validation_0-mape:0.94199
[17]	validation_0-mape:0.88537
[18]	validation_0-mape:0.90118
[19]	validation_0-mape:0.93388
[20]	validation_0-mape:0.87589
[21]	validation_0-mape:0.87854
[22]	validation_0-mape:0.86731
[23]	validation_0-mape:0.89264
[24]	validation_0-mape:0.92588
[25]	validation_0-mape:0.92246
[26]	validation_0-mape:0.94531
[27]	validation_0-mape:0.92121
[28]	validation_0-mape:0.95371
[29]	validation_0-mape:0.93792
[30]	validation_0-mape:0.97604
[31]	validation_0-mape:0.96078
[32]	validation_0-mape:0.94310
[33]	validat