# Semiconductor Yield Prediction - Modeling

This notebook covers:
1. Setup & Data Loading (from preprocessed data)
2. Data Preparation (Train/Valid/Test split, Oversampling)
3. AutoML with PyCaret
4. Bayesian Optimization
5. Model Comparison & Final Evaluation
6. Final Prediction

**Prerequisites:** Run `01_eda_and_preprocessing.ipynb` first; this notebook loads its output from `../03_Results/preprocessed_data.pkl`.

## 1. Setup & Data Loading

In [None]:
# Import utilities
from utils import *

# Additional imports for modeling
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

from bayes_opt import BayesianOptimization

import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed artifacts expected at ../03_Results/preprocessed_data.pkl.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a file produced by your own preprocessing run.
try:
    with open('../03_Results/preprocessed_data.pkl', 'rb') as f:
        data = pickle.load(f)
except FileNotFoundError:
    print("ERROR: Preprocessed data not found!")
    print("Please run 01_eda_and_preprocessing.ipynb first.")
    # Re-raise so "Run All" stops here: every later cell needs the variables
    # below, and continuing would only surface as unrelated NameErrors.
    raise

# Unpack the entries this notebook reads from the pickled dict.
X_train_raw = data['X_train']
y_train_raw = data['y_train']
X_predict = data['X_predict']
col_X = data['col_X']
col_selected = data['col_selected']

print("Loaded preprocessed data successfully!")
print(f"X_train: {X_train_raw.shape}")
print(f"y_train: {y_train_raw.shape}")
print(f"X_predict: {X_predict.shape}")

## 2. Data Preparation

In [None]:
# Both splits use the same hold-out fraction and seed.
split_options = dict(test_size=0.2, random_state=1)

# First cut: reserve 20% of the labelled data as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_raw, y_train_raw, **split_options
)

# Second cut: reserve 20% of the remaining rows (16% of the total) for validation.
X_train_, X_valid, y_train_, y_valid = train_test_split(
    X_train, y_train, **split_options
)

# Report the size of each partition.
print(f"Train: {X_train_.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

In [None]:
# Attach the target to the training features so rows can be bucketed by 'y'.
os_df = X_train_.join(y_train_, how='left')

# Bucket boundaries for the target: A is below the lower cut, C is above the
# upper cut, and B is the closed interval in between.
y_low_cut, y_high_cut = 1242, 1283

is_low = os_df['y'] < y_low_cut
is_mid = (os_df['y'] >= y_low_cut) & (os_df['y'] <= y_high_cut)
is_high = os_df['y'] > y_high_cut

# The masks are mutually exclusive, so each row receives at most one label.
os_df.loc[is_low, 'y_cate'] = 'A'
os_df.loc[is_mid, 'y_cate'] = 'B'
os_df.loc[is_high, 'y_cate'] = 'C'

print("Target distribution before oversampling:")
print(os_df['y_cate'].value_counts())

In [None]:
# Log-transform the target; the models below are trained and scored on this scale.
os_df['log_y'] = np.log1p(os_df['y'])
y_test_log, y_valid_log = (np.log1p(target) for target in (y_test, y_valid))

# Balance the y_cate buckets by random duplication. 'y' and 'log_y' stay in the
# resampled frame, so every duplicated row carries its own target values.
ros = RandomOverSampler(random_state=1)
resample_inputs = os_df.drop(columns='y_cate')
os_df_resampled, os_target = ros.fit_resample(resample_inputs, os_df['y_cate'])

print(f"\nAfter oversampling: {os_df_resampled.shape}")
print("Target distribution after oversampling:")
print(os_target.value_counts())

In [None]:
# Everything except the two target columns is a model input.
target_columns = {'y', 'log_y'}
feature_cols = [col for col in os_df_resampled.columns if col not in target_columns]

# Features and log-scale target taken from the oversampled frame.
X_train_final = os_df_resampled.loc[:, feature_cols]
y_train_final = os_df_resampled['log_y']  # Use log-transformed target

print(f"Final X_train: {X_train_final.shape}")
print(f"Final y_train: {y_train_final.shape}")

## 3. AutoML with PyCaret

In [None]:
# PyCaret is an optional dependency: the AutoML cells below are guarded by
# PYCARET_AVAILABLE and are skipped when the import fails.
PYCARET_AVAILABLE = True
try:
    from pycaret.regression import (
        setup,
        compare_models,
        tune_model,
        evaluate_model,
        create_model,
        predict_model,
    )
except ImportError:
    PYCARET_AVAILABLE = False
    print("PyCaret not installed. Skipping AutoML section.")
    print("Install with: pip install pycaret")

In [None]:
if PYCARET_AVAILABLE:
    # Modelling frame for PyCaret: the feature columns plus the log-scale target.
    pycaret_df = os_df_resampled.loc[:, [*feature_cols, 'log_y']].copy()

    # NOTE(review): pycaret_df comes from the oversampled frame, so PyCaret's
    # internal train/hold-out split and CV folds presumably see duplicated rows
    # on both sides -- treat its scores as optimistic; confirm.
    setup_options = dict(
        data=pycaret_df,
        target='log_y',
        normalize=True,
        train_size=0.8,
        fold=5,
        session_id=123,
        verbose=False,
    )
    reg = setup(**setup_options)
    print("PyCaret setup complete!")

In [None]:
if PYCARET_AVAILABLE:
    # Rank the candidate regressors by RMSE and keep the five best.
    print("Comparing models (this may take several minutes)...")
    top_models = compare_models(sort='RMSE', n_select=5)

    # With n_select > 1, compare_models returns a list ordered by the sort
    # metric. `best` must be a single estimator: it is reported here and later
    # passed to tune_model, which does not accept a list.
    best = top_models[0] if isinstance(top_models, list) else top_models
    print(f"\nBest model: {type(best).__name__}")

In [None]:
if PYCARET_AVAILABLE:
    # Build each named PyCaret estimator; a failure for one estimator is
    # reported and does not stop the others.
    print("Creating individual models...")

    model_names = ['et', 'rf', 'catboost', 'lightgbm', 'gbr']
    models = {}

    for name in model_names:
        try:
            created = create_model(name, cross_validation=True, verbose=False)
        except Exception as e:
            print(f"  - {name}: Failed ({e})")
        else:
            models[name] = created
            print(f"  - {name}: Created")

In [None]:
if PYCARET_AVAILABLE:
    # Tune best model
    print("Tuning best model...")
    # `best` may be a list (compare_models returns one when n_select > 1), but
    # tune_model expects a single estimator, so tune the top-ranked entry.
    model_to_tune = best[0] if isinstance(best, list) else best
    best_tuned = tune_model(model_to_tune, verbose=False)
    print("Tuning complete!")

## 4. Bayesian Optimization

### 4.1 Ridge Regression Optimization

In [None]:
# Ridge is scale-sensitive: learn the scaling from the training features only,
# then apply the same fitted scaler to every split.
scaler = StandardScaler().fit(X_train_final)

X_train_scaled = scaler.transform(X_train_final)
X_valid_scaled = scaler.transform(X_valid[feature_cols])
X_test_scaled = scaler.transform(X_test[feature_cols])

In [None]:
def evaluate_ridge(alpha):
    """Objective function for Ridge optimization.

    Fits a Ridge model with the given ``alpha`` on the scaled training data and
    scores it on the held-out validation split. Scoring on the training data
    itself cannot reward regularization (training R^2 only gets worse as alpha
    grows), so the optimizer would always drift to the smallest alpha allowed.

    Args:
        alpha: Regularization strength proposed by the optimizer.

    Returns:
        R^2 of the fitted model on (X_valid_scaled, y_valid_log); higher is better.
    """
    model = Ridge(alpha=alpha)
    model.fit(X_train_scaled, y_train_final)
    # Both validation inputs are on the same scales as the training data:
    # features via the fitted scaler, target via log1p.
    return model.score(X_valid_scaled, y_valid_log)

# Search alpha with Bayesian optimization, maximizing evaluate_ridge.
print("Optimizing Ridge Regression...")

ridge_bounds = {'alpha': (0.01, 10)}
ridge_optimizer = BayesianOptimization(
    f=evaluate_ridge, pbounds=ridge_bounds, random_state=42, verbose=0
)

# 5 random starting points, then 20 guided iterations.
ridge_optimizer.maximize(init_points=5, n_iter=20)

# Report the best alpha found and the objective value it achieved.
best_alpha = ridge_optimizer.max['params']['alpha']
print(f"\nBest alpha: {best_alpha:.4f}")
print(f"Best score: {ridge_optimizer.max['target']:.4f}")

In [None]:
# Refit Ridge on the scaled training data with the selected alpha.
ridge_model = Ridge(alpha=best_alpha).fit(X_train_scaled, y_train_final)

# Score on the validation split; predictions and targets are both on the log scale.
ridge_pred_valid = ridge_model.predict(X_valid_scaled)
ridge_rmse_valid = rmse(y_valid_log, ridge_pred_valid)

print(f"Ridge Validation RMSE (log scale): {ridge_rmse_valid}")

### 4.2 Random Forest Optimization

In [None]:
def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
    """Objective function for Random Forest optimization.

    The optimizer proposes floats for every parameter, so the integer-valued
    hyperparameters are truncated with ``int`` and ``max_features`` is capped
    below 1.0 so it is always treated as a fraction of the features.

    Args:
        n_estimators: Number of trees (truncated to int).
        min_samples_split: Minimum samples needed to split a node (truncated to int).
        max_features: Fraction of features considered per split (capped at 0.999).
        max_depth: Maximum tree depth (truncated to int).

    Returns:
        Mean 5-fold negative MSE on (X_train_final, y_train_final); higher
        (closer to 0) is better.
    """
    model = RandomForestRegressor(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),
        max_depth=int(max_depth),
        random_state=42,
        n_jobs=-1
    )

    scores = cross_val_score(
        model, X_train_final, y_train_final,
        scoring='neg_mean_squared_error',
        cv=5
    )

    # A fold whose fit fails is reported as NaN. Mapping NaN to 0 would give it
    # the best possible negative MSE and steer the search toward broken
    # configurations. Score such folds like a constant mean predictor instead
    # (MSE equal to the target variance): finite, but clearly poor.
    failed = ~np.isfinite(scores)
    if failed.any():
        penalty = -float(np.var(y_train_final))
        scores = np.where(failed, penalty, scores)
    return scores.mean()

# Search the Random Forest hyperparameters with Bayesian optimization,
# maximizing rf_cv.
print("Optimizing Random Forest (this may take a while)...")

rf_bounds = {
    'n_estimators': (50, 300),
    'min_samples_split': (2, 10),
    'max_features': (0.1, 0.999),
    'max_depth': (3, 15)
}
rf_optimizer = BayesianOptimization(
    f=rf_cv, pbounds=rf_bounds, random_state=42, verbose=0
)

# 5 random starting points, then 15 guided iterations.
rf_optimizer.maximize(init_points=5, n_iter=15)

# Values are printed as the raw floats proposed by the optimizer.
print(f"\nBest parameters:")
for param, value in rf_optimizer.max['params'].items():
    print(f"  {param}: {value:.4f}")
print(f"Best CV score: {rf_optimizer.max['target']:.4f}")

In [None]:
# Rebuild the forest from the best optimizer result, applying the same
# int-truncation and max_features cap that the objective function used.
best_rf_params = rf_optimizer.max['params']

rf_final_kwargs = dict(
    n_estimators=int(best_rf_params['n_estimators']),
    min_samples_split=int(best_rf_params['min_samples_split']),
    max_features=min(best_rf_params['max_features'], 0.999),
    max_depth=int(best_rf_params['max_depth']),
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**rf_final_kwargs).fit(X_train_final, y_train_final)

# Score on the validation split; predictions and targets are both on the log scale.
rf_pred_valid = rf_model.predict(X_valid[feature_cols])
rf_rmse_valid = rmse(y_valid_log, rf_pred_valid)

print(f"Random Forest Validation RMSE (log scale): {rf_rmse_valid}")

## 5. Model Comparison & Final Evaluation

In [None]:
# Fit three more regressors with fixed settings on the same training data.
et_model = ExtraTreesRegressor(
    n_estimators=200, random_state=42, n_jobs=-1
).fit(X_train_final, y_train_final)

gb_model = GradientBoostingRegressor(
    n_estimators=100, random_state=42
).fit(X_train_final, y_train_final)

lgb_model = lgb.LGBMRegressor(
    n_estimators=200, random_state=42, verbose=-1
).fit(X_train_final, y_train_final)

# Registry of every candidate, keyed by display name. The two optimized models
# fitted earlier go last, which is also the order they are evaluated in.
models_comparison = {
    'Extra Trees': et_model,
    'Gradient Boosting': gb_model,
    'LightGBM': lgb_model,
    'Ridge (Optimized)': ridge_model,
    'Random Forest (Optimized)': rf_model,
}

print("Models trained successfully!")

In [None]:
# Score every candidate on the held-out test set (log scale) and rank by RMSE.
# NOTE(review): the "best" model is chosen on the same test set it is reported
# on; consider selecting on the validation split instead -- confirm intent.
divider = "=" * 60
print("\n" + divider)
print("MODEL COMPARISON (Test Set)")
print(divider)

results = []

for name, model in models_comparison.items():
    # Only the Ridge model was trained on standardized features.
    test_inputs = X_test_scaled if name == 'Ridge (Optimized)' else X_test[feature_cols]
    pred = model.predict(test_inputs)

    test_rmse = rmse(y_test_log, pred)
    test_r2 = r2_score(y_test_log, pred)

    results.append(dict(Model=name, RMSE=test_rmse, R2=round(test_r2, 4)))
    print(f"{name:30s} | RMSE: {test_rmse:.4f} | R2: {test_r2:.4f}")

# Lowest RMSE first, so row 0 is the winner.
results_df = pd.DataFrame(results).sort_values('RMSE')
print("\n" + divider)
print(f"Best Model: {results_df.iloc[0]['Model']}")
print(divider)

## 6. Final Prediction

In [None]:
# Look up the top-ranked model from the comparison table.
best_model_name = results_df.iloc[0]['Model']
best_model = models_comparison[best_model_name]

print(f"Using {best_model_name} for final predictions...")

# Ridge was trained on standardized features, so its inputs go through the
# fitted scaler; every other model takes the raw feature columns.
if 'Ridge' in best_model_name:
    X_predict_scaled = scaler.transform(X_predict[feature_cols])
    predict_inputs = X_predict_scaled
else:
    predict_inputs = X_predict[feature_cols]
final_pred_log = best_model.predict(predict_inputs)

# The models were trained on log1p(y); expm1 maps predictions back to the original scale.
final_pred = np.expm1(final_pred_log)

print(f"\nPrediction Statistics:")
print(f"Mean: {final_pred.mean():.2f}")
print(f"Std: {final_pred.std():.2f}")
print(f"Min: {final_pred.min():.2f}")
print(f"Max: {final_pred.max():.2f}")

In [None]:
# Write the original-scale predictions as a single-column CSV.
predictions_df = pd.DataFrame({'predicted_y': final_pred})
predictions_df.to_csv('../03_Results/predictions.csv', index=False)
print("Predictions saved to 03_Results/predictions.csv")

# Persist the winning estimator. If it is the Ridge model, note that only the
# model is pickled here, not the scaler its inputs need.
with open('../03_Results/best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print(f"Best model ({best_model_name}) saved to 03_Results/best_model.pkl")

In [None]:
# Visualize the final predictions and the best model's fit on the test set.
from pathlib import Path

# savefig does not create missing folders, so make sure the target directory
# exists before doing any plotting work.
figure_path = Path('../03_Results/figures/prediction_results.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: distribution of the final predictions, with the mean marked.
axes[0].hist(final_pred, bins=30, edgecolor='black', alpha=0.7)
axes[0].axvline(final_pred.mean(), color='red', linestyle='--', label=f'Mean: {final_pred.mean():.2f}')
axes[0].set_xlabel('Predicted Quality Index')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Prediction Distribution')
axes[0].legend()

# Right panel: actual vs predicted on the test set. Ridge needs the scaled
# features; every other model takes the raw feature columns.
if 'Ridge' in best_model_name:
    test_pred_log = best_model.predict(X_test_scaled)
else:
    test_pred_log = best_model.predict(X_test[feature_cols])
# Undo the log1p transform so both axes are on the original target scale.
test_pred = np.expm1(test_pred_log)
y_test_actual = np.expm1(y_test_log)

axes[1].scatter(y_test_actual, test_pred, alpha=0.5)
# Diagonal reference line: points on it are predicted exactly.
axes[1].plot([y_test_actual.min(), y_test_actual.max()],
             [y_test_actual.min(), y_test_actual.max()], 'r--', label='Perfect Prediction')
axes[1].set_xlabel('Actual Quality Index')
axes[1].set_ylabel('Predicted Quality Index')
axes[1].set_title('Actual vs Predicted (Test Set)')
axes[1].legend()

plt.tight_layout()
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

print("\nFigure saved to 03_Results/figures/prediction_results.png")