In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import C, RBF
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define a function to get the trained model for each property based on the analysis
def get_trained_final_model(data, target, property_name):
    """
    Trains the best performing model for a specific blend property on the full training data.
    """
    # Define the final models and their parameters based on the analysis
    final_model_info = {
        'BlendProperty1': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty2': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty3': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
        'BlendProperty4': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty5': ('Random_Forest', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)),
        'BlendProperty6': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty7': ('SVR_Poly', make_pipeline(StandardScaler(), SVR(kernel='poly', C=1.0, epsilon=0.1))),
        'BlendProperty8': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
        'BlendProperty9': ('Advanced_Ensemble', 'ADVANCED_HYPERTUNED'),  # Special handling for BlendProperty9
        'BlendProperty10': ('Neural_Network', Sequential([Dense(64, activation='relu', input_shape=(data.shape[1],)), Dropout(0.2), Dense(64, activation='relu'), Dense(1)]))
    }

    model_name, model = final_model_info[property_name]

    X = data
    y = target

    print(f"Training {model_name} for {property_name} on full dataset...")

    if property_name == 'BlendProperty9':
        # Special advanced hyperparameter tuning for BlendProperty9
        return train_advanced_blendproperty9_model(X, y)
    elif model_name == 'Neural_Network':
        model.compile(optimizer='adam', loss='mae')
        model.fit(X, y, epochs=100, batch_size=32, verbose=0)
    elif model_name == 'TabNet':
         # TabNet requires numpy and potential scaling
         X_np = X.values
         y_np = y.values.reshape(-1, 1)
         scaler = StandardScaler()
         X_scaled = scaler.fit_transform(X_np)
         model.fit(X_scaled, y_np, max_epochs=200, patience=20, batch_size=256, virtual_batch_size=128, verbose=0)
         # Wrap TabNet model and scaler in a pipeline for consistent prediction interface
         class TabNetPipeline:
             def __init__(self, scaler, tabnet_model):
                 self.scaler = scaler
                 self.tabnet_model = tabnet_model
             def predict(self, X):
                 X_scaled = self.scaler.transform(X.values)
                 return self.tabnet_model.predict(X_scaled).flatten()
         model = TabNetPipeline(scaler, model) # Return the wrapped model
    elif isinstance(model, Pipeline): # Check against the Pipeline class
        model.fit(X, y) # Pipeline handles scaling internally
    else:
        model.fit(X, y)

    print(f"Training complete for {property_name}.")
    return model

# Load test data and sample submission
# Assuming test.csv and sample_solution.csv are in the current directory
try:
  test_df = pd.read_csv("test.csv")
  submission_df = pd.read_csv("sample_solution.csv")
  test_ids = test_df['ID']
  test_df_features = test_df.drop(columns=['ID'])
except FileNotFoundError:
    print("Make sure 'test.csv' and 'sample_solution.csv' are uploaded to your Colab session.")


if 'test_df_features' in locals(): # Check if test data was loaded
  # Generate predictions using the best model for each property
  for i in range(1, 11):
      property_name = f'BlendProperty{i}'
      print(f"\nProcessing {property_name} for final submission...")

      # Define features for this property
      features = ['Component1_fraction', 'Component2_fraction', 'Component3_fraction',
                 'Component4_fraction', 'Component5_fraction'] + \
                [f'Component{j}_Property{i}' for j in range(1, 6)]

      # Train the best model for this property on the full training data
      trained_model = get_trained_final_model(df[features], df[property_name], property_name)

      # Make predictions on the test data
      test_predictions = trained_model.predict(test_df_features[features])

      # Update the submission DataFrame
      submission_df[property_name] = test_predictions

  # Save the final submission file
  submission_df.to_csv('final_model_submission.csv', index=False)

  print("\n" + "="*80)
  print("Final submission file 'final_model_submission.csv' created successfully.")
  print("="*80)



ImportError: cannot import name 'C' from 'sklearn.gaussian_process.kernels' (/Users/MacbookPro/LocalStorage/Developer/ShellAi/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/kernels.py)

In [2]:
# Advanced Feature Engineering Functions
def create_ratio_features(df):
    """Create ratio features between component fractions"""
    comp_cols = [f'Component{i}_fraction' for i in range(1, 6)]
    
    for i in range(len(comp_cols)):
        for j in range(i+1, len(comp_cols)):
            df[f'ratio_{comp_cols[i]}_to_{comp_cols[j]}'] = df[comp_cols[i]] / (df[comp_cols[j]] + 1e-5)
    
    return df

def create_interaction_features(df, property_idx):
    """Create interaction features between component fractions and properties"""
    comp_cols = [f'Component{i}_fraction' for i in range(1, 6)]
    prop_cols = [f'Component{i}_Property{property_idx}' for i in range(1, 6)]
    
    for comp in comp_cols:
        for prop in prop_cols:
            df[f'{comp}_x_{prop}'] = df[comp] * df[prop]
    
    return df

def create_polynomial_features(df, degree=2):
    """Create polynomial features for component fractions"""
    from sklearn.preprocessing import PolynomialFeatures
    
    comp_cols = [f'Component{i}_fraction' for i in range(1, 6)]
    
    poly = PolynomialFeatures(degree=degree, include_bias=False, interaction_only=True)
    poly_features = poly.fit_transform(df[comp_cols])
    
    # Get feature names
    feature_names = poly.get_feature_names_out(comp_cols)
    
    # Add polynomial features to dataframe
    for i, name in enumerate(feature_names):
        if name not in comp_cols:  # Skip original features
            df[f'poly_{name}'] = poly_features[:, i]
    
    return df

def create_statistical_features(df, property_idx):
    """Create statistical features across components for each property"""
    prop_cols = [f'Component{i}_Property{property_idx}' for i in range(1, 6)]
    comp_cols = [f'Component{i}_fraction' for i in range(1, 6)]
    
    # Statistical features for properties
    df[f'Property{property_idx}_mean'] = df[prop_cols].mean(axis=1)
    df[f'Property{property_idx}_std'] = df[prop_cols].std(axis=1)
    df[f'Property{property_idx}_min'] = df[prop_cols].min(axis=1)
    df[f'Property{property_idx}_max'] = df[prop_cols].max(axis=1)
    df[f'Property{property_idx}_range'] = df[f'Property{property_idx}_max'] - df[f'Property{property_idx}_min']
    
    # Weighted averages using component fractions
    df[f'Property{property_idx}_weighted_avg'] = sum(df[f'Component{i}_fraction'] * df[f'Component{i}_Property{property_idx}'] 
                                                    for i in range(1, 6))
    
    # Component fraction statistics
    df['Component_fraction_mean'] = df[comp_cols].mean(axis=1)
    df['Component_fraction_std'] = df[comp_cols].std(axis=1)
    df['Component_fraction_entropy'] = -sum(df[col] * np.log(df[col] + 1e-10) for col in comp_cols)
    
    return df

In [3]:
# Advanced Hyperparameter Tuning for BlendProperty9
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_percentage_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import optuna
import warnings
warnings.filterwarnings('ignore')

def create_advanced_ratio_features(df):
    """Create advanced ratio features between component fractions"""
    comp_cols = df.columns[:5]
    for i in range(len(comp_cols)):
        for j in range(i+1, len(comp_cols)):
            df[f'ratio_{comp_cols[i]}_to_{comp_cols[j]}'] = df[comp_cols[i]] / (df[comp_cols[j]] + 1e-5)
    return df

def create_advanced_interaction_features(df):
    """Create advanced interaction features between component fractions and properties"""
    comp_cols = df.columns[:5]
    prop_cols = df.columns[5:55]
    for comp in comp_cols:
        for prop in prop_cols:
            df[f'{comp}_x_{prop}'] = df[comp] * df[prop]
    return df

def tune_model_blendproperty9(trial, model_type, X_scaled, y):
    """Hyperparameter tuning function for BlendProperty9"""
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    
    if model_type == 'lgb':
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 31, 80),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
            'n_estimators': trial.suggest_int('n_estimators', 100, 300)
        }
        model = LGBMRegressor(**params)
    elif model_type == 'xgb':
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
            'n_estimators': trial.suggest_int('n_estimators', 100, 300)
        }
        model = XGBRegressor(**params)
    elif model_type == 'cat':
        params = {
            'depth': trial.suggest_int('depth', 4, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
            'iterations': trial.suggest_int('iterations', 100, 300)
        }
        model = CatBoostRegressor(verbose=0, **params)

    score = cross_val_score(model, X_scaled, y, cv=kf, scoring='neg_mean_absolute_percentage_error', n_jobs=-1).mean()
    return -score

def train_advanced_blendproperty9_model(X, y):
    """
    Advanced training function specifically for BlendProperty9 with hyperparameter tuning
    """
    print("Starting advanced hyperparameter tuning for BlendProperty9...")
    
    # Feature Engineering for BlendProperty9
    X_enhanced = X.copy()
    X_enhanced = create_advanced_ratio_features(X_enhanced)
    X_enhanced = create_advanced_interaction_features(X_enhanced)
    
    # Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_enhanced)
    
    # Hyperparameter tuning with Optuna
    best_params = {}
    for model_type in ['lgb', 'xgb', 'cat']:
        print(f"Tuning {model_type} for BlendProperty9...")
        study = optuna.create_study(direction='minimize')
        study.optimize(lambda trial: tune_model_blendproperty9(trial, model_type, X_scaled, y), n_trials=10)
        best_params[model_type] = study.best_params
        print(f"Best {model_type} params: {study.best_params}")
    
    # Train optimized models
    lgb_model = LGBMRegressor(**best_params['lgb']).fit(X_scaled, y)
    xgb_model = XGBRegressor(**best_params['xgb']).fit(X_scaled, y)
    cat_model = CatBoostRegressor(verbose=0, **best_params['cat']).fit(X_scaled, y)
    
    # Train other models with optimized parameters
    rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42).fit(X_scaled, y)
    elastic = ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42).fit(X_scaled, y)
    ridge = Ridge(alpha=1.0).fit(X_scaled, y)
    lasso = Lasso(alpha=0.001).fit(X_scaled, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42).fit(X_scaled, y)
    
    # Create ensemble model wrapper
    class BlendProperty9EnsembleModel:
        def __init__(self, models, scaler, feature_enhancer):
            self.models = models
            self.scaler = scaler
            self.feature_enhancer = feature_enhancer
        
        def predict(self, X):
            # Apply same feature engineering as training
            X_enhanced = X.copy()
            X_enhanced = create_advanced_ratio_features(X_enhanced)
            X_enhanced = create_advanced_interaction_features(X_enhanced)
            
            # Scale features
            X_scaled = self.scaler.transform(X_enhanced)
            
            # Get predictions from all models
            preds = []
            for model in self.models:
                preds.append(model.predict(X_scaled))
            
            # Return ensemble average
            return np.mean(preds, axis=0)
    
    # Create ensemble model
    models = [lgb_model, xgb_model, cat_model, rf, elastic, ridge, lasso, gbr]
    ensemble_model = BlendProperty9EnsembleModel(models, scaler, None)
    
    # Evaluate ensemble on training data
    y_pred_train = ensemble_model.predict(X)
    mape = mean_absolute_percentage_error(y, y_pred_train)
    print(f'BlendProperty9 Ensemble MAPE: {mape:.4f}')
    
    return ensemble_model

OSError: dlopen(/Users/MacbookPro/LocalStorage/Developer/ShellAi/.venv/lib/python3.12/site-packages/lightgbm/lib/lib_lightgbm.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib
  Referenced from: <D44045CD-B874-3A27-9A61-F131D99AACE4> /Users/MacbookPro/LocalStorage/Developer/ShellAi/.venv/lib/python3.12/site-packages/lightgbm/lib/lib_lightgbm.dylib
  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file)

In [None]:
# Load and prepare data for BlendProperty9 training
try:
    # Load training data
    df = pd.read_csv('../../../dataset/train.csv')
    print(f"Training data loaded successfully. Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    
    # Check if BlendProperty9 exists
    if 'BlendProperty9' in df.columns:
        print("BlendProperty9 found in training data.")
        print(f"BlendProperty9 statistics:")
        print(df['BlendProperty9'].describe())
    else:
        print("Warning: BlendProperty9 not found in training data.")
        
except FileNotFoundError:
    print("Training data not found. Please ensure train.csv is in the correct location.")
    print("Expected location: ../../../dataset/train.csv")