In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import os


In [24]:


df = pd.read_csv("/Users/MacbookPro/LocalStorage/Developer/ShellAi/dataset/train.csv")
# Define a function to get the trained model for each property based on the analysis
def get_trained_final_model(data, target, property_name):
    """
    Trains the best performing model for a specific blend property on the full training data.
    """
    # Define the final models and their parameters based on the analysis
    final_model_info = {
        'BlendProperty1': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty2': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty3': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)), 
        'BlendProperty4': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty5': ('Random_Forest', RandomForestRegressor(n_estimators=100, max_depth=11, random_state=42, n_jobs=-1)),
        'BlendProperty6': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty7': ('SVR_Poly', make_pipeline(StandardScaler(), SVR(kernel='poly', C=1.0, epsilon=0.1))),
        'BlendProperty8': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
        'BlendProperty9': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0, random_state=42)),
        'BlendProperty10': ('Neural_Network', Sequential([Dense(64, activation='relu', input_shape=(data.shape[1],)), Dropout(0.2), Dense(64, activation='relu'), Dense(1)]))
    }

    model_name, model = final_model_info[property_name]

    X = data
    y = target

    print(f"Training {model_name} for {property_name} on full dataset...")

    if model_name == 'Neural_Network':
        model.compile(optimizer='adam', loss='mae')
        model.fit(X, y, epochs=100, batch_size=32, verbose=0)
    elif model_name == 'TabNet':
         # TabNet requires numpy and potential scaling
         X_np = X.values
         y_np = y.values.reshape(-1, 1)
         scaler = StandardScaler()
         X_scaled = scaler.fit_transform(X_np)
         model.fit(X_scaled, y_np, max_epochs=200, patience=20, batch_size=256, virtual_batch_size=128, verbose=0)
         # Wrap TabNet model and scaler in a pipeline for consistent prediction interface
         class TabNetPipeline:
             def __init__(self, scaler, tabnet_model):
                 self.scaler = scaler
                 self.tabnet_model = tabnet_model
             def predict(self, X):
                 X_scaled = self.scaler.transform(X.values)
                 return self.tabnet_model.predict(X_scaled).flatten()
         model = TabNetPipeline(scaler, model) # Return the wrapped model
    elif isinstance(model, Pipeline): # Check against the Pipeline class
        model.fit(X, y) # Pipeline handles scaling internally
    else:
        model.fit(X, y)

    print(f"Training complete for {property_name}.")
    return model

# Load test data and sample submission
# Assuming test.csv and sample_solution.csv are in the current directory
try:
  test_df = pd.read_csv("/Users/MacbookPro/LocalStorage/Developer/ShellAi/dataset/test.csv")
  submission_df = pd.read_csv("/Users/MacbookPro/LocalStorage/Developer/ShellAi/dataset/sample_solution.csv")
  test_ids = test_df['ID']
  test_df_features = test_df.drop(columns=['ID'])
except FileNotFoundError:
    print("Make sure 'test.csv' and 'sample_solution.csv' are uploaded to your Colab session.")


if 'test_df_features' in locals(): # Check if test data was loaded
  # Generate predictions using the best model for each property
  for i in range(1, 11):
      property_name = f'BlendProperty{i}'
      print(f"\nProcessing {property_name} for final submission...")

      # Define features for this property
      features = ['Component1_fraction', 'Component2_fraction', 'Component3_fraction',
                 'Component4_fraction', 'Component5_fraction'] + \
                [f'Component{j}_Property{i}' for j in range(1, 6)]

      # Train the best model for this property on the full training data
      trained_model = get_trained_final_model(df[features], df[property_name], property_name)

      # Make predictions on the test data
      test_predictions = trained_model.predict(test_df_features[features])

      # Update the submission DataFrame
      submission_df[property_name] = test_predictions

  # Save the final submission file
  submission_df.to_csv('daddy.csv', index=False)

  print("\n" + "="*80)
  print("Final submission file 'daddy.csv' created successfully.")
  print("="*80)



Processing BlendProperty1 for final submission...
Training Gaussian_Process for BlendProperty1 on full dataset...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 

In [None]:

import pandas as pd
from sklearn.pipeline import Pipeline # Import Pipeline

# Define a function to get the trained model for each property based on the analysis
def get_trained_final_model(data, target, property_name):
    """
    Trains the best performing model for a specific blend property on the full training data.
    """
    # Define the final models and their parameters based on the analysis
    final_model_info = {
        'BlendProperty1': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty2': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty3': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
        'BlendProperty4': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty5': ('Random_Forest', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)),
        'BlendProperty6': ('Gaussian_Process', make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(length_scale=2.0), n_restarts_optimizer=5, random_state=42))),
        'BlendProperty7': ('SVR_Poly', make_pipeline(StandardScaler(), SVR(kernel='poly', C=1.0, epsilon=0.1))),
        'BlendProperty8': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
        'BlendProperty9': ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
        'BlendProperty10': ('Neural_Network', Sequential([Dense(64, activation='relu', input_shape=(data.shape[1],)), Dropout(0.2), Dense(64, activation='relu'), Dense(1)]))
    }

    model_name, model = final_model_info[property_name]

    X = data
    y = target

    print(f"Training {model_name} for {property_name} on full dataset...")

    if model_name == 'Neural_Network':
        model.compile(optimizer='adam', loss='mae')
        model.fit(X, y, epochs=100, batch_size=32, verbose=0)
    elif model_name == 'TabNet':
         # TabNet requires numpy and potential scaling
         X_np = X.values
         y_np = y.values.reshape(-1, 1)
         scaler = StandardScaler()
         X_scaled = scaler.fit_transform(X_np)
         model.fit(X_scaled, y_np, max_epochs=200, patience=20, batch_size=256, virtual_batch_size=128, verbose=0)
         # Wrap TabNet model and scaler in a pipeline for consistent prediction interface
         class TabNetPipeline:
             def __init__(self, scaler, tabnet_model):
                 self.scaler = scaler
                 self.tabnet_model = tabnet_model
             def predict(self, X):
                 X_scaled = self.scaler.transform(X.values)
                 return self.tabnet_model.predict(X_scaled).flatten()
         model = TabNetPipeline(scaler, model) # Return the wrapped model
    elif isinstance(model, Pipeline): # Check against the Pipeline class
        model.fit(X, y) # Pipeline handles scaling internally
    else:
        model.fit(X, y)

    print(f"Training complete for {property_name}.")
    return model

# Load test data and sample submission
# Assuming test.csv and sample_solution.csv are in the current directory
try:
  test_df = pd.read_csv("test.csv")
  submission_df = pd.read_csv("sample_solution.csv")
  test_ids = test_df['ID']
  test_df_features = test_df.drop(columns=['ID'])
except FileNotFoundError:
    print("Make sure 'test.csv' and 'sample_solution.csv' are uploaded to your Colab session.")


if 'test_df_features' in locals(): # Check if test data was loaded
  # Generate predictions using the best model for each property
  for i in range(1, 11):
      property_name = f'BlendProperty{i}'
      print(f"\nProcessing {property_name} for final submission...")

      # Define features for this property
      features = ['Component1_fraction', 'Component2_fraction', 'Component3_fraction',
                 'Component4_fraction', 'Component5_fraction'] + \
                [f'Component{j}_Property{i}' for j in range(1, 6)]

      # Train the best model for this property on the full training data
      trained_model = get_trained_final_model(df[features], df[property_name], property_name)

      # Make predictions on the test data
      test_predictions = trained_model.predict(test_df_features[features])

      # Update the submission DataFrame
      submission_df[property_name] = test_predictions

  # Save the final submission file
  submission_df.to_csv('final_model_submission.csv', index=False)

  print("\n" + "="*80)
  print("Final submission file 'final_model_submission.csv' created successfully.")
  print("="*80)
    

Make sure 'test.csv' and 'sample_solution.csv' are uploaded to your Colab session.

Processing BlendProperty1 for final submission...
Training Gaussian_Process for BlendProperty1 on full dataset...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


Training complete for BlendProperty1.

Processing BlendProperty2 for final submission...
Training Gaussian_Process for BlendProperty2 on full dataset...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training complete for BlendProperty2.

Processing BlendProperty3 for final submission...
Training ElasticNet for BlendProperty3 on full dataset...
Training complete for BlendProperty3.

Processing BlendProperty4 for final submission...
Training Gaussian_Process for BlendProperty4 on full dataset...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


Training complete for BlendProperty4.

Processing BlendProperty5 for final submission...
Training Random_Forest for BlendProperty5 on full dataset...
Training complete for BlendProperty5.

Processing BlendProperty6 for final submission...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training Gaussian_Process for BlendProperty6 on full dataset...


ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


Training complete for BlendProperty6.

Processing BlendProperty7 for final submission...
Training SVR_Poly for BlendProperty7 on full dataset...
Training complete for BlendProperty7.

Processing BlendProperty8 for final submission...
Training ElasticNet for BlendProperty8 on full dataset...
Training complete for BlendProperty8.

Processing BlendProperty9 for final submission...
Training ElasticNet for BlendProperty9 on full dataset...
Training complete for BlendProperty9.

Processing BlendProperty10 for final submission...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training Neural_Network for BlendProperty10 on full dataset...
Training complete for BlendProperty10.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

Final submission file 'final_model_submission.csv' created successfully.
