# ML Kaggle Competition

## Background

We will use the data contained in the train.csv file to train a model that will predict **dissolved inorganic carbon (DIC)** content in water samples.

## Setup

In [1]:
# load libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping
from keras_tuner import HyperModel, RandomSearch
from sklearn.model_selection import KFold
import itertools

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

2024-03-22 06:58:53.054430: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-22 06:58:53.054571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-22 06:58:53.181918: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/copy-of-eds-232-ocean-chemistry-prediction-for/sample_submission.csv
/kaggle/input/copy-of-eds-232-ocean-chemistry-prediction-for/train.csv
/kaggle/input/copy-of-eds-232-ocean-chemistry-prediction-for/test.csv


## Import & pre-process training data

In [2]:
# import training data
train_df = pd.read_csv('/kaggle/input/copy-of-eds-232-ocean-chemistry-prediction-for/train.csv')
train_df.columns = train_df.columns.str.lower().str.replace(' ', '_') # clean column names (lower case, spaces -> underscores)

# inspect data
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1454 entries, 0 to 1453
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1454 non-null   int64  
 1   lat_dec            1454 non-null   float64
 2   lon_dec            1454 non-null   float64
 3   no2um              1454 non-null   float64
 4   no3um              1454 non-null   float64
 5   nh3um              1454 non-null   float64
 6   r_temp             1454 non-null   float64
 7   r_depth            1454 non-null   int64  
 8   r_sal              1454 non-null   float64
 9   r_dynht            1454 non-null   float64
 10  r_nuts             1454 non-null   float64
 11  r_oxy_micromol.kg  1454 non-null   float64
 12  unnamed:_12        0 non-null      float64
 13  po4um              1454 non-null   float64
 14  sio3um             1454 non-null   float64
 15  ta1.x              1454 non-null   float64
 16  salinity1          1454 

In [3]:
# remove 'id' and 'unnamed:_12' columns
# The columns are dropped by name rather than by position: a positional drop that
# reassigns train_df would remove two *different* columns if this cell were run a
# second time. errors='ignore' turns a re-run into a no-op.
train_df = train_df.drop(columns=['id', 'unnamed:_12'], errors='ignore')

In [4]:
# target vector for training: the 'dic' column as a NumPy array
y_train = train_df['dic'].values

# feature matrix for training: every column except 'dic', as a NumPy array
X_train = train_df.drop(columns='dic').values

## Build & train model

In [5]:
# plain model-builder class (it does not subclass keras_tuner's HyperModel)
class MyHyperModel:
    """Factory for the fully connected regression network used in this notebook."""

    def build(self, hp):
        """Assemble and compile a Sequential model from a hyperparameter mapping.

        ``hp`` is indexed with the keys 'neurons_0', 'dropout_1', 'num_layers',
        'activation', 'learning_rate' and 'beta_1'. The compiled model is returned.
        """
        layer_width = hp['neurons_0']  # every hidden layer shares this width

        def elastic_net():
            # each layer receives its own L1 + L2 kernel regularizer instance
            return l1_l2(l1=0.01, l2=0.01)

        # fixed stem: a ReLU layer, an ELU layer, then dropout
        stack = [
            Dense(units=layer_width, activation='relu', kernel_regularizer=elastic_net()),
            Dense(units=layer_width, activation='elu', kernel_regularizer=elastic_net()),
            Dropout(rate=hp['dropout_1']),
        ]

        # hp['num_layers'] - 1 further hidden layers with the configurable activation
        for _ in range(1, hp['num_layers']):
            stack.append(Dense(units=layer_width,
                               activation=hp['activation'],
                               kernel_regularizer=elastic_net()))

        # single-unit linear output layer (also regularized)
        stack.append(Dense(1, activation='linear', kernel_regularizer=elastic_net()))

        network = Sequential(stack)

        # Adam optimizer with configurable learning rate and beta_1; MSE loss
        network.compile(
            optimizer=Adam(learning_rate=hp['learning_rate'], beta_1=hp['beta_1']),
            loss='mean_squared_error',
        )
        return network
    
# instantiate the model builder; note that no input shape is passed here
# (MyHyperModel defines no constructor arguments and build() adds no input layer)
hypermodel = MyHyperModel()

In [6]:
# hyperparameter grid for tuning: each key maps to its list of candidate values
# (every list currently holds a single value, so the grid expands to one combination)
hyperparameter_grid = dict(
    neurons_0=[128],
    dropout_1=[0.0],
    num_layers=[2],
    activation=['elu'],
    learning_rate=[1e-3],
    beta_1=[0.8],
)

# define function that creates all combinations of values stored in a dictionary
def generate_combinations(grid):
    """Return every combination of the candidate values stored in ``grid``.

    Parameters
    ----------
    grid : dict
        Maps each hyperparameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the value lists, in
        ``itertools.product`` order, keyed in the grid's own key order. An empty
        grid yields ``[{}]`` (the single empty combination) instead of raising,
        and a grid containing an empty value list yields ``[]``.
    """
    keys = list(grid)
    value_lists = [grid[key] for key in keys]
    # product() of zero iterables yields one empty tuple, which covers the empty grid
    return [dict(zip(keys, picks)) for picks in itertools.product(*value_lists)]

# expand the grid into a list of dicts, one dict per hyperparameter combination
combinations = generate_combinations(hyperparameter_grid)

In [7]:
# EarlyStopping callback used for every fit performed during cross-validation
early_stopping = EarlyStopping(
    monitor='loss',             # quantity watched: the loss reported during fit()
    mode='min',                 # lower is better
    min_delta=0.1,              # smallest decrease that counts as an improvement
    patience=10,                # epochs without improvement tolerated before stopping
    restore_best_weights=True,  # roll back to the weights of the best monitored epoch
    verbose=0,                  # no callback messages
)

# define custom function for performing a CV trial
def cross_validate_combination(X, y, combination, n_splits=5, epochs=50):
    """Score one hyperparameter combination with K-fold cross-validation.

    A fresh model is built for every fold via the module-level ``hypermodel``,
    fitted on the fold's training rows with the module-level ``early_stopping``
    callback, and evaluated on the held-out rows.

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing (e.g. NumPy arrays)
        Feature matrix and target vector; rows are selected with the index
        arrays produced by ``KFold.split``.
    combination : mapping
        Hyperparameter values passed unchanged to ``hypermodel.build``.
    n_splits : int, default 5
        Number of cross-validation folds.
    epochs : int, default 50
        Maximum number of training epochs per fold (early stopping may end
        training sooner).

    Returns
    -------
    float
        Mean of the per-fold values returned by ``model.evaluate`` on the
        validation rows.
    """
    # KFold is created without shuffling arguments, so the folds follow the row order of X
    kf = KFold(n_splits=n_splits)
    val_scores = []  # one validation score per fold

    for train_index, val_index in kf.split(X):

        # build an untrained model for this fold so no weights carry over between folds
        model = hypermodel.build(combination)

        # slice out this fold's training and validation rows
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        # fit on the training part of the fold
        model.fit(X_train_fold,
                  y_train_fold,
                  callbacks=[early_stopping], # use early stopping
                  epochs=epochs, # upper bound on epochs for this fold
                  verbose=0) # disable verbose

        # evaluate on the held-out part of the fold
        val_score = model.evaluate(X_val_fold,
                                   y_val_fold,
                                   verbose=0)
        val_scores.append(val_score)

    # average validation score across all n_splits folds
    return np.mean(val_scores)

# start the search with the worst possible score and no winning combination yet
best_score, best_combination = float('inf'), None

In [8]:
# determine best hyperparameter combination based on CV score
# The trackers are (re)initialised in this cell so that re-running it never compares
# new scores against a best_score left over from a previous run of the search.
best_score = float('inf')
best_combination = None

for combination in combinations:
    score = cross_validate_combination(X_train,
                                       y_train,
                                       combination)
    # keep the combination with the lowest mean validation score seen so far
    if score < best_score:
        best_score = score
        best_combination = combination
print("Best Hyperparameters:", best_combination)
print("Best Score:", best_score)

I0000 00:00:1711090747.524073      70 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Best Hyperparameters: {'neurons_0': 128, 'dropout_1': 0.0, 'num_layers': 2, 'activation': 'elu', 'learning_rate': 0.001, 'beta_1': 0.8}
Best Score: 10093.775256347657


In [9]:
# EarlyStopping callback for the final fit (same settings as during tuning,
# except for a patience of 100 epochs and verbose messages)
early_stopping_final = EarlyStopping(
    monitor='loss',             # quantity watched: the loss reported during fit()
    mode='min',                 # lower is better
    min_delta=0.1,              # smallest decrease that counts as an improvement
    patience=100,               # epochs without improvement tolerated before stopping
    restore_best_weights=True,  # roll back to the weights of the best monitored epoch
    verbose=1,                  # report when training is stopped early
)

# fresh model built from the winning hyperparameter combination
best_model = hypermodel.build(best_combination)

# train on the complete training set for at most 500 epochs
best_model.fit(x=X_train,
               y=y_train,
               epochs=500,
               callbacks=[early_stopping_final],
               verbose=1)

Epoch 1/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - loss: 1662418.6250
Epoch 2/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3004.3757
Epoch 3/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 778.6844
Epoch 4/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 255.6922
Epoch 5/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 121.1205
Epoch 6/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 146.2394
Epoch 7/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 172.5870
Epoch 8/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 92.3466
Epoch 9/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 107.3500
Epoch 10/500
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

<keras.src.callbacks.history.History at 0x7d203420fbb0>

## Import & process testing data

In [10]:
# import testing data
test_df = pd.read_csv('/kaggle/input/copy-of-eds-232-ocean-chemistry-prediction-for/test.csv')
test_df.columns = test_df.columns.str.lower().str.replace(' ', '_') # clean column names

# remove the first column only (position 0, presumably the 'id' column - verify against test.csv);
# unlike the training data, no 'unnamed:_12' column is dropped here
test_df = test_df.drop(test_df.columns[[0]], axis=1)

# define feature matrix for testing data
X_test = test_df

# fail early with a clear message if the test features do not line up with the
# feature matrix the model was trained on
assert X_test.shape[1] == X_train.shape[1], (
    f"test data has {X_test.shape[1]} feature columns but the training data has {X_train.shape[1]}"
)


## Predict DIC for testing data & export submission

In [11]:
# load the submission template and normalise its column names
submission_df = pd.read_csv('/kaggle/input/copy-of-eds-232-ocean-chemistry-prediction-for/sample_submission.csv')
lowercased_columns = submission_df.columns.str.lower()
submission_df.columns = lowercased_columns.str.replace(' ', '_')

# predict the target for the testing feature matrix
predictions = best_model.predict(X_test)

# store the predictions in the template's 'dic' column and display the result
submission_df['dic'] = predictions
submission_df

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


Unnamed: 0,id,dic
0,1455,2175.671143
1,1456,2199.070557
2,1457,2322.684082
3,1458,2001.954346
4,1459,2155.531494
...,...,...
480,1935,2000.285278
481,1936,2176.688232
482,1937,2241.894287
483,1938,2021.934082


In [12]:
# export submission to 'submission.csv' (relative path), leaving out the DataFrame index
submission_df.to_csv('submission.csv', index=False)