# ML Kaggle Competition

## Background

We will use the data contained in the train.csv file to train a model that will predict **dissolved inorganic carbon (DIC)** content in water samples.

## Setup

In [1]:
# load libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping
from keras_tuner import HyperModel, RandomSearch
from sklearn.model_selection import KFold
import itertools

2024-03-21 21:29:22.885560: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-21 21:29:23.001203: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-21 21:29:23.005084: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/R/4.2.2/lib/R/lib:/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/j

## Import & pre-process training data

In [2]:
# import training data
train_df = pd.read_csv('data/train.csv')
train_df.columns = train_df.columns.str.lower().str.replace(' ', '_') # clean column names

# inspect data
# (info() writes its report directly and returns None, so wrapping it in print() only adds a stray "None")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1454 entries, 0 to 1453
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1454 non-null   int64  
 1   lat_dec            1454 non-null   float64
 2   lon_dec            1454 non-null   float64
 3   no2um              1454 non-null   float64
 4   no3um              1454 non-null   float64
 5   nh3um              1454 non-null   float64
 6   r_temp             1454 non-null   float64
 7   r_depth            1454 non-null   int64  
 8   r_sal              1454 non-null   float64
 9   r_dynht            1454 non-null   float64
 10  r_nuts             1454 non-null   float64
 11  r_oxy_micromol.kg  1454 non-null   float64
 12  unnamed:_12        0 non-null      float64
 13  po4um              1454 non-null   float64
 14  sio3um             1454 non-null   float64
 15  ta1.x              1454 non-null   float64
 16  salinity1          1454 

We will remove column 12 (`unnamed:_12`) because it contains no non-null values. We will also remove the `id` column because we don't expect it to be a relevant predictor.

In [3]:
# remove 'id' and 'unnamed:_12' columns
# (errors='ignore' keeps this cell safe to re-run once the columns are already gone,
#  since train_df is overwritten in place)
train_df = train_df.drop(columns=['id', 'unnamed:_12'], errors='ignore')

In [4]:
# target vector for training data: the 'dic' column
y_train = train_df['dic'].values

# feature matrix for training data: every remaining column
X_train = train_df.drop(columns=['dic']).values

## Build & train model

In [5]:
# model builder driven by a plain mapping of hyperparameter values
class MyHyperModel:
    """Assemble and compile a Keras regression network from hyperparameter values.

    The input shape is fixed when the builder is created; everything else is
    read from the mapping passed to ``build``.
    """

    def __init__(self, input_shape):
        # remembered so the first Dense layer can declare its input shape
        self.input_shape = input_shape

    def build(self, hp):
        """Return a compiled ``Sequential`` model for one hyperparameter set.

        ``hp`` is indexed with the keys 'neurons_0', 'neurons_1', 'dropout_1',
        'num_layers', 'neurons_2', 'activation', 'learning_rate' and 'beta_1'.
        The model is compiled with Adam and a mean-squared-error loss.
        """

        def penalty():
            # every Dense layer gets its own L1+L2 kernel regularizer (0.01 each)
            return l1_l2(l1=0.01, l2=0.01)

        model = Sequential()

        stack = [
            # first hidden layer: ReLU (based on preliminary training results)
            Dense(units=hp['neurons_0'],
                  activation='relu',
                  kernel_regularizer=penalty(),
                  input_shape=self.input_shape),
            # second hidden layer: ELU (based on preliminary training results)
            Dense(units=hp['neurons_1'],
                  activation='elu',
                  kernel_regularizer=penalty()),
            # dropout with a tunable rate
            Dropout(rate=hp['dropout_1']),
        ]

        # optional extra hidden layers: num_layers - 1 of them, all sharing
        # the same width and the tunable activation
        stack.extend(
            Dense(units=hp['neurons_2'],
                  activation=hp['activation'],
                  kernel_regularizer=penalty())
            for _ in range(1, hp['num_layers'])
        )

        # single-unit linear output for regression
        stack.append(Dense(1, activation='linear', kernel_regularizer=penalty()))

        for layer in stack:
            model.add(layer)

        # Adam with tunable learning rate and beta_1; MSE as the loss
        model.compile(
            optimizer=Adam(learning_rate=hp['learning_rate'], beta_1=hp['beta_1']),
            loss='mean_squared_error',
        )
        return model
    
# create the MyHyperModel builder used for every model in this notebook
# X_train.shape[1:] drops the leading (row) dimension, leaving the per-sample input shape
hypermodel = MyHyperModel(input_shape=X_train.shape[1:])


In [6]:
# candidate values for each tunable hyperparameter (one list per name)
hyperparameter_grid = dict(
    neurons_0=[64, 128],          # width of the first hidden layer
    neurons_1=[64, 128],          # width of the second hidden layer
    dropout_1=[0.0, 0.1, 0.2],    # dropout rate after the second hidden layer
    num_layers=[2],               # upper bound of the extra-layer loop in build()
    neurons_2=[64, 128],          # width of each extra hidden layer
    activation=['relu', 'elu'],   # activation of each extra hidden layer
    learning_rate=[1e-4, 1e-3],   # Adam learning rate
    beta_1=[0.8, 0.9, 0.99],      # Adam beta_1
)

# define function that creates all combinations of values stored in a dictionary
def generate_combinations(grid):
    """Expand a hyperparameter grid into every possible combination.

    Parameters
    ----------
    grid : dict
        Maps each hyperparameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the candidate
        lists, in ``itertools.product`` order (the last key varies fastest).
        An empty grid yields a single empty combination (``[{}]``) instead
        of raising; a grid with an empty candidate list yields ``[]``.
    """
    keys = list(grid)
    candidate_lists = [grid[key] for key in keys]
    # product() of zero iterables yields one empty tuple, which covers the empty grid
    return [dict(zip(keys, values)) for values in itertools.product(*candidate_lists)]

# store all combinations of hyperparameter values from grid
# (a list holding one dict per candidate configuration)
combinations = generate_combinations(hyperparameter_grid)

In [7]:
# early-stopping callback handed to every fit performed during tuning
early_stopping = EarlyStopping(
    # watch the training loss, where lower is better
    monitor='loss',
    mode='min',
    # a decrease smaller than 0.1 is not counted as an improvement
    min_delta=0.1,
    # stop after 10 epochs in a row without improvement
    patience=10,
    # after stopping, revert to the weights from the best epoch of the monitored metric
    restore_best_weights=True,
    # keep the callback silent
    verbose=0,
)

# define custom function for performing a CV trial
def cross_validate_combination(X, y, combination, n_splits=10, epochs=50):
    """Score one hyperparameter combination with K-fold cross-validation.

    Parameters
    ----------
    X : array
        Feature matrix; rows are selected by integer index arrays.
    y : array
        Target vector aligned with the rows of ``X``.
    combination : dict
        Hyperparameter values passed to ``hypermodel.build``.
    n_splits : int, default 10
        Number of CV folds.
    epochs : int, default 50
        Maximum number of training epochs per fold (early stopping may end
        a fit sooner).

    Returns
    -------
    float
        Mean of the per-fold ``model.evaluate`` results (the compiled loss)
        on the held-out folds.
    """
    kf = KFold(n_splits=n_splits) # initialize CV splitter
    val_scores = [] # validation score of each fold

    for train_index, val_index in kf.split(X):

        # a fresh model is built for every fold of every combination; without
        # clearing, Keras keeps global state for each of them and memory use
        # grows over the whole search
        tf.keras.backend.clear_session()

        # build model with combination of hyperparameters
        model = hypermodel.build(combination)

        # slice out the training and validation rows of this fold
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        # fit model to the training part of the fold
        model.fit(X_train_fold,
                  y_train_fold,
                  callbacks=[early_stopping], # use early stopping
                  epochs=epochs, # upper bound on epochs for each fit
                  verbose=0) # disable verbose

        # evaluate model performance on the held-out part of the fold
        val_score = model.evaluate(X_val_fold,
                                   y_val_fold,
                                   verbose=0)
        val_scores.append(val_score)

    # return average validation score across all folds
    return np.mean(val_scores)

# start the search with no winner: any finite CV score is lower than +infinity
best_score, best_combination = float('inf'), None

In [None]:
# determine best hyperparameter combination based on CV score
# reset the running best inside this cell so that re-running it never
# compares new scores against a winner left over from an earlier run
best_score = float('inf')
best_combination = None

for combination in combinations:
    # mean validation loss of this combination across the CV folds
    score = cross_validate_combination(X_train,
                                       y_train,
                                       combination)
    # strict '<' keeps the earliest combination when scores tie
    if score < best_score:
        best_score = score
        best_combination = combination
print("Best Hyperparameters:", best_combination)
print("Best Score:", best_score)

2024-03-21 21:29:24.648838: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/R/4.2.2/lib/R/lib:/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/java-11-openjdk-amd64/lib/server
2024-03-21 21:29:24.648878: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2024-03-21 21:29:24.648895: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (taylor): /proc/driver/nvidia/version does not exist
2024-03-21 21:29:24.649146: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them i

In [None]:
# build version of hypermodel with best combination of hyperparameters
# (hypermodel is a MyHyperModel instance, which is not callable; its build()
#  method returns the compiled network, and the input shape is already stored
#  on the instance)
best_model = hypermodel.build(best_combination)

# fit model to training data
best_model.fit(X_train, y_train, epochs=100, verbose=1)

## Import & process testing data

In [None]:
# import testing data
test_df = pd.read_csv('data/test.csv')
test_df.columns = test_df.columns.str.lower().str.replace(' ', '_') # clean the test frame's own column names

# remove non-feature columns from the test frame
# (errors='ignore' because not every one of these columns is guaranteed to be present in the test file)
test_features = test_df.drop(columns=['id', 'unnamed:_12', 'dic'], errors='ignore')

# define feature matrix for testing data
X_test = test_features.values

# the model was built for X_train's feature count; fail early with a clear message on a mismatch
assert X_test.shape[1] == X_train.shape[1], (
    f"test data has {X_test.shape[1]} feature columns, "
    f"but the model was trained on {X_train.shape[1]}"
)

## Predict DIC for testing data & export submission

In [None]:
# predict DIC for every row of the test feature matrix
predictions = best_model.predict(X_test)

# load the submission template and normalise its header (lower case, spaces -> underscores)
submission_df = pd.read_csv('data/sample_submission.csv')
submission_df.columns = [name.lower().replace(' ', '_') for name in submission_df.columns]

# write the predictions into the 'dic' column
submission_df['dic'] = predictions

# display the completed submission table
submission_df

In [None]:
# export submission
# index=False leaves the DataFrame's row index out of the written CSV
submission_df.to_csv('linus_submission5.csv', index=False)