## Background

You will use the data contained in the train.csv file to train a model that will predict **dissolved inorganic carbon (DIC)** content in the water samples.

In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l1_l2
from keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

2024-03-20 13:34:48.200456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-20 13:34:48.313773: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-20 13:34:48.317701: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/R/4.2.2/lib/R/lib:/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/j

In [2]:
# Reduce TensorFlow log noise.
# NOTE: TF_CPP_MIN_LOG_LEVEL is generally only honoured by TensorFlow's native
# logger when it is set *before* `import tensorflow`. TensorFlow was already
# imported in the first cell, so this assignment alone comes too late for the
# current kernel; move it above the import to silence the start-up messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0 = default, 1 = no INFO, 2 = no INFO and WARNING, 3 = no INFO, WARNING, and ERROR

# Also lower the Python-side TensorFlow logger, which can still be changed after import.
tf.get_logger().setLevel('ERROR')

In [3]:
# Turn off scientific notation: show floats with three decimals
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Set seeds.
# NumPy's seed alone does not cover the Keras model built later in this notebook:
# weight initialisation, dropout masks and batch shuffling draw from TensorFlow's
# random generator, so it is seeded with the same value.
np.random.seed(123)
tf.random.set_seed(123)

In [4]:
# Import data and normalise the column names (lower case, spaces -> underscores)
train_df = pd.read_csv('data/train.csv')
train_df.columns = train_df.columns.str.lower().str.replace(' ', '_')

# Data exploration
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1454 entries, 0 to 1453
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1454 non-null   int64  
 1   lat_dec            1454 non-null   float64
 2   lon_dec            1454 non-null   float64
 3   no2um              1454 non-null   float64
 4   no3um              1454 non-null   float64
 5   nh3um              1454 non-null   float64
 6   r_temp             1454 non-null   float64
 7   r_depth            1454 non-null   int64  
 8   r_sal              1454 non-null   float64
 9   r_dynht            1454 non-null   float64
 10  r_nuts             1454 non-null   float64
 11  r_oxy_micromol.kg  1454 non-null   float64
 12  unnamed:_12        0 non-null      float64
 13  po4um              1454 non-null   float64
 14  sio3um             1454 non-null   float64
 15  ta1.x              1454 non-null   float64
 16  salinity1          1454 

In [5]:
# Check distribution of outcome variable
# NOTE(review): the plotting lines below are disabled. Re-enabling them requires
# seaborn (`sns`) and matplotlib.pyplot (`plt`), neither of which is imported in
# the visible cells of this notebook.
#sns.histplot(train_df['dic'], kde=False)
#plt.show()


## Preprocess

In [6]:
# Remove columns that should not be used as predictors:
#   - 'unnamed:_12' has 0 non-null values according to the info() summary
#   - 'id' is presumably only a row identifier (TODO confirm)
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone no longer raises a KeyError.
train_df = train_df.drop(columns=['id', 'unnamed:_12'], errors='ignore')

## Build & train model

In [255]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Lambda, LeakyReLU
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [140]:
# Split the training frame into the target ('dic') and the predictor columns
y_train = train_df['dic'].to_numpy()
X_train = train_df.drop(columns='dic').to_numpy()

# Features: standardise with a scaler fitted on the training features
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)

# Target: min-max scale; the scaler expects a 2-D column, hence the reshape
target_scaler = MinMaxScaler()
y_train_scaled = target_scaler.fit_transform(y_train.reshape(-1, 1))

# X_train_scaled and y_train_scaled now hold the scaled training data.
# No separate validation arrays are created in this cell.


In [361]:
# Early-stopping callback:
#   - watches the validation loss and treats lower values as better
#   - any decrease counts as an improvement (min_delta=0)
#   - gives up after 10 epochs without improvement and prints a message
#   - restores the weights from the epoch with the best monitored value
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)


def create_model(input_shape=None, dropout_rate=0.1, learning_rate=0.001, beta_1=0.8):
    """Build and compile the deep, regularised dense network used to predict DIC.

    NOTE(review): a later cell defines another ``create_model``; once that cell
    has run it replaces this definition.

    Parameters
    ----------
    input_shape : tuple, int or None, optional
        Shape of a single input sample, e.g. ``(n_features,)``. A bare integer
        is wrapped into a 1-tuple. ``None`` (the default) resolves to
        ``(X_train_scaled.shape[1],)`` at call time, so the function can be
        defined before the training data exists.
    dropout_rate : float, optional
        Rate passed to every ``Dropout`` layer.
    learning_rate : float, optional
        Learning rate of the Adam optimizer.
    beta_1 : float, optional
        ``beta_1`` of the Adam optimizer.

    Returns
    -------
    Sequential
        The model, compiled with the Adam optimizer and mean-squared-error loss.
    """
    # Resolve the input shape lazily and always hand Keras a tuple.
    if input_shape is None:
        input_shape = (X_train_scaled.shape[1],)
    elif np.ndim(input_shape) == 0:
        input_shape = (int(input_shape),)
    else:
        input_shape = tuple(input_shape)

    def reg():
        # Fresh regulariser instance per layer, same strengths everywhere.
        return l1_l2(l1=0.01, l2=0.01)

    optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1)

    # Only the first layer needs `input_shape`; it was redundant on the inner layers.
    # NOTE(review): each LeakyReLU below directly follows a Dense layer that
    # already applies 'relu'. relu outputs are non-negative, so the LeakyReLU
    # acts as an identity there. The layers are kept to leave the architecture
    # unchanged; drop `activation='relu'` if a leaky activation was intended.
    model = Sequential([
        Dense(64, activation='relu', kernel_regularizer=reg(), input_shape=input_shape),
        LeakyReLU(alpha=0.01),
        Dense(64, activation='elu', kernel_regularizer=reg()),
        Dropout(dropout_rate),
        Dense(64, activation='relu', kernel_regularizer=reg()),
        LeakyReLU(alpha=0.01),
        Dropout(dropout_rate),
        Dense(64, activation='swish', kernel_regularizer=reg()),
        Dropout(dropout_rate),
        Dense(64, activation='relu', kernel_regularizer=reg()),
        LeakyReLU(alpha=0.01),
        Dense(64, activation='elu', kernel_regularizer=reg()),
        Dropout(dropout_rate),
        Dense(64, activation='relu', kernel_regularizer=reg()),
        LeakyReLU(alpha=0.01),
        Dense(64, kernel_regularizer=reg()),
        # Mish activation: x * tanh(softplus(x)). tf.math.softplus replaces the
        # hand-written log(1 + exp(x)), whose exp() overflows for large inputs.
        Lambda(lambda x: x * tf.tanh(tf.math.softplus(x))),
        Dense(64, activation='relu', kernel_regularizer=reg()),
        LeakyReLU(alpha=0.01),
        Dense(1, activation='linear', kernel_regularizer=reg())
    ])

    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Create the model; the input shape is one entry per feature column of the
# scaled training matrix. (`X` is not defined in this notebook, so the shape is
# taken from X_train_scaled.)
test_model = create_model(input_shape=(X_train_scaled.shape[1],))

In [692]:
# Early-stopping callback (this assignment replaces any earlier `early_stopping`):
#   - watches the validation loss and treats lower values as better
#   - a decrease must be at least 0.05 to count as an improvement
#   - gives up after 10 epochs without improvement and prints a message
#   - restores the weights from the epoch with the best monitored value
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.05,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)


def create_model(input_shape=None, dropout_rate=0.1, learning_rate=0.0001, beta_1=0.8):
    """Build and compile a small regularised dense network used to predict DIC.

    Architecture: Dense(64, elu) -> Dropout -> Dense(64, relu) -> Dropout ->
    Dense(1, linear). Every Dense layer uses an L1+L2 kernel regulariser
    (0.01 each).

    Parameters
    ----------
    input_shape : tuple, int or None, optional
        Shape of a single input sample, e.g. ``(n_features,)``. A bare integer
        is wrapped into a 1-tuple. ``None`` (the default) resolves to
        ``(X_train_scaled.shape[1],)`` at call time, so the function can be
        defined before the training data exists.
    dropout_rate : float, optional
        Rate passed to both ``Dropout`` layers.
    learning_rate : float, optional
        Learning rate of the Adam optimizer.
    beta_1 : float, optional
        ``beta_1`` of the Adam optimizer.

    Returns
    -------
    Sequential
        The model, compiled with the Adam optimizer and mean-squared-error loss.
    """
    # Resolve the input shape lazily and always hand Keras a tuple.
    if input_shape is None:
        input_shape = (X_train_scaled.shape[1],)
    elif np.ndim(input_shape) == 0:
        input_shape = (int(input_shape),)
    else:
        input_shape = tuple(input_shape)

    optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1)

    model = Sequential([
        Dense(64, activation='elu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01), input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
        Dropout(dropout_rate),
        Dense(1, activation='linear', kernel_regularizer=l1_l2(l1=0.01, l2=0.01))
    ])

    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Create the model; the input shape is one entry per feature column of the
# scaled training matrix. (`X` is not defined in this notebook, so the shape is
# taken from X_train_scaled.)
test_model = create_model(input_shape=(X_train_scaled.shape[1],))

In [693]:
# Train `test_model` on the scaled training features and scaled target.
fit_options = dict(
    epochs=50,                   # upper bound on the number of epochs
    batch_size=16,
    validation_split=0.5,        # fraction of the training data used as validation data
    verbose=0,                   # no per-epoch progress output
    callbacks=[early_stopping],  # may end training before 50 epochs
)
history = test_model.fit(X_train_scaled, y_train_scaled, **fit_options)

# `history.history` keeps the per-epoch loss values ('loss' and 'val_loss').


In [694]:
# Training loss (on the scaled target, regularisation penalties included) of the last epoch that ran
history.history['loss'][-1]


0.5991277694702148

In [695]:
# Validation loss of the last epoch that ran.
# NOTE(review): the callback uses restore_best_weights=True, so if early stopping
# triggered, the model's weights may come from an earlier (best) epoch rather
# than from this last one; compare with min(history.history['val_loss']).
history.history['val_loss'][-1]

0.5750105381011963

In [696]:
# Predictions (still in the scaled target space) for the full scaled training
# matrix, i.e. the same rows the model was fitted/validated on, not a held-out set.
y_scaled_pred = test_model.predict(X_train_scaled)



In [697]:
# Map the scaled predictions back to the original 'dic' scale with the scaler fitted on y_train
y_pred_original = target_scaler.inverse_transform(y_scaled_pred)

In [698]:
# MSE on the original 'dic' scale between y_train and the predictions made for
# X_train_scaled. This covers all training rows (including those Keras held out
# via validation_split), so it is not an independent validation score.
mean_squared_error(y_train, y_pred_original)

1429.7196134847188

In [None]:
# Later, to transform predictions back to the original scale:
# predictions_original_scale = scaler.inverse_transform(predictions_scaled)

# NOTE(review): `grid_result` is not defined in any visible cell of this notebook
# (no GridSearchCV is fitted), so this cell raises NameError on a fresh kernel
# unless a grid search is run first.

# Retrieve the index of the best model
best_index = grid_result.best_index_

# Extract the mean and standard deviation of the CV test score for the best model
# NOTE(review): whether this score is an MSE (and its sign) depends on the
# `scoring` the grid search was configured with, which is not visible here.
mse = grid_result.cv_results_['mean_test_score'][best_index]
std_mse = grid_result.cv_results_['std_test_score'][best_index]

# Print the results
print(f"MSE for the best model during CV: {mse}")
print(f"Standard deviation of MSE for the best model during CV: {std_mse}")

## Predict testing data

In [None]:
# Import the test data and normalise its *own* column names. (Copying the names
# from train_df would mislabel the columns or fail on a length mismatch, since
# train_df has had columns dropped and contains the target.)
test_df = pd.read_csv('data/test.csv')
test_df.columns = test_df.columns.str.lower().str.replace(' ', '_')

# Define target vector: only available if the test file actually ships 'dic'
y_test = test_df['dic'].values if 'dic' in test_df.columns else None

# Define feature matrix: drop the same non-feature columns as for training.
# Assumes test.csv lists the feature columns in the same order as train.csv
# (TODO confirm); the features are matched by position, not by name.
X_test = test_df.drop(columns=['id', 'unnamed:_12', 'dic'], errors='ignore').values
assert X_test.shape[1] == X_train.shape[1], (
    f"test data has {X_test.shape[1]} feature columns, training data has {X_train.shape[1]}"
)

# Scale with the scaler fitted on the training features. Fitting a new scaler on
# the test data would put it on a different scale than the model was trained on.
X_test_scaled = feature_scaler.transform(X_test)

In [None]:

# Step 1: Pick the model used for prediction.
# NOTE(review): no grid search (`grid_result`) is run in the visible cells; the
# only fitted model is `test_model`, so it is used here.
best_model = test_model

# Step 2: Prepare the test dataset exactly like the training data:
# drop the non-feature columns ('dic' might not be in the test dataset) and
# apply the feature scaler that was fitted on the training features.
X_test = test_df.drop(columns=['id', 'unnamed:_12', 'dic'], errors='ignore')
X_test_scaled = feature_scaler.transform(X_test.values)

# Step 3: Predict with the model (the History object returned by fit() has no
# predict method) and map the scaled predictions back to the original 'dic'
# scale. The result is flattened to 1-D so it can be assigned to a column.
predictions_scaled = np.asarray(best_model.predict(X_test_scaled)).reshape(-1, 1)
predictions = target_scaler.inverse_transform(predictions_scaled).ravel()

In [None]:
# Load the sample submission, normalise its headers (lower case, spaces ->
# underscores) and store the predicted values in the 'dic' column.
# NOTE(review): confirm the submission format accepts a lower-case 'dic' header.
submission_df = (
    pd.read_csv('data/sample_submission.csv')
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .assign(dic=predictions)
)

submission_df

In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the file
submission_df.to_csv('linus_submission5.csv', index=False)