In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
df = pd.read_csv('train.csv')
df.set_index('id', inplace=True)

In [3]:
df

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,I,1.5250,1.1750,0.3750,28.973189,12.728926,6.647958,8.348928,9
1,I,1.1000,0.8250,0.2750,10.418441,4.521745,2.324659,3.401940,8
2,M,1.3875,1.1125,0.3750,24.777463,11.339800,5.556502,6.662133,9
3,F,1.7000,1.4125,0.5000,50.660556,20.354941,10.991839,14.996885,11
4,I,1.2500,1.0125,0.3375,23.289114,11.977664,4.507570,5.953395,8
...,...,...,...,...,...,...,...,...,...
74046,F,1.6625,1.2625,0.4375,50.660556,20.680960,10.361742,12.332033,10
74047,I,1.0750,0.8625,0.2750,10.446791,4.323299,2.296310,3.543687,6
74048,F,1.4875,1.2000,0.4125,29.483480,12.303683,7.540967,8.079607,10
74049,I,1.2125,0.9625,0.3125,16.768729,8.972617,2.919999,4.280774,8


## Creating dummy variblaes for Sex

In [4]:
# Create dummy variables for the 'Sex' column
df_with_dummies = pd.get_dummies(df, columns=['Sex']).astype(float)

# Print the updated DataFrame with dummy variables
df_with_dummies

Unnamed: 0_level_0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_F,Sex_I,Sex_M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1.5250,1.1750,0.3750,28.973189,12.728926,6.647958,8.348928,9.0,0.0,1.0,0.0
1,1.1000,0.8250,0.2750,10.418441,4.521745,2.324659,3.401940,8.0,0.0,1.0,0.0
2,1.3875,1.1125,0.3750,24.777463,11.339800,5.556502,6.662133,9.0,0.0,0.0,1.0
3,1.7000,1.4125,0.5000,50.660556,20.354941,10.991839,14.996885,11.0,1.0,0.0,0.0
4,1.2500,1.0125,0.3375,23.289114,11.977664,4.507570,5.953395,8.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
74046,1.6625,1.2625,0.4375,50.660556,20.680960,10.361742,12.332033,10.0,1.0,0.0,0.0
74047,1.0750,0.8625,0.2750,10.446791,4.323299,2.296310,3.543687,6.0,0.0,1.0,0.0
74048,1.4875,1.2000,0.4125,29.483480,12.303683,7.540967,8.079607,10.0,1.0,0.0,0.0
74049,1.2125,0.9625,0.3125,16.768729,8.972617,2.919999,4.280774,8.0,0.0,1.0,0.0


## Scaling Dataframe

In [5]:
df_scaled = df_with_dummies.copy()  # Create a copy of the DataFrame

# Get the columns to scale (excluding 'id' and 'Sex')
columns_to_scale = df.columns.drop(['Sex', 'Age'])

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the columns
df_scaled[columns_to_scale] = scaler.fit_transform(df_scaled[columns_to_scale])

# Output the scaled DataFrame
df_scaled

Unnamed: 0_level_0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_F,Sex_I,Sex_M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.721238,0.633982,0.292400,0.441804,0.467188,0.569186,0.453376,9.0,0.0,1.0,0.0
1,-0.755712,-0.840356,-0.794163,-1.025198,-0.993688,-0.978880,-0.926788,8.0,0.0,1.0,0.0
2,0.243401,0.370707,0.292400,0.110076,0.219924,0.178363,-0.017224,9.0,0.0,0.0,1.0
3,1.329394,1.634426,1.650603,2.156483,1.824616,2.124622,2.308095,11.0,1.0,0.0,0.0
4,-0.234435,-0.050532,-0.115061,-0.007598,0.333464,-0.197233,-0.214955,8.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
74046,1.199075,1.002567,0.971501,2.156483,1.882648,1.899000,1.564626,10.0,1.0,0.0,0.0
74047,-0.842591,-0.682391,-0.794163,-1.022957,-1.029011,-0.989031,-0.887242,6.0,0.0,1.0,0.0
74048,0.590919,0.739292,0.699861,0.482150,0.391495,0.888951,0.378238,10.0,1.0,0.0,0.0
74049,-0.364754,-0.261152,-0.386702,-0.523122,-0.201434,-0.765703,-0.681601,8.0,0.0,1.0,0.0


## Splitting Data into Train and Test sets

In [6]:
from sklearn.model_selection import train_test_split

# Assuming the df_scaled DataFrame is already created

# Separate the features (X) from the target variable (y)
X = df_scaled.drop(['Age'], axis=1)
y = df_scaled['Age']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("Training set shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("\nTesting set shapes:")
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)


Training set shapes:
X_train: (59240, 10)
y_train: (59240,)

Testing set shapes:
X_test: (14811, 10)
y_test: (14811,)


## Model 1: OLS

In [7]:
# Add a constant column to the X_train dataset
X_train_OLS = sm.add_constant(X_train)

# Fit the OLS regression model
model = sm.OLS(y_train, X_train_OLS)
results = model.fit()

# Print the summary of the regression results
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                    Age   R-squared:                       0.549
Model:                            OLS   Adj. R-squared:                  0.549
Method:                 Least Squares   F-statistic:                     8018.
Date:                Mon, 12 Jun 2023   Prob (F-statistic):               0.00
Time:                        11:19:24   Log-Likelihood:            -1.2882e+05
No. Observations:               59240   AIC:                         2.577e+05
Df Residuals:                   59230   BIC:                         2.578e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              7.4717      0.007   1136.

In [8]:
y_pred = results.predict(sm.add_constant(X_test))

# Calculate the squared differences between the actual and predicted values
squared_diff = (y_test - y_pred) ** 2

# Calculate the mean squared error
mse = np.mean(squared_diff)

# Calculate the absolute MSE
abs_mse = np.sqrt(mse)

# Print the absolute MSE
print("Absolute MSE:", abs_mse)


Absolute MSE: 2.1248660923595035


## Model 2: Neural Network

In [9]:
X_train.shape

(59240, 10)

In [10]:
input_dim = 10

# Define the model architecture
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(input_dim,)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(8, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer='RMSProp', loss='mse')

# Print the model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1408      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 32)                2

In [11]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Evaluate the model on test data
loss = model.evaluate(X_test, y_test)
print("Test loss:", loss)


Test loss: 4.4857401847839355


### Optuna Tuning

In [13]:
# input_dim=10

In [16]:
import time
import optuna
from tensorflow import keras
from tensorflow.keras import layers
import logging
from optuna.samplers import TPESampler

def create_model(n_layers, n_units, dropout_rate, activation):
    model = keras.Sequential()
    model.add(layers.Dense(n_units[0], activation=activation, input_shape=(input_dim,)))
    model.add(layers.Dropout(dropout_rate[0]))
    for i in range(1, n_layers):
        model.add(layers.Dense(n_units[i], activation=activation))
        model.add(layers.Dropout(dropout_rate[i]))
    model.add(layers.Dense(1, activation='linear'))
    return model

def objective(trial):
    # Define the hyperparameters
    n_layers = trial.suggest_int('n_layers', 1, 4)  # number of layers
    n_units = [trial.suggest_int(f'n_units_{i}', 4, 96) for i in range(n_layers)]  # units per layer
    dropout_rate = [trial.suggest_float(f'dropout_rate_{i}', 0.0, 0.3) for i in range(n_layers)]  # dropout rate
    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'sigmoid'])  # activation function

    # Record the start time
    start_time = time.time()

    # Build, compile and fit the model
    model = create_model(n_layers, n_units, dropout_rate, activation)
    model.compile(optimizer='RMSProp', loss='mse')
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, verbose=0)

    # Record the end time and calculate the duration
    end_time = time.time()
    trial.set_user_attr('duration', end_time - start_time)

    # Evaluate and return the validation loss
    val_loss = history.history['val_loss'][-1]
    return val_loss

def print_trial(study, trial):
    print('Number of finished trials: ', len(study.trials))
    print('Best trial so far:')
    print('  Value: ', study.best_value)
    print('  Duration: ', study.best_trial.user_attrs['duration'])
    print('  Params: ')
    for key, value in study.best_params.items():
        print(f'    {key}: {value}')

# Configure the logging level
optuna.logging.set_verbosity(optuna.logging.INFO)

def gamma():
    return 0.2

my_sampler = TPESampler(n_startup_trials=7,    # Number of trials to use random sampling
                        n_ei_candidates=24,
                       gamma=gamma())     # Number of samples for EI candidates

# Run the optimization
study = optuna.create_study(direction='minimize', sampler=my_sampler)
study.optimize(objective, n_trials=100, callbacks=[print_trial])

# Print the final result
best_trial = study.best_trial
print('Best trial:')
print(f'  Value: {best_trial.value}')
print('  Params: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')


[I 2023-06-12 12:58:11,728] A new study created in memory with name: no-name-713bbf58-ad81-4a18-b4a9-b04ca4d127bb
[I 2023-06-12 13:00:17,302] Trial 0 finished with value: 4.236907005310059 and parameters: {'n_layers': 4, 'n_units_0': 61, 'n_units_1': 47, 'n_units_2': 70, 'n_units_3': 67, 'dropout_rate_0': 0.060084085887160486, 'dropout_rate_1': 0.15256935769204352, 'dropout_rate_2': 0.2916238563585122, 'dropout_rate_3': 0.02533094673172136, 'activation': 'sigmoid'}. Best is trial 0 with value: 4.236907005310059.


Number of finished trials:  1
Best trial so far:
  Value:  4.236907005310059
  Duration:  125.57206773757935
  Params: 
    n_layers: 4
    n_units_0: 61
    n_units_1: 47
    n_units_2: 70
    n_units_3: 67
    dropout_rate_0: 0.060084085887160486
    dropout_rate_1: 0.15256935769204352
    dropout_rate_2: 0.2916238563585122
    dropout_rate_3: 0.02533094673172136
    activation: sigmoid


[I 2023-06-12 13:01:37,961] Trial 1 finished with value: 4.308379650115967 and parameters: {'n_layers': 2, 'n_units_0': 55, 'n_units_1': 4, 'dropout_rate_0': 0.003866842456680497, 'dropout_rate_1': 0.10257705096907463, 'activation': 'relu'}. Best is trial 0 with value: 4.236907005310059.


Number of finished trials:  2
Best trial so far:
  Value:  4.236907005310059
  Duration:  125.57206773757935
  Params: 
    n_layers: 4
    n_units_0: 61
    n_units_1: 47
    n_units_2: 70
    n_units_3: 67
    dropout_rate_0: 0.060084085887160486
    dropout_rate_1: 0.15256935769204352
    dropout_rate_2: 0.2916238563585122
    dropout_rate_3: 0.02533094673172136
    activation: sigmoid


[I 2023-06-12 13:03:02,088] Trial 2 finished with value: 4.370059490203857 and parameters: {'n_layers': 2, 'n_units_0': 23, 'n_units_1': 53, 'dropout_rate_0': 0.27939390147482546, 'dropout_rate_1': 0.2895580591588589, 'activation': 'tanh'}. Best is trial 0 with value: 4.236907005310059.


Number of finished trials:  3
Best trial so far:
  Value:  4.236907005310059
  Duration:  125.57206773757935
  Params: 
    n_layers: 4
    n_units_0: 61
    n_units_1: 47
    n_units_2: 70
    n_units_3: 67
    dropout_rate_0: 0.060084085887160486
    dropout_rate_1: 0.15256935769204352
    dropout_rate_2: 0.2916238563585122
    dropout_rate_3: 0.02533094673172136
    activation: sigmoid


[I 2023-06-12 13:04:36,549] Trial 3 finished with value: 4.311262130737305 and parameters: {'n_layers': 2, 'n_units_0': 93, 'n_units_1': 62, 'dropout_rate_0': 0.235718296826511, 'dropout_rate_1': 0.1026221976537813, 'activation': 'relu'}. Best is trial 0 with value: 4.236907005310059.


Number of finished trials:  4
Best trial so far:
  Value:  4.236907005310059
  Duration:  125.57206773757935
  Params: 
    n_layers: 4
    n_units_0: 61
    n_units_1: 47
    n_units_2: 70
    n_units_3: 67
    dropout_rate_0: 0.060084085887160486
    dropout_rate_1: 0.15256935769204352
    dropout_rate_2: 0.2916238563585122
    dropout_rate_3: 0.02533094673172136
    activation: sigmoid


[I 2023-06-12 13:05:52,240] Trial 4 finished with value: 4.248249530792236 and parameters: {'n_layers': 2, 'n_units_0': 17, 'n_units_1': 24, 'dropout_rate_0': 0.06740527212840927, 'dropout_rate_1': 0.0031468702279227154, 'activation': 'relu'}. Best is trial 0 with value: 4.236907005310059.


Number of finished trials:  5
Best trial so far:
  Value:  4.236907005310059
  Duration:  125.57206773757935
  Params: 
    n_layers: 4
    n_units_0: 61
    n_units_1: 47
    n_units_2: 70
    n_units_3: 67
    dropout_rate_0: 0.060084085887160486
    dropout_rate_1: 0.15256935769204352
    dropout_rate_2: 0.2916238563585122
    dropout_rate_3: 0.02533094673172136
    activation: sigmoid


[I 2023-06-12 13:07:37,376] Trial 5 finished with value: 4.246252059936523 and parameters: {'n_layers': 3, 'n_units_0': 90, 'n_units_1': 56, 'n_units_2': 32, 'dropout_rate_0': 0.04979146064049926, 'dropout_rate_1': 0.2462140279516597, 'dropout_rate_2': 0.04878254650094064, 'activation': 'tanh'}. Best is trial 0 with value: 4.236907005310059.


Number of finished trials:  6
Best trial so far:
  Value:  4.236907005310059
  Duration:  125.57206773757935
  Params: 
    n_layers: 4
    n_units_0: 61
    n_units_1: 47
    n_units_2: 70
    n_units_3: 67
    dropout_rate_0: 0.060084085887160486
    dropout_rate_1: 0.15256935769204352
    dropout_rate_2: 0.2916238563585122
    dropout_rate_3: 0.02533094673172136
    activation: sigmoid


[I 2023-06-12 13:09:27,144] Trial 6 finished with value: 4.555200576782227 and parameters: {'n_layers': 4, 'n_units_0': 52, 'n_units_1': 96, 'n_units_2': 8, 'n_units_3': 19, 'dropout_rate_0': 0.24227921084970258, 'dropout_rate_1': 0.272840335532881, 'dropout_rate_2': 0.1651310543057998, 'dropout_rate_3': 0.02168386822798377, 'activation': 'tanh'}. Best is trial 0 with value: 4.236907005310059.
[W 2023-06-12 13:09:27,146] Trial 7 failed with parameters: {} because of the following error: TypeError("'float' object is not callable").
Traceback (most recent call last):
  File "C:\Users\jamie\miniconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\jamie\AppData\Local\Temp\ipykernel_11500\1056886358.py", line 20, in objective
    n_layers = trial.suggest_int('n_layers', 1, 4)  # number of layers
  File "C:\Users\jamie\miniconda3\lib\site-packages\optuna\trial\_trial.py", line 321, in suggest_int
    suggested_value =

Number of finished trials:  7
Best trial so far:
  Value:  4.236907005310059
  Duration:  125.57206773757935
  Params: 
    n_layers: 4
    n_units_0: 61
    n_units_1: 47
    n_units_2: 70
    n_units_3: 67
    dropout_rate_0: 0.060084085887160486
    dropout_rate_1: 0.15256935769204352
    dropout_rate_2: 0.2916238563585122
    dropout_rate_3: 0.02533094673172136
    activation: sigmoid


TypeError: 'float' object is not callable

Best trial:
  Value: 4.191659450531006
  Params: 
    n_layers: 2
    n_units_0: 96
    n_units_1: 80
    dropout_rate_0: 0.0990228063551524
    dropout_rate_1: 0.031520467478229425
    activation: sigmoid