<a href="https://colab.research.google.com/github/jstephens/tideprediction/blob/main/Tide_Prediction_CNN_and_Regression_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import datetime
import joblib
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KDTree
from scipy.spatial.distance import euclidean
from tensorflow.python.framework.errors_impl import ResourceExhaustedError
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense, Reshape, Conv2DTranspose,Lambda, Dropout, TimeDistributed
from tensorflow.keras.models import Model
from sklearn.cluster import KMeans
import random
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import normalize

In [2]:
# Mount Google Drive so the notebook can read its inputs and write its outputs.
from google.colab import drive
drive.mount('/content/drive')

# Absolute path below the mount point, so file access does not depend on the
# current working directory. The trailing slash is required: every path in
# this notebook is built by plain string concatenation onto base_dir.
base_dir = '/content/drive/MyDrive/'

Mounted at /content/drive


# Data Load and Transformation

In [3]:
# Raw inputs: two .npz archives with the features and a CSV with the targets,
# indexed by 'id_sequence'. base_dir already ends with '/', so the relative
# parts must not start with one.
X_train = np.load(base_dir + 'source/X_train_surge_new.npz')
X_test = np.load(base_dir + 'source/X_test_surge_new.npz')
Y_train = pd.read_csv(base_dir + 'source/Y_train_surge.csv', index_col='id_sequence')


def _npz_to_frame(archive):
    """Turn a loaded .npz archive into a DataFrame indexed by 'id_sequence'.

    Every entry of the archive becomes one column (the dict is loaded with one
    row per entry and then transposed).
    """
    per_key = {item: archive[item] for item in archive.files}
    return pd.DataFrame.from_dict(per_key, orient='index').T.set_index('id_sequence')


df_X_test = _npz_to_frame(X_test)
df_X_train = _npz_to_frame(X_train)

In [4]:
def normalization(df):
  """Normalize every array stored in the ``slp`` column of ``df``.

  Each cell of ``df['slp']`` is iterated and every item it yields is passed
  through ``normalize`` (imported from ``sklearn.preprocessing``); the cell is
  replaced by the list of normalized results.

  The column is written back with a single whole-column assignment. Writing
  cell by cell through ``df['slp'].iloc[i] = ...`` is chained assignment: it
  goes through an intermediate Series and is not guaranteed to reach ``df``.

  Args:
    df: DataFrame with an ``slp`` column whose cells are iterables of arrays
      accepted by ``normalize`` (presumably 2-D fields - TODO confirm).

  Returns:
    The same ``df`` object, updated in place, for call-chaining convenience.
  """
  df['slp'] = df['slp'].apply(
      lambda inner_arrays: [normalize(arr) for arr in inner_arrays]
  )
  return df

In [5]:
# Normalize the 'slp' arrays of both frames. From here on X_test / X_train are
# DataFrames; they no longer refer to the raw np.load results from above.
X_test = normalization(df_X_test)
X_train = normalization(df_X_train)

In [6]:
def load_and_preprocess_sequences(df, use_shortened_data=True, seqcount=13,
                                  stride=3, image_shape=(41, 41)):
    """Build a 5-D image-sequence array from the ``slp`` column of ``df``.

    For every row, every ``stride``-th image of the ``slp`` cell is kept, then
    the last ``seqcount`` of those are selected and a trailing channel axis of
    size 1 is added.

    Args:
        df: DataFrame with an ``slp`` column; each cell is a sequence of
            images that supports slicing (list or array).
        use_shortened_data: Has no effect on the result; accepted so existing
            calls keep working. NOTE(review): confirm whether a separate
            "full data" path was ever intended.
        seqcount: Number of trailing images kept per row.
        stride: Sub-sampling step applied to each row's image sequence.
        image_shape: ``(height, width)`` of a single image.

    Returns:
        Array of shape ``(len(df), seqcount, height, width, 1)``.

    Raises:
        ValueError: If the rows do not all provide the same number of images,
            or provide fewer than ``seqcount`` images after sub-sampling.
            Without this check a too-short input could be reshaped into
            samples that mix images from different rows.
    """
    height, width = image_shape

    subsampled = df['slp'].apply(lambda frames: frames[::stride])
    image_sequences = np.array(subsampled.tolist())

    # Nothing to stack: return an empty array with the documented layout.
    if image_sequences.size == 0:
        return np.empty((0, seqcount, height, width, 1))

    if image_sequences.ndim < 2 or image_sequences.shape[1] < seqcount:
        raise ValueError(
            f"Every row must provide the same number of images and at least "
            f"{seqcount} of them after keeping every {stride}-th image; "
            f"got an array of shape {image_sequences.shape}."
        )

    image_sequences = image_sequences[:, -seqcount:]
    return image_sequences.reshape(-1, seqcount, height, width, 1)

In [7]:
# Image-sequence arrays for the autoencoder. This has to run while X_train /
# X_test still contain the 'slp' column.
reshaped_seq_train = load_and_preprocess_sequences(X_train, use_shortened_data=True)
reshaped_seq_test = load_and_preprocess_sequences(X_test, use_shortened_data=True)

In [8]:
def flatten_array(row):
    """Flatten one row into a single pandas Series of scalars.

    NumPy arrays found in ``row`` are unravelled and contribute all of their
    elements in order; any other value is kept as a single element.
    """
    pieces = [
        cell.ravel() if isinstance(cell, np.ndarray) else [cell]
        for cell in row
    ]
    return pd.Series([value for piece in pieces for value in piece])

def X_prep(df):
  """Build the flat feature table used by the regression models.

  Keeps the four surge-related columns, flattens the arrays in each row with
  ``flatten_array`` and names the resulting columns ``<source column>_<k>``
  for ``k`` in 0..9. The naming therefore assumes 10 values per source column
  (40 columns in total).

  Args:
    df: DataFrame containing the four columns listed below.

  Returns:
    A new DataFrame with the same index as ``df`` and the flattened columns.
  """
  included_columns = ['surge1_input','surge2_input','t_surge1_output','t_surge2_output']
  values_per_column = 10

  expandedcol_list = [
      f"{name}_{position}"
      for name in included_columns
      for position in range(values_per_column)
  ]

  flattened = df[included_columns].apply(flatten_array, axis=1)
  flattened.columns = expandedcol_list
  return flattened

In [9]:
# Replace both frames with the flat surge feature tables built by X_prep.
# After this cell X_train / X_test only hold the expanded surge columns (no
# 'slp'), so the image sequences must already have been built above.
X_train = X_prep(X_train)
X_test = X_prep(X_test)

# GPU Maintenance

In [10]:
# Check if a GPU is available; nothing is configured when the list is empty
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Make only the first GPU visible to TensorFlow and turn on memory growth
    # for that device
    tf.config.set_visible_devices(gpus[0], 'GPU')
    tf.config.experimental.set_memory_growth(gpus[0], True)

In [7]:
# Clear the Keras backend session before any model is built below.
tf.keras.backend.clear_session()

# Regression Modeling

In [12]:
def regression_model(cluster_train_labels, cluster_test_labels,clustercount,iteration):
  """Fit one multi-output linear regression per cluster and export predictions.

  Reads the module-level ``X_train``, ``X_test``, ``Y_train``, ``base_dir``
  and ``timestamp``. For every cluster id in ``range(clustercount)`` the
  training rows of that cluster are scored with 5-fold cross-validation
  (mean squared error), a model is fitted on them, and the test rows assigned
  to the same cluster are predicted. All predictions are written to one CSV
  file under ``{base_dir}Predictions/``.

  Args:
    cluster_train_labels: Cluster id for every row of ``X_train``.
    cluster_test_labels: Cluster id for every row of ``X_test``.
    clustercount: Number of clusters; ids are expected to be 0..clustercount-1.
    iteration: Grid-search iteration number, used in the file name and log.

  Returns:
    ``[avg_mse, median_mse]``: mean and median over the per-cluster
    cross-validated MSE values.
  """
  import os  # local import: only needed to make sure the output folder exists

  # Side effect kept on purpose: the labels are stored on the module-level
  # feature tables so they can be inspected after the run.
  X_train['Cluster'] = cluster_train_labels
  X_test['Cluster'] = cluster_test_labels

  # The cluster id is bookkeeping, not a predictor (it is constant inside
  # every per-cluster subset), so it is kept out of the model inputs.
  feature_columns = [col for col in X_train.columns if col != 'Cluster']
  cluster_scores = []

  # One row per test sample and one column per target; rows stay at zero
  # until the cluster they belong to has been predicted.
  entire_y_pred_df = pd.DataFrame(
      np.zeros((len(X_test), len(Y_train.columns))),
      index=X_test.index,
      columns=Y_train.columns,
  )

  for cluster_no in range(clustercount):
    c_X_train = X_train.loc[X_train['Cluster'] == cluster_no, feature_columns]
    c_X_test = X_test.loc[X_test['Cluster'] == cluster_no, feature_columns]
    c_y_train = Y_train.loc[c_X_train.index]

    # Create a multi-output linear regression model
    model = MultiOutputRegressor(LinearRegression())

    # Cross-validated score for this cluster; sklearn reports the negated MSE
    scores = cross_val_score(model, c_X_train, c_y_train, cv=5, scoring='neg_mean_squared_error')
    cluster_scores.append(-scores.mean())

    # A cluster may receive no test rows at all; there is nothing to predict
    # then, and calling predict on an empty frame would raise.
    if len(c_X_test) > 0:
      model.fit(c_X_train, c_y_train)
      y_pred_df = pd.DataFrame(model.predict(c_X_test),
                               index=c_X_test.index,
                               columns=Y_train.columns)
      entire_y_pred_df.update(y_pred_df)

  predictions_dir = f"{base_dir}Predictions"
  os.makedirs(predictions_dir, exist_ok=True)
  # Use the number of clusters, not the last loop index (clustercount - 1).
  predictions_file = f"{predictions_dir}/{timestamp}_{iteration}_y_predictions_{clustercount}_clusters.csv"

  entire_y_pred_df.to_csv(predictions_file)
  print(f"Iteration {iteration}: predictions for {clustercount} clusters exported successfully.")

  avg_mse = np.mean(cluster_scores)
  median_mse = np.median(cluster_scores)
  mse_scores= [avg_mse, median_mse]

  return mse_scores

# Grid Search

`export_grid_search_results` writes the statistics gathered during the grid search (one row per parameter combination and cluster count) to a CSV file. Unsuccessful iterations - where the clustering couldn't be applied properly or there wasn't enough memory to test the set of parameters - are listed separately at the end of the same file.

In [13]:
def export_grid_search_results(grid_search_df,unsuccessful_iterations,oom_list):
    """Write the grid-search summary CSV and append the failure lists to it.

    The file is ``{base_dir}{timestamp}_summary_stats.csv`` (both names are
    module-level variables). The statistics table is written first; a text
    trailer is then appended to the same file: four blank lines, followed by
    one optional section per non-empty failure list.

    Args:
        grid_search_df: DataFrame with one row per evaluated configuration.
        unsuccessful_iterations: Entries describing failed combinations.
        oom_list: Parameter sets that ran out of memory.
    """
    summary_file_path = f'{base_dir}{timestamp}_summary_stats.csv'
    grid_search_df.to_csv(summary_file_path, index=False, sep=',')

    trailer = ['\n\n\n\n']
    sections = (
        ('\nUnsuccessful Parameter Combinations:', unsuccessful_iterations),
        ('\nInsufficient Memory:', oom_list),
    )
    for heading, entries in sections:
        if len(entries) >= 1:
            trailer.append(heading)
            trailer.extend(f'\n{entry}' for entry in entries)

    with open(summary_file_path, "a") as file:
        file.write(''.join(trailer))

    print("Grid search summary successfully exported.")

In [14]:
def create_autoencoder_model(latent_dim, conv1_filters, conv2_filters, dropout_rate, l_r):
    """Build and compile the per-frame convolutional autoencoder.

    The model takes sequences of shape (13, 41, 41, 1). Every layer is wrapped
    in ``TimeDistributed``, i.e. applied to each of the 13 frames separately.

    Layer order: conv -> conv -> flatten -> dropout -> dense(latent_dim) ->
    dense(41*41*64) -> reshape -> transposed conv -> transposed conv (1 channel).
    The latent layer is the 5th layer from the end; ``perform_grid_search``
    relies on that position (``layers[-5]``), so layers must not be added or
    removed after it without updating that index.

    NOTE(review): 'softmax' is used as the activation of the hidden layers;
    confirm this is intended.

    Args:
        latent_dim: Units of the per-frame latent dense layer.
        conv1_filters: Filters of the first convolution.
        conv2_filters: Filters of the second convolution.
        dropout_rate: Dropout rate applied before the latent layer.
        l_r: Learning rate of the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` (loss: mean squared error).
    """
    sequence_shape = (13, 41, 41, 1)
    decoder_map_shape = (41, 41, 64)

    def per_frame(layer):
        # Apply `layer` independently to every frame of the sequence.
        return TimeDistributed(layer)

    inputs = Input(shape=sequence_shape)

    # Encoder
    encoded = per_frame(Conv2D(conv1_filters, 3, activation='softmax', padding='same'))(inputs)
    encoded = per_frame(Conv2D(conv2_filters, 3, activation='softmax', padding='same'))(encoded)
    encoded = per_frame(Flatten())(encoded)
    encoded = per_frame(Dropout(dropout_rate))(encoded)
    latent = per_frame(Dense(latent_dim, activation='softmax'))(encoded)

    # Decoder
    decoded = per_frame(Dense(np.prod(decoder_map_shape), activation='softmax'))(latent)
    decoded = per_frame(Reshape(decoder_map_shape))(decoded)
    decoded = per_frame(Conv2DTranspose(32, 3, activation='softmax', padding='same'))(decoded)
    outputs = per_frame(Conv2DTranspose(1, 3, activation='sigmoid', padding='same'))(decoded)

    autoencoder = Model(inputs, outputs)
    autoencoder.compile(optimizer=Adam(learning_rate=l_r), loss='mse')
    return autoencoder

In [17]:
def _sample_params(param_grid, rng):
    """Pick one random value per tuned hyperparameter from ``param_grid``."""
    names = ('latent_dim', 'batch_size', 'epochs', 'conv1_filters',
             'conv2_filters', 'dropout_rate', 'learning_rate')
    return {name: rng.choice(param_grid[name]) for name in names}


def perform_grid_search(param_grid, num_iterations,use_lin_reg, seed=None):
    """Random search over autoencoder settings, scored through clustering.

    For each iteration one value per hyperparameter is drawn from
    ``param_grid``, an autoencoder is trained on the module-level
    ``reshaped_seq_train``, and the per-frame latent codes of the train and
    test sequences are flattened to one vector per sample. For 2 to 5
    clusters, KMeans is fitted on the train codes and the fitted model assigns
    the test codes to those same clusters. Clustering quality metrics are
    computed for both sets and, when ``use_lin_reg`` is true, per-cluster
    regressions are evaluated with ``regression_model``.

    All rows are collected and exported with ``export_grid_search_results``.

    Args:
        param_grid: Dict mapping each hyperparameter name to its candidates.
        num_iterations: Number of random parameter combinations to try.
        use_lin_reg: Whether to run the per-cluster regression; when false the
            two MSE columns are filled with NaN.
        seed: Optional seed for the parameter sampling. ``None`` (default)
            draws from the global ``random`` module.
    """
    exportdf_columns = ['Iteration No','Latent Dimensions','Batch Size','Epochs',
               'Conv1 Filters','Conv2 Filters','Dropout Rate','Learning Rate',
               'Clusters','Silhouette Score - Train','Calinski Score - Train',
               'Davies Bouldin Score - Train','Avg MSE','Median MSE',
               'Silhouette Score - Test','Calinski Score - Test','Davies Bouldin Score - Test']

    rng = random if seed is None else random.Random(seed)

    # Rows are accumulated in a plain list and turned into a DataFrame once at
    # the end (DataFrame.append no longer exists in pandas 2.x).
    result_rows = []
    unsuccessful_iterations = []
    oom_list = []

    for iteration_no in range(num_iterations):
        print(f"Iteration: {iteration_no}")
        params = _sample_params(param_grid, rng)
        try:
            autoencoder_model = create_autoencoder_model(params['latent_dim'], params['conv1_filters'],
                                                         params['conv2_filters'], params['dropout_rate'],
                                                         params['learning_rate'])

            early_stopping = EarlyStopping(monitor='loss', patience=3)
            autoencoder_model.fit(reshaped_seq_train, reshaped_seq_train, epochs=params['epochs'],
                                  batch_size=params['batch_size'], callbacks=[early_stopping], verbose=1)

            # layers[-5] is the latent dense layer of create_autoencoder_model.
            encoder = Model(autoencoder_model.input, autoencoder_model.layers[-5].output)

            # Encode both sets once per model; the codes do not depend on the
            # number of clusters tried below.
            latent_train = encoder.predict(reshaped_seq_train)
            latent_train = latent_train.reshape(latent_train.shape[0], -1)
            latent_test = encoder.predict(reshaped_seq_test)
            latent_test = latent_test.reshape(latent_test.shape[0], -1)

            for cluster_no in range(2, 6):
                kmeans = KMeans(n_clusters=cluster_no, random_state=42, n_init='auto')
                cluster_labels_train = kmeans.fit_predict(latent_train)

                if len(set(cluster_labels_train)) != cluster_no:
                    # The clustering collapsed; metrics and per-cluster
                    # regression are not meaningful for this combination.
                    reason = f"KMeans found fewer than {cluster_no} distinct clusters"
                    print(reason)
                    unsuccessful_iterations.append([params, reason])
                    continue

                try:
                    # Assign test samples with the model fitted on train data,
                    # so a cluster id means the same thing in both sets.
                    cluster_labels_test = kmeans.predict(latent_test)

                    if use_lin_reg:
                        mse_scores = regression_model(cluster_labels_train,cluster_labels_test,cluster_no,iteration_no)
                    else:
                        mse_scores = [np.nan, np.nan]

                    silhouette_avg = silhouette_score(latent_train, cluster_labels_train)
                    ch_score = calinski_harabasz_score(latent_train, cluster_labels_train)
                    db_score = davies_bouldin_score(latent_train, cluster_labels_train)

                    silhouette_avg_test = silhouette_score(latent_test, cluster_labels_test)
                    ch_score_test = calinski_harabasz_score(latent_test, cluster_labels_test)
                    db_score_test = davies_bouldin_score(latent_test, cluster_labels_test)

                    result_rows.append([iteration_no,params['latent_dim'],params['batch_size'],params['epochs'],
                                        params['conv1_filters'],params['conv2_filters'],params['dropout_rate'],
                                        params['learning_rate'],cluster_no,silhouette_avg,ch_score,db_score,
                                        mse_scores[0],mse_scores[1],silhouette_avg_test,ch_score_test,
                                        db_score_test])

                except Exception as e:
                    # Best effort: one failing cluster count must not abort
                    # the search; it is reported in the exported summary.
                    print(e)
                    unsuccessful_iterations.append([params,e])

        except ResourceExhaustedError:
            oom_list.append(params)
            print(f"Resource Exhausted Error for {params}")
            continue

    grid_search_results = pd.DataFrame(result_rows, columns=exportdf_columns)
    export_grid_search_results(grid_search_results,unsuccessful_iterations,oom_list)

In [22]:
# Candidate values for every hyperparameter; perform_grid_search draws one
# value per key at random for each iteration.
param_grid = dict(
    latent_dim=[64, 128, 175],
    batch_size=[32, 64, 128, 175],
    epochs=[10, 20, 30],
    conv1_filters=[100, 150, 175],
    conv2_filters=[100, 150, 175],
    dropout_rate=[0.2, 0.4, 0.6],
    learning_rate=[0.01, 0.001, 0.0001],
)

num_iterations = 1

# Run identifier; the export helpers read this module-level name when they
# build their output file names.
timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S")

perform_grid_search(param_grid, num_iterations, True)


Iteration: 0
Epoch 1/20
Resource Exhausted Error for {'latent_dim': 175, 'batch_size': 128, 'epochs': 20, 'conv1_filters': 150, 'conv2_filters': 150, 'dropout_rate': 0.6, 'learning_rate': 0.0001}
Grid search summary successfully exported.
