## Import Libraries and Data

In [None]:
# libraries to use
from pathlib import Path

import pandas as pd
import hvplot.pandas

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.covariance import EmpiricalCovariance

from keras.wrappers.scikit_learn import KerasClassifier

from scipy.spatial import distance

import tensorflow as tf

import itertools

import numpy as np

In [None]:
# Load each tool's exported CSV into its own pandas DataFrame.
tool_0028AA_df, tool_9622C_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Resources/Output_data/tool_0028AA_df.csv",
        "Resources/Output_data/tool_9622C_df.csv",
    )
)
# Template for loading further tools:
# tool_xxx_df = pd.read_csv("Resources/xxx.csv")

In [None]:
# Preview the first rows of the 0028AA tool data.
tool_0028AA_df.head()

In [None]:
# Summary statistics for the 9622C tool data.
tool_9622C_df.describe()

In [None]:
# Preview the first rows of the 9622C tool data.
tool_9622C_df.head()
# Column names noted for reference (presumably candidates for analysis - TODO confirm):
# 'SUSCEP_CGS E-5' 'SANGB_DEG' 'TEMP_CPS'

In [None]:
# Inner-join the two tool datasets so that only rows sharing the same
# well and Depth_M in both sources are kept.
merged_df = tool_0028AA_df.merge(
    tool_9622C_df,
    how='inner',
    on=['well', 'Depth_M'],
)

In [None]:
# List the columns available after the merge.
merged_df.columns

## Clean and Prepare Data

In [None]:
# REMOVE LATER
# Work on a random subset of at most 500,000 rows. The sample size is capped
# at the number of rows available because DataFrame.sample raises ValueError
# when n exceeds the population (sampling without replacement).
merged_df = merged_df.sample(n=min(500000, len(merged_df)))

In [None]:
# Count rows per well-name prefix (the text before the first run of digits)
# to help choose which prefix to filter on.
# A raw string is used so that `\d` is passed to the regex engine verbatim
# instead of being treated as an (invalid) string escape sequence.
prefix_counts = merged_df['well'].str.extract(r'^(.*?)\d+')[0].value_counts()
prefix_counts.head(10)

In [None]:
# prefix_counts = tool_0028A_df['well'].str.extract('^(.*?)\d+')[0].value_counts()
# prefix_counts.head(10)

In [None]:
# Keep only the rows whose well name begins with 'RHRC'.
is_rhrc_well = merged_df['well'].str.startswith('RHRC')
filtered_df = merged_df.loc[is_rhrc_well]
filtered_df.tail()

In [None]:
# Retain just the identifier columns and the two measurements used below.
merged_cleaned_df = filtered_df.loc[
    :,
    ['well', 'Depth_M', 'SUSCEP_CGS E-5', 'DENSITY_G/CC'],
]

In [None]:
# Clean nulls: -999.25 is used as the missing-value sentinel in the data.
# np.nan (rather than pd.NA) is used as the replacement so the measurement
# columns keep their float dtype; pd.NA would turn them into object columns,
# which breaks numeric summaries such as describe().
merged_data = merged_cleaned_df.replace(-999.25, np.nan).dropna()
merged_data.describe()

In [None]:
# Before scaling, drop readings that are obviously invalid: negative values
# and values outside the range the tool is expected to produce.
# Susceptibility limits follow the tool documentation: https://www.century-geo.com/9622
# `between` is inclusive on both ends, i.e. lower <= value <= upper.
suscep_in_range = merged_data["SUSCEP_CGS E-5"].between(0, 35)
density_in_range = merged_data["DENSITY_G/CC"].between(0, 6)

# .copy() detaches the result from merged_data.
merged_filtered_data = merged_data.loc[suscep_in_range & density_in_range].copy()

merged_filtered_data.describe()

In [None]:
# Remove rows whose measurements fall outside 3 standard deviations of the
# column mean, so that clusters are built on legitimate datapoints.
# Statistics are computed on the measurement columns only: calling mean()/std()
# on the whole frame would include the string 'well' column, which raises
# TypeError on pandas >= 2.0. Extend this list to filter on additional columns.
outlier_columns = ['SUSCEP_CGS E-5', 'DENSITY_G/CC']
outlier_features = merged_filtered_data[outlier_columns].astype(float)

# Calculate the mean and standard deviation for each measurement column
mean_values = outlier_features.mean()
std_values = outlier_features.std()

# Define the upper and lower bounds for filtering
lower_bounds = mean_values - (3 * std_values)
upper_bounds = mean_values + (3 * std_values)

# A row is kept only if every measurement lies within its column's bounds.
within_bounds = (
    outlier_features.ge(lower_bounds, axis='columns')
    & outlier_features.le(upper_bounds, axis='columns')
).all(axis=1)
merged_filtered_data = merged_filtered_data[within_bounds]

merged_filtered_data.describe()

# Predict clusters using Empirical Covariance

In [None]:
# Standardise the measurement columns. 'well' and 'Depth_M' are dropped
# first because they are not to be scaled.
scaled_merged_filtered_data = merged_filtered_data.drop(columns=['well', 'Depth_M'])
scaler = StandardScaler().fit(scaled_merged_filtered_data)
scaled_data = scaler.transform(scaled_merged_filtered_data)

In [None]:
# Choose number of clusters
k = 4

# Cluster the scaled measurements and attach each row's cluster label.
model = KMeans(n_clusters=k)
model.fit(scaled_data)
scaled_merged_df = pd.DataFrame(scaled_data, columns=['SUSCEP_CGS E-5', 'DENSITY_G/CC'])
scaled_merged_df['cluster_label'] = model.labels_

# One distance per row, held as an (n_rows, 1) array indexed by row position
# so the values stay aligned with scaled_merged_df. (Appending per-cluster
# results to a list would order them by cluster, not by row.)
mahalanobis_distances = np.full((len(scaled_merged_df), 1), np.nan)

for i in range(k):
    in_cluster = (scaled_merged_df['cluster_label'] == i).to_numpy()
    cluster_points = scaled_merged_df.loc[in_cluster, ['SUSCEP_CGS E-5', 'DENSITY_G/CC']]
    cov = EmpiricalCovariance().fit(cluster_points)
    cluster_center_reshaped = np.reshape(model.cluster_centers_[i], (1, -1))
    # scipy's 'mahalanobis' metric expects VI to be the INVERSE covariance
    # matrix, which EmpiricalCovariance exposes as precision_.
    mahalanobis_dist = distance.cdist(cluster_points, cluster_center_reshaped, 'mahalanobis', VI=cov.precision_)
    # Write the (n_cluster_rows, 1) result back to the rows it belongs to.
    mahalanobis_distances[in_cluster] = mahalanobis_dist

In [None]:
# Reduce the collected distances along axis 1 to one value per row.
scaled_merged_df['min_mahalanobis_distance'] = np.asarray(mahalanobis_distances).min(axis=1)

In [None]:
# Flag rows whose distance exceeds mean + 3 standard deviations of all
# distances. The multiplier can be changed depending on the output.
min_distance = scaled_merged_df['min_mahalanobis_distance']
threshold = np.mean(min_distance) + 3 * np.std(min_distance)
scaled_merged_df['is_outlier'] = min_distance.gt(threshold)

In [None]:
# Display the scaled data with its cluster labels, distances and outlier flags.
scaled_merged_df

In [None]:
# Scatter plot of the scaled measurements, coloured by cluster label
scaled_merged_df.hvplot.scatter(
    x="SUSCEP_CGS E-5",
    y="DENSITY_G/CC",
    color="cluster_label"
)

In [None]:
# Scatter plot of the scaled measurements, coloured by the outlier flag
scaled_merged_df.hvplot.scatter(
    x="SUSCEP_CGS E-5",
    y="DENSITY_G/CC",
    color="is_outlier"
)

# Predict outliers using Elliptic Envelope

In [None]:
# Choose number of clusters
k = 3

# Cluster the scaled measurements and attach each row's cluster label.
model = KMeans(n_clusters=k)
model.fit(scaled_data)
scaled_merged_df = pd.DataFrame(scaled_data, columns=['SUSCEP_CGS E-5', 'DENSITY_G/CC'])
scaled_merged_df['cluster_label'] = model.labels_

# One distance slot per row, sized from the frame built just above so this
# cell does not depend on a DataFrame left over from an earlier section.
# NaN-filled (instead of uninitialised) so any unfilled slot is easy to spot.
mahalanobis_distances = np.full(len(scaled_merged_df), np.nan)

In [None]:
# For every cluster, fit an EllipticEnvelope to that cluster's rows and
# record each row's distance in the matching slot of mahalanobis_distances.
for cluster_label in np.unique(scaled_merged_df['cluster_label']):
    in_cluster = scaled_merged_df['cluster_label'] == cluster_label
    # Adjust the feature list here if other columns should be used.
    cluster_data = scaled_merged_df.loc[in_cluster, ["DENSITY_G/CC", "SUSCEP_CGS E-5"]]

    # fit() returns the fitted estimator, so it can be chained.
    envelope = EllipticEnvelope().fit(cluster_data)

    # NOTE(review): scikit-learn documents `mahalanobis` as returning
    # *squared* distances - confirm the downstream threshold expects that.
    cluster_distances = envelope.mahalanobis(cluster_data)

    # The boolean mask writes the values back to this cluster's row positions.
    mahalanobis_distances[in_cluster] = cluster_distances

In [None]:
# Store the per-row Mahalanobis distances (filled per cluster above) as a column.
# No averaging is performed here.
scaled_merged_df['Mahalanobis_Distance'] = mahalanobis_distances

In [None]:
# Distances above this fixed cut-off are treated as outliers; adjust as needed.
threshold = 8

# Mark every row whose distance is strictly greater than the cut-off.
scaled_merged_df['is_outlier'] = scaled_merged_df['Mahalanobis_Distance'].gt(threshold)
scaled_merged_df

In [None]:
# Plot the scaled data grouped by cluster label to inspect the clustering
scaled_merged_df.hvplot.scatter(
    x="SUSCEP_CGS E-5", 
    y="DENSITY_G/CC", 
    by="cluster_label"
)

In [None]:
# Plot the scaled data grouped by the outlier flag to see if it is classifying properly
scaled_merged_df.hvplot.scatter(
    x="SUSCEP_CGS E-5", 
    y="DENSITY_G/CC", 
    by="is_outlier"
)

## Model optimisation

In [None]:
class CustomCSVLogger(tf.keras.callbacks.CSVLogger):
    """CSV logger that writes a row only for every fifth epoch.

    The base class is referenced through the already-imported ``tf`` module
    because the bare name ``CSVLogger`` is not imported in this file.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Forward to the base logger only when the epoch index is divisible by 5."""
        if epoch % 5 == 0:
            super().on_epoch_end(epoch, logs)

In [None]:
# Tuning Parameters: candidate values for each hyper-parameter of the
# network built in train_model. Every combination of these is tried.
activation_functions = ['relu', 'sigmoid', 'tanh']
hidden_nodes_layer1_values = [32, 64, 128]
hidden_nodes_layer2_values = [16, 32, 64]
optimizers = ['adam', 'rmsprop']
losses = ['binary_crossentropy', 'mean_squared_error']

In [None]:
# Build the full Cartesian product of the tuning grids; each entry is an
# (activation, layer-1 nodes, layer-2 nodes, optimizer, loss) tuple.
search_space = (
    activation_functions,
    hidden_nodes_layer1_values,
    hidden_nodes_layer2_values,
    optimizers,
    losses,
)
parameter_combinations = list(itertools.product(*search_space))
# Spot-check one of the generated combinations.
parameter_combinations[103]

In [None]:
def train_model(activation, hidden_nodes_layer1, hidden_nodes_layer2, optimizer, loss):
    """Build, compile and fit a two-hidden-layer binary classifier.

    The network has two Dense hidden layers using ``activation`` and a single
    sigmoid output unit. Training runs for 20 epochs and is logged to
    'model_tuning_results.csv' (appended) via CustomCSVLogger.

    Relies on the module-level names ``number_input_features``,
    ``X_train_scaled`` and ``y_train``, which must be defined before calling.

    Args:
        activation: Activation function name for both hidden layers.
        hidden_nodes_layer1: Number of units in the first hidden layer.
        hidden_nodes_layer2: Number of units in the second hidden layer.
        optimizer: Optimizer name passed to ``compile``.
        loss: Loss function name passed to ``compile``.

    Returns:
        The Keras ``History`` object returned by ``fit``. The fitted model is
        reachable through its ``model`` attribute, so callers can inspect the
        metrics instead of losing them when the function exits.
    """
    nn = tf.keras.models.Sequential()
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation=activation))
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=activation))
    nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
    nn.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    history = nn.fit(X_train_scaled, y_train, epochs=20, callbacks=[CustomCSVLogger('model_tuning_results.csv', append=True)])
    return history

In [None]:
# Train one model per parameter combination and record which parameters
# were used. Rows are collected in a list and turned into a DataFrame once
# at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
result_columns = ['Activation', 'Hidden Nodes Layer 1', 'Hidden Nodes Layer 2', 'Optimizer', 'Loss']
result_rows = []
for params in parameter_combinations:
    activation, hidden_nodes_layer1, hidden_nodes_layer2, optimizer, loss = params
    train_model(activation, hidden_nodes_layer1, hidden_nodes_layer2, optimizer, loss)
    # params is ordered exactly like result_columns.
    result_rows.append(dict(zip(result_columns, params)))

results_df = pd.DataFrame(result_rows, columns=result_columns)
results_df.to_csv('model_parameters.csv', index=False)