## Import Libraries and Data

In [None]:
# libraries to use
from pathlib import Path

import pandas as pd
import hvplot.pandas

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope

from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf

import itertools

import numpy as np

In [None]:
# Load each tool's exported CSV into its own pandas DataFrame
tool_0028AA_df, tool_9622C_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Resources/Output_data/tool_0028AA_df.csv",
        "Resources/Output_data/tool_9622C_df.csv",
    )
)
# Template for loading further tools:
# tool_xxx_df = pd.read_csv("Resources/xxx.csv")

In [None]:
# Summary statistics of the 9622C data exactly as read from the CSV
tool_9622C_df.describe()

In [None]:
# Preview the first rows of the 9622C data
tool_9622C_df.head()
# Columns kept for the analysis: 'SUSCEP_CGS E-5', 'SANGB_DEG', 'TEMP_CPS'

## Clean and Prepare Data

In [None]:
# TODO: merge or alter tables here.
# Open question: join multiple tools on Well ID and Depth, or on Well ID only?

In [None]:
# Keep only the three measurement columns used in the rest of the notebook
feature_columns = ['SUSCEP_CGS E-5', 'SANGB_DEG', 'TEMP_CPS']
tool_9622C_cleaned_df = tool_9622C_df.loc[:, feature_columns]
tool_9622C_cleaned_df.head()

In [None]:
# Clean nulls
# -999.25 is treated as a missing-value marker: swap it for NaN, then drop
# every row that contains one.
# np.nan is used instead of pd.NA because inserting pd.NA into float columns
# can turn them into object dtype, which makes describe() report
# count/unique/top/freq instead of numeric statistics.
tool_9622C_data = tool_9622C_cleaned_df.replace(-999.25, np.nan).dropna()
tool_9622C_data.describe()

In [None]:
# Standardise the cleaned features: learn the per-column statistics first,
# then apply them to the same data
scaler = StandardScaler().fit(tool_9622C_data)
scaled_9622C_data = scaler.transform(tool_9622C_data)

## K-means clustering

In [None]:
# Candidate cluster counts (1 through 10) and an empty list that will
# collect the inertia measured for each of them
k = [*range(1, 11)]
inertia = []

In [None]:
# Evaluate every candidate value of k with K-means on the scaled 9622C data
# and record the inertia (`inertia_` attribute) of each fitted model.
# The list is reset here so that re-running this cell does not keep appending
# to earlier results (which would leave `inertia` longer than `k`).
inertia = []
for n_clusters in k:
    # n_init is pinned to 10 (the long-standing scikit-learn default) so the
    # result does not depend on the installed version's default.
    k_model = KMeans(n_clusters=n_clusters, random_state=1, n_init=10)
    k_model.fit(scaled_9622C_data)
    inertia.append(k_model.inertia_)

In [None]:
# Pair each k with its inertia and tabulate the result
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Show the first rows of the table
df_elbow.head()

In [None]:
# Draw inertia against k as a line chart, with one tick per candidate k
elbow_plot_options = {
    "x": "k",
    "y": "inertia",
    "title": "Elbow Curve",
    "xticks": k,
}
df_elbow.hvplot.line(**elbow_plot_options)

In [None]:
# Choose number of clusters
# random_state is fixed (same seed as the elbow search) so that cluster labels
# are reproducible between runs; n_init is pinned to 10 (the long-standing
# scikit-learn default) so results do not depend on the installed version.
model = KMeans(n_clusters=4, random_state=1, n_init=10)
model.fit(scaled_9622C_data)

In [None]:
# Wrap the scaled array in a DataFrame so the columns carry their feature names
scaled_feature_names = ['SUSCEP_CGS E-5', 'SANGB_DEG', 'TEMP_CPS']
scaled_data_df = pd.DataFrame(data=scaled_9622C_data, columns=scaled_feature_names)

In [None]:
# TEST
# Refit the model on the labelled DataFrame, then assign every row to a cluster
k_4 = model.fit(scaled_data_df).predict(scaled_data_df)

# Work on a copy of the scaled data that carries the cluster label as an
# extra 'Cluster' column
tool_9622C_predictions_df = scaled_data_df.assign(Cluster=k_4)

In [None]:
# Summary statistics of the scaled features together with the cluster labels
tool_9622C_predictions_df.describe()

In [None]:
# Scatter SUSCEP against TEMP, one colour per cluster
cluster_scatter = tool_9622C_predictions_df.hvplot.scatter(
    x="SUSCEP_CGS E-5", y="TEMP_CPS", by="Cluster"
)
cluster_scatter

In [None]:
# Re-cluster using only the columns at positions 1 and 2 of scaled_data_df
# and store the resulting labels on the DataFrame
two_feature_view = scaled_data_df.iloc[:, [1, 2]]
clusters = model.fit_predict(two_feature_view)
scaled_data_df['Cluster'] = clusters

In [None]:
# Zero-filled buffer with one slot per row, to be filled with Mahalanobis distances
row_count = scaled_data_df.shape[0]
mahalanobis_distances = np.zeros(row_count)

In [None]:
# Display the current contents of the distance array
mahalanobis_distances

In [None]:
# Score every row against the distribution of its own cluster.
# The features must exist in scaled_data_df; these are the two columns
# (positions 1 and 2) that the clusters were fitted on.  The previous
# "DENSITY_G/CC" / "POR(DEN)_PERCENT" names are not columns of this
# DataFrame and raised a KeyError.
cluster_features = ["SANGB_DEG", "TEMP_CPS"]  # Adjust the features accordingly

for cluster_label in np.unique(clusters):
    in_cluster = clusters == cluster_label
    cluster_data = scaled_data_df.loc[in_cluster, cluster_features]

    # Fit the Elliptic Envelope on the cluster data
    envelope = EllipticEnvelope()
    envelope.fit(cluster_data)

    # EllipticEnvelope.mahalanobis() returns *squared* distances; take the
    # square root so the stored values really are Mahalanobis distances.
    cluster_distances = np.sqrt(envelope.mahalanobis(cluster_data))

    # Write the distances back to the rows that belong to this cluster
    mahalanobis_distances[in_cluster] = cluster_distances

In [None]:
# Store the per-row values from the mahalanobis_distances array as a new column
# (no averaging happens here; each row keeps its own value)
scaled_data_df['Mahalanobis_Distance'] = mahalanobis_distances

In [None]:
# Set a threshold to determine outliers: rows whose 'Mahalanobis_Distance'
# is greater than this value are flagged
threshold = 2.5  # Adjust as needed

In [None]:
# Flag each row whose Mahalanobis distance is strictly above the threshold
exceeds_threshold = scaled_data_df['Mahalanobis_Distance'].gt(threshold)
scaled_data_df['Is_Outlier'] = exceeds_threshold
scaled_data_df

In [None]:
# Plot the two clustered features, coloured by cluster, to check that the
# grouping looks sensible.
# The axes must be columns of scaled_data_df; the previous "DENSITY_G/CC" /
# "POR(DEN)_PERCENT" names do not exist in this DataFrame.
scaled_data_df.hvplot.scatter(
    x="SANGB_DEG",
    y="TEMP_CPS",
    by="Cluster"
)

## Initialise model and anomaly detection

In [None]:
# create dummies if needed

In [None]:
# Fit a single Elliptic Envelope on the full scaled feature matrix.
# contamination=0.05 assumes about 5% of the rows are outliers; lower it if needed.
# `scaled_data` and `data` were never defined in this notebook, so the
# detector now uses scaled_9622C_data and writes its labels to scaled_data_df.
# NOTE(review): confirm scaled_data_df is the intended destination for the labels.
outlier_detector = EllipticEnvelope(contamination=0.05)
outlier_detector.fit(scaled_9622C_data)

# predict() labels inliers as 1 and outliers as -1
outlier_labels = outlier_detector.predict(scaled_9622C_data)

# scaled_data_df was built from scaled_9622C_data, so the rows line up one-to-one
scaled_data_df['Anomaly'] = outlier_labels

## Model optimisation

In [None]:
class CustomCSVLogger(tf.keras.callbacks.CSVLogger):
    """CSV logger that records only every fifth epoch (0, 5, 10, ...).

    The base class is referenced through the already-imported ``tf`` module;
    a bare ``CSVLogger`` name is never imported in this notebook and would
    raise a NameError.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Forward the epoch to the parent logger when ``epoch`` is a multiple of 5."""
        if epoch % 5 == 0:
            super().on_epoch_end(epoch, logs)

In [None]:
# Tuning Parameters
# Each list holds the candidate values for one hyperparameter; every
# combination of them is tried by the grid search below.
activation_functions = ['relu', 'sigmoid', 'tanh']  # activation for both hidden layers
hidden_nodes_layer1_values = [32, 64, 128]  # units in the first hidden layer
hidden_nodes_layer2_values = [16, 32, 64]  # units in the second hidden layer
optimizers = ['adam', 'rmsprop']
losses = ['binary_crossentropy', 'mean_squared_error']

In [None]:
# Cartesian product of all hyperparameter candidates, in the order
# (activation, layer-1 nodes, layer-2 nodes, optimizer, loss)
search_space = (
    activation_functions,
    hidden_nodes_layer1_values,
    hidden_nodes_layer2_values,
    optimizers,
    losses,
)
parameter_combinations = list(itertools.product(*search_space))
# Spot-check a single combination
parameter_combinations[103]

In [None]:
def train_model(activation, hidden_nodes_layer1, hidden_nodes_layer2, optimizer, loss):
    """Build, compile and fit a network with two hidden layers and one sigmoid output unit.

    Per-epoch metrics are appended to 'model_tuning_results.csv' through
    CustomCSVLogger.

    NOTE(review): this function reads the notebook-level names
    ``number_input_features``, ``X_train_scaled`` and ``y_train``, none of
    which is defined in the visible cells -- confirm they are created
    (e.g. via train_test_split and a scaler) before this is called.

    Args:
        activation: Activation function used by both hidden layers.
        hidden_nodes_layer1: Number of units in the first hidden layer.
        hidden_nodes_layer2: Number of units in the second hidden layer.
        optimizer: Optimizer passed to ``compile``.
        loss: Loss function passed to ``compile``.

    Returns:
        The object returned by ``fit`` (the training history), so callers can
        inspect the metrics; it was previously discarded.
    """
    nn = tf.keras.models.Sequential()
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation=activation))
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=activation))
    nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
    nn.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    # append=True keeps the rows of earlier runs in the same CSV file
    csv_logger = CustomCSVLogger('model_tuning_results.csv', append=True)
    return nn.fit(X_train_scaled, y_train, epochs=20, callbacks=[csv_logger])

In [None]:
# Train one model per hyperparameter combination and record which
# combinations were run.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
result_columns = ['Activation', 'Hidden Nodes Layer 1', 'Hidden Nodes Layer 2', 'Optimizer', 'Loss']
tuning_records = []
for activation, hidden_nodes_layer1, hidden_nodes_layer2, optimizer, loss in parameter_combinations:
    train_model(activation, hidden_nodes_layer1, hidden_nodes_layer2, optimizer, loss)
    tuning_records.append({'Activation': activation,
                           'Hidden Nodes Layer 1': hidden_nodes_layer1,
                           'Hidden Nodes Layer 2': hidden_nodes_layer2,
                           'Optimizer': optimizer,
                           'Loss': loss})

# Passing columns= keeps the header and column order even when no rows were recorded
results_df = pd.DataFrame(tuning_records, columns=result_columns)
results_df.to_csv('model_parameters.csv', index=False)