In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load the dataset from the provided Excel file
file_path = '/Users/veer/Desktop/RA-Projects/Completed Accuracy calculation/Agriculture building/Normal water consumption.xlsx'  # Adjust the file path as necessary
data = pd.read_excel(file_path)

In [3]:
# Convert the 'Date' column to datetime and round to nearest hour
data['Date'] = pd.to_datetime(data['Date']).dt.round('h')

In [4]:
# Creating lag features for the past 24 hours
for i in range(1, 25):
    data[f'Hour_{i}'] = data['Water Consumption(GPM)'].shift(i)

In [5]:
# Interpolating the 'Water Consumption(GPM)' column
data['Water Consumption(GPM)'] = data['Water Consumption(GPM)'].interpolate(method='linear')

In [6]:
data = data.ffill()

In [7]:
data = data.bfill()

In [8]:


# Assuming 'data' is your DataFrame and it's ready to be used
features = [f'Hour_{i}' for i in range(1, 25)]  # The features to be used
X = data[features].values
data.dropna(inplace=True)

# Normalize the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)


# Define early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor the validation loss
    patience=10,         # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restores model weights from the epoch with the best value of the monitored quantity
)

# Autoencoder architecture
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim, ))
encoder = Dense(16, activation="relu")(input_layer)
encoder = Dense(8, activation="relu")(encoder)
encoder = Dense(4, activation="relu")(encoder)
decoder = Dense(8, activation="relu")(encoder)
decoder = Dense(16, activation="relu")(decoder)
decoder = Dense(input_dim, activation='sigmoid')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the autoencoder with early stopping
autoencoder.fit(
    X_train, 
    X_train, 
    epochs=500,  # You can set a higher number since early stopping will likely stop the training earlier
    batch_size=32, 
    shuffle=True, 
    validation_data=(X_test, X_test),
    callbacks=[early_stopping]  # Include the early stopping callback here
)

# After training, early stopping might have stopped the training at an optimal point


# Predict on the training set
X_train_pred = autoencoder.predict(X_train)
train_mse = np.mean(np.power(X_train - X_train_pred, 2), axis=1)

# Try lowering the percentile to 95th or 90th to see if it improves anomaly detection
threshold = np.percentile(train_mse, 95)
#threshold=0.0035565515548837113

print(f"Reconstruction error threshold: {threshold}")

Epoch 1/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 581us/step - loss: 0.1244 - val_loss: 0.0091
Epoch 2/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347us/step - loss: 0.0074 - val_loss: 0.0058
Epoch 3/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 343us/step - loss: 0.0056 - val_loss: 0.0056
Epoch 4/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338us/step - loss: 0.0055 - val_loss: 0.0056
Epoch 5/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 343us/step - loss: 0.0054 - val_loss: 0.0055
Epoch 6/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342us/step - loss: 0.0053 - val_loss: 0.0051
Epoch 7/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333us/step - loss: 0.0048 - val_loss: 0.0044
Epoch 8/500
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321us/step - loss: 0.0042 - val_loss: 0.0042
Epoch 9/500
[1m

In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the new dataset
new_data_path = '/Users/veer/Desktop/RA-Projects/Completed Accuracy calculation/Agriculture building/+8_ water consumption.xlsx'
new_data = pd.read_excel(new_data_path)

# Assume the same preprocessing steps
new_data['Date'] = pd.to_datetime(new_data['Date']).dt.round('h')
for i in range(1, 25):
    new_data[f'lag_{i}'] = new_data['Water Consumption(GPM)'].shift(i)

# Proper way to apply interpolation without using inplace=True
new_data['Water Consumption(GPM)'] = new_data['Water Consumption(GPM)'].interpolate(method='linear')
new_data.ffill()
new_data.bfill()

# Check the number of rows after preprocessing
print("Data rows after preprocessing:", new_data.shape[0])

# Drop rows with NaN values that resulted from lag feature creation
new_data_cleaned = new_data.dropna()
print("Data rows after dropping NaNs:", new_data_cleaned.shape[0])

# Extract features and normalize
features = [f'lag_{i}' for i in range(1, 25)]
scaler = MinMaxScaler()
X_new = scaler.fit_transform(new_data_cleaned[features])
X_new_test = new_data_cleaned['Anomalies'].astype(int)

# Predict on the new dataset
X_new_pred = autoencoder.predict(X_new)

# Calculate the Mean Squared Error (MSE) for each prediction
new_mse = np.mean(np.power(X_new - X_new_pred, 2), axis=1)

# Detect anomalies by checking where MSE exceeds the threshold
anomalies = (new_mse > threshold).astype(int)

# Display the results
print("Anomaly Detected at Indices:", np.where(anomalies == 1)[0])
print("Number of Anomalies Detected:", np.sum(anomalies))

# Ensure the length of anomalies and X_new_test are the same
print("Length of anomalies:", len(anomalies))
print("Length of X_new_test:", len(X_new_test))

# Calculate metrics
f1 = f1_score(X_new_test, anomalies)
precision = precision_score(X_new_test, anomalies)
recall = recall_score(X_new_test, anomalies)
accuracy = accuracy_score(X_new_test, anomalies)

print(f"Accuracy: {accuracy:.5f}, F1: {f1:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}")


Data rows after preprocessing: 8760
Data rows after dropping NaNs: 8644
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188us/step
Anomaly Detected at Indices: [4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045
 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059
 4060 4061 4062 4063 4064 4131 4195 4196 4197 4198 4199 4200 4201 4202
 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216
 4217 4218 4223 4226 4227 4237 4238 4239 4240 4241 4242 4247 4248 4249
 4250 4251 4252 4253 4254 4255 4256 4257 4258 4260 4261 4262 4263 4264
 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278
 4279 4280 4281 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293
 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4307 4308 4309 4310
 4311 4312 4313 4314 4315 4316 4317 4320 4321 4322 4323 4324 4325 4326
 4327 4328 4329 4332 4333 4334 4335 4336 4337 4338 4339 4340 4344 4345
 4346 4347 4348 4349 4350 4351 4352 4353 