In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Load the cleaned consumption table. In the preview printed below, each row
# is one consumer: an id column, the FLAG label, then one column per day.
file_path = 'cleaned_electricity_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Plain-text preview of the first five rows (the default for head()).
print(df.head(5))

                            CONS_NO  FLAG  2014/1/1  2014/1/10  2014/1/11  \
0  0387DD8A07E07FDA6271170F86AD9151     1       0.0       0.00       0.00   
1  01D6177B5D4FFE0CABA9EF17DAFC2B84     1       0.0       0.00       0.00   
2  4B75AC4F2D8434CFF62DB64D0BB43103     1       0.0       0.00       0.00   
3  B32AC8CC6D5D805AC053557AB05F5343     1       0.0       0.00       0.00   
4  EDFC78B07BA2908B3395C4EB2304665E     1       2.9       3.42       3.81   

   2014/1/12  2014/1/13  2014/1/14  2014/1/15  2014/1/16  ...  2016/9/29  \
0       0.00       0.00       0.00       0.00       0.00  ...       9.96   
1       0.00       0.00       0.00       0.00       0.00  ...       0.00   
2       0.00       0.00       0.00       0.00       0.00  ...       0.00   
3       0.00       0.00       0.00       0.00       0.00  ...       9.99   
4       4.58       3.56       4.25       3.86       3.53  ...      10.37   

   2016/9/3  2016/9/30  2016/9/4  2016/9/5  2016/9/6  2016/9/7  2016/9/8  \
0   

# Feature preparation and anomaly-detection model

In [None]:
# Features: every column except the consumer id and the label.
# The previous slice `df.iloc[:, 2:-1]` also dropped the LAST column, which
# silently discarded one day of readings. Dropping by name avoids that and
# does not depend on column position.
# NOTE(review): column names are taken from the df.head() preview
# ('CONS_NO', 'FLAG'); confirm they match the CSV header exactly.
X = df.drop(columns=['CONS_NO', 'FLAG'])

# Label: the anomaly flag, one value per consumer row.
y = df['FLAG']


In [None]:
# Build one LSTM input sequence PER CONSUMER.
#
# In this table rows are consumers and columns are days. The previous code
# slid a 30-row window down the table, so each "sequence" was made of 30
# different consumers and was labelled with a consumer that was not even in
# the window. Here every sample is a single consumer's own consumption
# history, aligned one-to-one with that consumer's FLAG.

# The CSV header is sorted as text ('2014/1/1', '2014/1/10', '2014/1/11', ...),
# which is not calendar order, so reorder the day columns chronologically.
# NOTE(review): assumes every column of X is a 'YYYY/M/D' date label; a
# non-date column will make to_datetime raise here.
day_order = pd.to_datetime(X.columns, format='%Y/%m/%d').argsort()
X_chrono = X.iloc[:, day_order]

# Scale each day column to [0, 1].
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so test rows influence the scaling; fit on training rows only if that matters.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_chrono)

# Reshape to the LSTM layout (samples, time steps, features):
#   samples    = consumers
#   time steps = consecutive windows of `time_steps` days
#   features   = the `time_steps` daily readings inside one window
time_steps = 30  # days per window
n_customers, n_days = X_scaled.shape
n_windows = n_days // time_steps
if n_windows == 0:
    raise ValueError(
        f"Need at least {time_steps} daily columns to build a window, got {n_days}."
    )

# Drop the oldest leftover days so the history splits into whole windows.
first_day = n_days - n_windows * time_steps
X_sequences = X_scaled[:, first_day:].reshape(n_customers, n_windows, time_steps)

# One label per consumer, in the same row order as X_sequences.
y_sequences = y.values


In [None]:
# The LSTM model: LSTM(50) -> Dropout -> Dense(10) -> Dense(1).
# The Dense(10) activations are reused later as learned features.
model = Sequential()
model.add(LSTM(units=50, activation='relu', input_shape=(X_sequences.shape[1], X_sequences.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

# Stop once val_loss has not improved for 3 epochs and roll back to the best
# weights. The callback was previously created but never passed to fit(), so
# it had no effect.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Keras takes the validation_split slice from the END of the arrays, before
# any shuffling. The rows in the preview appear to be grouped by FLAG, so an
# unshuffled tail could contain a single class. Shuffle once, reproducibly.
shuffle_order = np.random.default_rng(42).permutation(len(X_sequences))

# NOTE(review): the network is trained on every sample, including those later
# held out as the classifier's test set, so test labels leak into the extracted
# features. Split before training if an unbiased test score is needed.
history = model.fit(
    X_sequences[shuffle_order],
    y_sequences[shuffle_order],
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)


  super().__init__(**kwargs)


Epoch 1/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 64ms/step - loss: 1.2585 - val_loss: 0.2369
Epoch 2/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0893 - val_loss: 0.0293
Epoch 3/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.0229 - val_loss: 0.0080
Epoch 4/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0142 - val_loss: 0.0020
Epoch 5/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0122 - val_loss: 0.0014
Epoch 6/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0094 - val_loss: 0.0010
Epoch 7/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0088 - val_loss: 7.1655e-04
Epoch 8/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0069 - val_loss: 0.0022
Epoch 9/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f1a4b0bf4c0>

In [None]:
# Reuse every trained layer except the model's final (output) layer, so that
# predict() returns the activations of the layer just before it.
feature_extractor = Sequential(layers=model.layers[:-1])
X_features = feature_extractor.predict(X_sequences)

# Report the feature matrix shape and show the first five rows.
print("Extracted Features Shape:", X_features.shape)
print("Extracted Features (first 5 samples):", X_features[:5], sep="\n")


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
Extracted Features Shape: (2042, 10)
Extracted Features (first 5 samples):
[[0.38635355 0.25735572 0.16999331 0.17480128 0.         0.37769836
  0.15511006 0.05177077 0.31398988 0.        ]
 [0.37446958 0.26298475 0.18496445 0.18253282 0.         0.38744682
  0.14893351 0.05569894 0.30963832 0.        ]
 [0.36682892 0.2690866  0.19528565 0.1884736  0.         0.38966894
  0.14271034 0.05824789 0.3074931  0.        ]
 [0.35761344 0.26483947 0.19284377 0.18421258 0.         0.39511377
  0.14416952 0.06248213 0.30878794 0.        ]
 [0.35886446 0.2600603  0.18248373 0.17988916 0.         0.3935637
  0.15016992 0.06582925 0.31927687 0.        ]]


In [None]:
# Train-test split for the downstream classifier.
# Stratify on the label so both splits keep the same anomaly ratio; with an
# imbalanced label an unstratified split can leave the test set with very few
# (or no) anomalies.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_sequences,
    test_size=0.2,
    random_state=42,
    stratify=y_sequences,
)

print("Class distribution of y_train:")
print(pd.Series(y_train).value_counts())

# Oversample the minority class in the TRAINING split only; the test split is
# left untouched so the metrics reflect the real class balance.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Disabled experiment: unsupervised OneClassSVM.
# svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.05)  # Adjust nu based on the expected percentage of anomalies
# svm_model.fit(X_train)

# # Predict anomalies on the test set
# y_pred = svm_model.predict(X_test)
# y_pred = np.where(y_pred == -1, 1, 0)  # Convert -1 to 1 (anomaly), 1 to 0 (normal)

# Active model: Random Forest trained on the SMOTE-balanced training data.
# rf_model = RandomForestClassifier(class_weight='balanced')
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)
y_pred = rf_model.predict(X_test)

# Disabled experiment: cross-validation on the resampled training data.
# cv_scores = cross_val_score(rf_model, X_train_resampled, y_train_resampled, cv=5)
# print(f"Cross-validation scores: {cv_scores}")
# print(f"Mean CV score: {cv_scores.mean()}")







NameError: name 'train_test_split' is not defined

In [None]:
# from sklearn.metrics import classification_report, confusion_matrix

# print("Confusion Matrix:")
# print(confusion_matrix(y_test, y_pred))

# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))


In [None]:
# Evaluate the Random Forest predictions on the held-out test split.
# Each header and its report are printed on separate lines via sep="\n".
print("\nConfusion Matrix (RandomForest):", confusion_matrix(y_test, y_pred), sep="\n")

print("\nClassification Report (RandomForest):", classification_report(y_test, y_pred), sep="\n")



In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Look only at the first `subset_size` test samples to keep the plot readable.
subset_size = 50  # Adjust this size if you want a larger subset
y_test_subset, y_pred_subset = y_test[:subset_size], y_pred[:subset_size]

# Boolean masks for the true label and the predicted label.
truly_anomalous = y_test_subset == 1
truly_normal = y_test_subset == 0
flagged = y_pred_subset == 1
not_flagged = y_pred_subset == 0

# Positions (within the subset) of each outcome of the confusion matrix.
correct_anomalies = np.nonzero(truly_anomalous & flagged)[0]    # True Positives
missed_anomalies = np.nonzero(truly_anomalous & not_flagged)[0]  # False Negatives
correct_normals = np.nonzero(truly_normal & not_flagged)[0]      # True Negatives
false_alarms = np.nonzero(truly_normal & flagged)[0]             # False Positives

# One scatter series per outcome. Every point sits at its TRUE label on the
# y-axis; colour and marker show whether the prediction was right.
outcome_series = (
    (correct_anomalies, 'green', 'Correct Anomaly', 'x'),
    (missed_anomalies, 'red', 'Missed Anomaly', 'o'),
    (correct_normals, 'blue', 'Correct Normal', 'x'),
    (false_alarms, 'orange', 'False Alarm', 'o'),
)

plt.figure(figsize=(12, 5))
for positions, colour, caption, symbol in outcome_series:
    plt.scatter(positions, y_test_subset[positions], color=colour, label=caption, marker=symbol)

# Axis labels, title and legend.
plt.xlabel('Sample Index')
plt.ylabel('Anomaly (0 = No, 1 = Yes)')
plt.title('Anomaly Detection: Correct and Misclassified Predictions')
plt.legend()
plt.show()
