# Anomaly Detection - Pipeline

In [248]:
import DataProcess
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [249]:
data = DataProcess.DataProcess()
df = data.getData()

In [250]:
df.drop(columns=["Unnamed: 0"], inplace=True)
df.head()

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [251]:
test = df.iloc[2]["timestamp"]
test = pd.to_datetime(test)

In [252]:
df["timestamp"] = df["timestamp"].map(lambda x: pd.to_datetime(x))

In [253]:
type(df.iloc[2]["timestamp"])

pandas.Timestamp

In [254]:
# for i in range(45, 55):
#     print(f"{df.iloc[i]["timestamp"]} --- {df.iloc[i]["timestamp"].dayofweek}")
df["is_weekend"] = df["timestamp"].map(lambda x: int(x.dayofweek >= 5))

In [255]:
df["is_weekend"].value_counts()/48

is_weekend
0    154.0
1     61.0
Name: count, dtype: float64

In [256]:
hour = test.hour
minute = test.minute
dow = test.dayofweek
month = test.month

In [257]:
df["slot"] = df["timestamp"].map(lambda x: x.hour*2 + (x.minute == 30))

In [258]:
df["slot"].value_counts().sum()

np.int64(10320)

In [260]:
df["sin_slot"] = np.sin(2 * np.pi * df["slot"] / 48)
df["cos_slot"] = np.cos(2 * np.pi * df["slot"] / 48)

df["sin_month"] = np.sin(2 * np.pi * df["timestamp"].dt.month / 12)
df["cos_month"] = np.cos(2 * np.pi * df["timestamp"].dt.month / 12)

In [281]:
df["sin_month"].value_counts()

sin_month
-8.660254e-01    1488
-8.660254e-01    1488
-2.449294e-16    1488
 5.000000e-01    1488
-5.000000e-01    1440
-1.000000e+00    1440
-5.000000e-01    1440
Name: count, dtype: int64

In [262]:
df["lag_1"] = df["value"].shift(1)
df["lag_48"] = df["value"].shift(48)
df["delta_1"] = df["value"] - df["lag_1"]
df["delta_48"] = df["value"] - df["lag_48"]

In [263]:
df.drop(columns=["timestamp"], inplace=True)

In [264]:
df.dropna(inplace=True)

In [265]:
df.head()

Unnamed: 0,value,is_weekend,slot,sin_slot,cos_slot,sin_month,cos_month,lag_1,lag_48,delta_1,delta_48
48,13370,0,0,0.0,1.0,-0.5,-0.866025,16111.0,10844.0,-2741.0,2526.0
49,9945,0,1,0.130526,0.991445,-0.5,-0.866025,13370.0,8127.0,-3425.0,1818.0
50,7571,0,2,0.258819,0.965926,-0.5,-0.866025,9945.0,6210.0,-2374.0,1361.0
51,5917,0,3,0.382683,0.92388,-0.5,-0.866025,7571.0,4656.0,-1654.0,1261.0
52,4820,0,4,0.5,0.866025,-0.5,-0.866025,5917.0,3820.0,-1097.0,1000.0


In [266]:
mean = df["value"].mean()
std = df["value"].std()
mean, std

(np.float64(15135.684287383177), np.float64(6936.938604536594))

In [267]:
df["anomaly"] = (
    (df["value"] > (mean + 2 * std)) |
    (df["value"] < (mean - 2 * std))
).astype(int)

In [268]:
df["anomaly"].value_counts()

anomaly
0    10244
1       28
Name: count, dtype: int64

In [269]:
df.head()

Unnamed: 0,value,is_weekend,slot,sin_slot,cos_slot,sin_month,cos_month,lag_1,lag_48,delta_1,delta_48,anomaly
48,13370,0,0,0.0,1.0,-0.5,-0.866025,16111.0,10844.0,-2741.0,2526.0,0
49,9945,0,1,0.130526,0.991445,-0.5,-0.866025,13370.0,8127.0,-3425.0,1818.0,0
50,7571,0,2,0.258819,0.965926,-0.5,-0.866025,9945.0,6210.0,-2374.0,1361.0,0
51,5917,0,3,0.382683,0.92388,-0.5,-0.866025,7571.0,4656.0,-1654.0,1261.0,0
52,4820,0,4,0.5,0.866025,-0.5,-0.866025,5917.0,3820.0,-1097.0,1000.0,0


In [270]:
scalar = RobustScaler()
df['value'] = scalar.fit_transform(df[['value']])
df["delta_1"] = scalar.fit_transform(df[["delta_1"]])
df["delta_48"] = scalar.fit_transform(df[["delta_48"]])
df.drop(["slot", "lag_1", "lag_48"], axis=1, inplace=True)

In [271]:
df.head()

Unnamed: 0,value,is_weekend,sin_slot,cos_slot,sin_month,cos_month,delta_1,delta_48,anomaly
48,-0.355618,0,0.0,1.0,-0.5,-0.866025,-1.352418,0.82963,0
49,-0.713536,0,0.130526,0.991445,-0.5,-0.866025,-1.705222,0.560684,0
50,-0.961622,0,0.258819,0.965926,-0.5,-0.866025,-1.163121,0.387085,0
51,-1.134467,0,0.382683,0.92388,-0.5,-0.866025,-0.791747,0.349098,0
52,-1.249105,0,0.5,0.866025,-0.5,-0.866025,-0.504449,0.249953,0


In [272]:
from sklearn.model_selection import train_test_split

X = df.drop("anomaly", axis=1)
y = df["anomaly"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [273]:
from sklearn.ensemble import IsolationForest

model = IsolationForest(
    n_estimators=300,
    contamination=0.003,
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [274]:
for i in range(len(y_pred)):
    if y_pred[i] == -1:
        y_pred[i] = 0

In [275]:
from sklearn.metrics import precision_score, recall_score

print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Precision: 0.0013003901170351106
Recall: 0.6666666666666666


In [276]:
from sklearn.linear_model import Ridge

features = ["sin_slot", "cos_slot", "is_weekend", "delta_1", "delta_48"]
X = df[features]
y = df["value"]

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [277]:
m = y_pred.mean()

y_pred = pd.Series(y_pred)
y_pred = y_pred.map(lambda x: 1 if x >= m else 0)

In [278]:
from sklearn.metrics import precision_score, recall_score

print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Precision:  0.004010695187165776
Recall:  1.0


In [279]:
import torch
import torch.nn as nn
import numpy as np

class LinearNN(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.fc = nn.Linear(n_features, 1)

    def forward(self, x):
        return self.fc(x)

model = LinearNN(X_train.shape[1])
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

for epoch in range(1000):
    optimizer.zero_grad()

    y_pred = model(X_train_t)
    loss = loss_fn(y_pred, y_train_t)

    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

print("="*50)
y_pred = model(torch.tensor(np.array(X_test), dtype=torch.float32))
print(y_pred)

Epoch 0, Loss: 0.4130929708480835
Epoch 100, Loss: 0.030821658670902252
Epoch 200, Loss: 0.012477589771151543
Epoch 300, Loss: 0.008029856719076633
Epoch 400, Loss: 0.005940861534327269
Epoch 500, Loss: 0.004795600660145283
Epoch 600, Loss: 0.004130692686885595
Epoch 700, Loss: 0.0037292877677828074
Epoch 800, Loss: 0.003479351755231619
Epoch 900, Loss: 0.00331987626850605
tensor([[-0.0011],
        [-0.0051],
        [-0.0091],
        ...,
        [ 0.0263],
        [ 0.0118],
        [ 0.0107]], grad_fn=<AddmmBackward0>)


In [280]:
# y_pred is a PyTorch tensor from the model

# 1. Detach ONCE and convert to numpy
y_pred_np = y_pred.detach().cpu().numpy().flatten()

# 2. Convert to pandas Series
y_pred_s = pd.Series(y_pred_np)

# 3. Threshold by mean (mechanically correct, conceptually weak)
m = y_pred_s.mean()
y_pred_bin = (y_pred_s >= m).astype(int)

# 4. Ground truth
y_test_np = y_test.values

# 5. Metrics
from sklearn.metrics import precision_score, recall_score

print("Precision:", precision_score(y_test_np, y_pred_bin))
print("Recall:", recall_score(y_test_np, y_pred_bin))

Precision: 0.004252303330970942
Recall: 1.0
