In [1]:
import pandas as pd
from PIL.TiffImagePlugin import PREDICTOR
from sklearn.ensemble import IsolationForest
import numpy as np

In [2]:
df = pd.read_csv(r"dataset_cleaned.csv")
df.head()

Unnamed: 0,utc_timestamp,power,power_diff,machine_1,machine_2,machine_3,machine_4,machine_5
0,2015-10-15 15:15:00+00:00,22.156,0.0,1.0,0.0,0.0,0.0,0.0
1,2015-10-15 15:15:00+00:00,10.969,0.0,0.0,0.0,1.0,0.0,0.0
2,2015-10-15 15:15:00+00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2015-10-15 15:15:00+00:00,10.906,0.0,0.0,1.0,0.0,0.0,0.0
4,2015-10-15 15:15:00+00:00,0.288,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:

df['utc_timestamp'] = pd.to_datetime(df['utc_timestamp'], errors='coerce', utc=True)

valid = df['utc_timestamp'].dropna()
if valid.empty:
    raise ValueError("No parseable utc_timestamp values in the DataFrame.")

# convert to tz-naive (UTC) then to int64 nanoseconds, compute 85th percentile
valid_naive = valid.dt.tz_convert('UTC').dt.tz_localize(None)
split_ns = valid_naive.astype('int64').quantile(0.85)

# build split point using nanoseconds (ns)
split_point = pd.Timestamp(int(round(split_ns)), unit='ns', tz='UTC')

In [4]:
# split manually the dataset because in probleme serie
train_data = df[df['utc_timestamp'] <= split_point]
test_data = df[df['utc_timestamp'] > split_point]

## isolation forest model

In [None]:
tr_features = df[["power_diff", "machine_1", "machine_2", "machine_3", "machine_4", "machine_5"]]

model = IsolationForest(contamination=0.05,
                        random_state=46,
                        n_jobs=-1,
                        n_estimators=350 ,
                        max_samples=256)
model.fit(tr_features)
prediction = model.predict(tr_features)
prediction = (prediction == -1 ).astype(int)

In [13]:
# check number of anomalies detected
num_anomalies = np.sum(prediction)
num_anomalies

np.int64(14268)

In [15]:
import joblib
joblib.dump(model, "Isolation_Forest_Model3.joblib")


['Isolation_Forest_Model3.joblib']