In [2]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import numpy as np

In [3]:
# Read dataset_cleaned.csv from the current working directory into a DataFrame.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index that
# was written out by an earlier to_csv call -- confirm whether it is still needed.
df = pd.read_csv(filepath_or_buffer="dataset_cleaned.csv")

# Preview the first five rows; as the last expression this is the cell's output.
df.head(n=5)

Unnamed: 0,utc_timestamp,power,power_diff,machine_1,machine_2,machine_3,machine_4,machine_5
0,2015-10-15 15:15:00+00:00,22.156,0.0,1.0,0.0,0.0,0.0,0.0
1,2015-10-15 15:15:00+00:00,10.969,0.0,0.0,0.0,1.0,0.0,0.0
2,2015-10-15 15:15:00+00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2015-10-15 15:15:00+00:00,10.906,0.0,0.0,1.0,0.0,0.0,0.0
4,2015-10-15 15:15:00+00:00,0.288,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:

# Fraction of the observed timestamps (by quantile) that ends up on the training
# side of the chronological split; the remainder becomes the test set.
TRAIN_FRACTION = 0.85

# Parse the timestamp column as tz-aware UTC. With errors='coerce', values that
# cannot be parsed become NaT instead of raising.
df['utc_timestamp'] = pd.to_datetime(df['utc_timestamp'], errors='coerce', utc=True)

# Only parseable timestamps take part in the quantile computation.
valid = df['utc_timestamp'].dropna()
if valid.empty:
    raise ValueError("No parseable utc_timestamp values in the DataFrame.")

# Take the quantile directly on the datetime Series. This yields a tz-aware
# pd.Timestamp and does not depend on the storage resolution of the column.
# The previous int64 round-trip interpreted the integers as nanoseconds, which
# is only correct when the column is stored as datetime64[ns] (the parsed unit
# is not guaranteed to be 'ns' across pandas versions).
split_point = valid.quantile(TRAIN_FRACTION)

In [None]:
# Manual chronological split (presumably because this is a time-series problem,
# so the rows are not shuffled): rows at or before split_point form the training
# set, strictly later rows form the test set.
# Rows whose timestamp is NaT satisfy neither comparison and end up in neither set.
on_or_before_split = df['utc_timestamp'].le(split_point)
after_split = df['utc_timestamp'].gt(split_point)

train_data = df.loc[on_or_before_split]
test_data = df.loc[after_split]

Save the test data for batch and real-time evaluation later. Since we don't have labels, we can't evaluate the model on the training data, but we can check the distribution of its predictions.

In [14]:
# Write the held-out rows to disk (without the index) for later evaluation.
test_data.to_csv("test_data.csv", index=False)

# Draw one training row with a fixed seed, so the same row is picked on every
# run, and write it to disk (without the index) for later evaluation.
realtime_sample = train_data.sample(n=1, random_state=42)
realtime_sample.to_csv("RealTime_sample.csv", index=False)


## Isolation Forest model

In [6]:
# Feature matrix for training: the power delta plus the machine_1..machine_5
# indicator columns of the training rows.
tr_features = train_data[
    ["power_diff", "machine_1", "machine_2", "machine_3", "machine_4", "machine_5"]
]

# contamination is the assumed share of outliers; scikit-learn uses it at fit
# time to set the decision threshold, so roughly that share of the training
# rows will be flagged by construction. random_state fixes the seed.
model = IsolationForest(
    n_estimators=350,
    max_samples=256,
    contamination=0.05,
    random_state=42,
    n_jobs=-1,
)
model.fit(tr_features)

# predict() labels outliers as -1 and inliers as +1; recode to 1 = anomaly,
# 0 = normal.
prediction = (model.predict(tr_features) == -1).astype(int)

# Transform the predictions into a DataFrame. Reuse the index of tr_features so
# each flag stays attached to its source row; a fresh 0..n-1 index would
# misalign with train_data whenever its index is not contiguous.
prediction_df = pd.DataFrame({"anomaly": prediction}, index=tr_features.index)
# We can't evaluate the model on the training data since we don't have labels,
# but we can check the distribution of predictions.

In [7]:

# Total number of rows flagged as anomalous: `prediction` holds 0/1 flags, so
# summing them counts the ones.
num_anomalies = prediction.sum()

# Distinct predicted values and how many rows carry each of them.
unique, counts = np.unique(prediction, return_counts=True)

In [8]:
# Serialize the fitted model to disk. joblib.dump returns the list of file names
# it wrote, which is what this cell displays as its output.
import joblib

joblib.dump(value=model, filename="isolation_forest_model.joblib")


['isolation_forest_model.joblib']