In [1]:
import json
import os
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Load the JSON dataset.
# Raw string: the Windows path contains backslashes ("\D", "\Y", "\j") that are
# invalid escape sequences in a normal string literal and make recent Python
# versions emit a warning. The resulting path value is unchanged.
path = r"D:\Datasets\Yak_Webshop_1\json_files"
file_path = os.path.join(path, 'sample_diverse_dataset.json')

# Be explicit about the encoding instead of relying on the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build a DataFrame from the 'herd' entry of the JSON document
df = pd.DataFrame(data['herd'])
df.head(6)

Unnamed: 0,name,age,health,behavior
0,Betty-1,100.0,special_health,special_behavior
1,Betty-2,5.9,excellent,positive
2,Betty-3,18.2,fair,stubborn
3,Betty-4,23.3,fair,stubborn
4,Betty-5,26.8,poor,negative
5,Betty-6,22.7,fair,stubborn


Data Preprocessing - Drop unwanted feature

In [2]:
# Drop the unwanted 'name' column.
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise KeyError.
df = df.drop(columns='name', errors='ignore')
df.head(5)

Unnamed: 0,age,health,behavior
0,100.0,special_health,special_behavior
1,5.9,excellent,positive
2,18.2,fair,stubborn
3,23.3,fair,stubborn
4,26.8,poor,negative


Data Preprocessing - Checking unique categories of data columns

In [3]:
# Show the distinct labels found in each of the two categorical columns.
health_levels = df['health'].unique()
behavior_patterns = df['behavior'].unique()
print("Different health categories in Data: ", health_levels)
print("\nDifferent Behaviour patterns in Data: ", behavior_patterns)

Different health categories in Data:  ['special_health' 'excellent' 'fair' 'poor' 'good']

Different Behaviour patterns in Data:  ['special_behavior' 'positive' 'stubborn' 'negative' 'playful' 'calm']


Data Preprocessing - Checking missing values

In [4]:
# Print each column's dtype and non-null count (plus memory usage) to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       100 non-null    float64
 1   health    100 non-null    object 
 2   behavior  100 non-null    object 
dtypes: float64(1), object(2)
memory usage: 2.5+ KB


Data Preprocessing - Raw Check for Actual_Anomalies

In [5]:
# Rows whose age equals 100 are taken as the known ("actual") anomalies.
is_age_100 = df['age'].eq(100)
actual_anomalies = df.loc[is_age_100]
print("Total number of anomalies present: ", len(actual_anomalies))
actual_anomalies

Total number of anomalies present:  10


Unnamed: 0,age,health,behavior
0,100.0,special_health,special_behavior
10,100.0,special_health,special_behavior
20,100.0,special_health,special_behavior
30,100.0,special_health,special_behavior
40,100.0,special_health,special_behavior
50,100.0,special_health,special_behavior
60,100.0,special_health,special_behavior
70,100.0,special_health,special_behavior
80,100.0,special_health,special_behavior
90,100.0,special_health,special_behavior


Feature Engineering  - ['health', 'behavior'] are converted into numerical representation

In [6]:
# One-hot encode the two categorical columns; the remaining columns pass through as-is.
categorical_columns = ['health', 'behavior']
df = pd.get_dummies(data=df, columns=categorical_columns)
df.columns

Index(['age', 'health_excellent', 'health_fair', 'health_good', 'health_poor',
       'health_special_health', 'behavior_calm', 'behavior_negative',
       'behavior_playful', 'behavior_positive', 'behavior_special_behavior',
       'behavior_stubborn'],
      dtype='object')

In [7]:
# Make sure 'age' is float; the earlier df.info() output already reports float64, so this is a safeguard.
df['age'] = df['age'].astype(float)

Model Training and HyperParameter Tuning

In [76]:
# Fit on the feature columns only. When this cell is re-run, `df` already holds
# a 'model_prediction' column from the previous run; feeding it back in would
# leak the earlier labels into the model, so it is excluded here.
feature_frame = df.drop(columns='model_prediction', errors='ignore')

# contamination is the expected share of outliers in the data; random_state
# fixes the random partitioning so results are reproducible.
model = IsolationForest(n_estimators=100, contamination=0.13, random_state=42)
predictions = model.fit_predict(feature_frame)

# Anomaly predictions: -1 marks an outlier, 1 an inlier
df['model_prediction'] = predictions
df

Unnamed: 0,age,health_excellent,health_fair,health_good,health_poor,health_special_health,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn,model_prediction
0,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
1,5.9,True,False,False,False,False,False,False,False,True,False,False,1
2,18.2,False,True,False,False,False,False,False,False,False,False,True,1
3,23.3,False,True,False,False,False,False,False,False,False,False,True,1
4,26.8,False,False,False,True,False,False,True,False,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29.6,False,False,False,True,False,False,True,False,False,False,False,-1
96,15.8,False,True,False,False,False,False,False,False,False,False,True,1
97,22.0,False,True,False,False,False,False,False,False,False,False,True,1
98,10.0,False,False,True,False,False,True,False,False,False,False,False,-1


In [77]:
# Keep the rows the model labelled -1 (outliers).
flagged_as_outlier = df['model_prediction'].eq(-1)
df_anomalies = df.loc[flagged_as_outlier]
print("Anomalies Predicted by the model: ", len(df_anomalies))
df_anomalies

Anomalies Predicted by the model:  20


Unnamed: 0,age,health_excellent,health_fair,health_good,health_poor,health_special_health,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn,model_prediction
0,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
8,28.2,False,False,False,True,False,False,True,False,False,False,False,-1
10,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
20,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
30,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
40,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
42,11.1,False,False,True,False,False,True,False,False,False,False,False,-1
49,11.1,False,False,True,False,False,True,False,False,False,False,False,-1
50,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
60,100.0,False,False,False,False,True,False,False,False,False,True,False,-1


Rows where data is normal

In [78]:
# Keep the rows the model labelled 1 (inliers).
flagged_as_inlier = df['model_prediction'].eq(1)
df_inliers = df.loc[flagged_as_inlier]
print("No of (Inliers - non anomalies predicted by the model): ", len(df_inliers))
df_inliers

No of (Inliers - non anomalies predicted by the model):  80


Unnamed: 0,age,health_excellent,health_fair,health_good,health_poor,health_special_health,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn,model_prediction
1,5.9,True,False,False,False,False,False,False,False,True,False,False,1
2,18.2,False,True,False,False,False,False,False,False,False,False,True,1
3,23.3,False,True,False,False,False,False,False,False,False,False,True,1
4,26.8,False,False,False,True,False,False,True,False,False,False,False,1
5,22.7,False,True,False,False,False,False,False,False,False,False,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,13.5,False,False,True,False,False,True,False,False,False,False,False,1
92,5.7,True,False,False,False,False,False,False,False,True,False,False,1
94,3.7,True,False,False,False,False,False,False,True,False,False,False,1
96,15.8,False,True,False,False,False,False,False,False,False,False,True,1


In [79]:
# Summary statistics
print("Summary statistics for anomalies:")
print(df_anomalies.describe())

print("\nSummary statistics for inliers:")
print(df_inliers.describe())

# Anomaly ratio
predicted_anomaly_ratio = len(df_anomalies) / len(df)
print(f"\nPredicted_Anomaly ratio: {predicted_anomaly_ratio * 100:.2f}%")

actual_anomaly_ratio = len(actual_anomalies) / len(df)
print(f"Actual_Anomaly ratio: {actual_anomaly_ratio * 100:.2f}%")

# Error rate: share of rows where the predicted label disagrees with the known
# label. Rows are compared one by one via their index; subtracting the two
# counts instead would report 0% (or a negative value) whenever the model flags
# the same number of rows as there are true anomalies, even if they are the
# wrong rows.
is_actual_anomaly = df.index.isin(actual_anomalies.index)
is_predicted_anomaly = df.index.isin(df_anomalies.index)
incorrect_predictions = int((is_actual_anomaly != is_predicted_anomaly).sum())
error_rate = incorrect_predictions / len(df)
print(f"\nError rate: {error_rate * 100:.2f}%")

Summary statistics for anomalies:
              age  model_prediction
count   20.000000              20.0
mean    58.255000              -1.0
std     43.440564               0.0
min      1.100000              -1.0
25%     13.950000              -1.0
50%     64.800000              -1.0
75%    100.000000              -1.0
max    100.000000              -1.0

Summary statistics for inliers:
             age  model_prediction
count  80.000000              80.0
mean   14.361250               1.0
std     7.732072               0.0
min     1.800000               1.0
25%     7.525000               1.0
50%    15.700000               1.0
75%    21.325000               1.0
max    27.000000               1.0

Predicted_Anomaly ratio: 20.00%
Actual_Anomaly ratio: 10.00%

Error rate:  10.00%


In [None]:

'''
What is Isolation Forest: 
Imagine you are in a forest where every tree looks the same except for a few standalone (odd) trees - those are the anomalies.

- Random Partitioning: It randomly selects features and values to separate data points into smaller groups

- Isolating Anomalies: Anomalies, being different from the majority, 
                       are easier to isolate and need fewer partitions to separate them from normal data

- Depth as Anomaly Indicator: The algorithm identifies anomalies by the smaller number of partitions (depth)
                              needed to isolate them, treating points that stand out from the norm as anomalies.
'''