In [157]:
import json
import os
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Load the JSON dataset
path = "D:\Datasets\Yak_Webshop_1\json_files"
file_path = os.path.join(path, 'sample_diverse_dataset.json')

with open(file_path, 'r') as file:
    data = json.load(file)

# Convert JSON to DataFrame
df = pd.DataFrame(data['herd'])
df.head(6)

Unnamed: 0,name,age,health,behavior
0,Betty-1,100.0,special_health,special_behavior
1,Betty-2,5.9,excellent,positive
2,Betty-3,18.2,fair,stubborn
3,Betty-4,23.3,fair,stubborn
4,Betty-5,26.8,poor,negative
5,Betty-6,22.7,fair,stubborn


In [158]:
# drop the unwanted attribute
df.drop('name', axis=1, inplace=True)
df.head(5)

Unnamed: 0,age,health,behavior
0,100.0,special_health,special_behavior
1,5.9,excellent,positive
2,18.2,fair,stubborn
3,23.3,fair,stubborn
4,26.8,poor,negative


In [159]:
df['health'].unique()

array(['special_health', 'excellent', 'fair', 'poor', 'good'],
      dtype=object)

In [160]:
df['behavior'].unique()

array(['special_behavior', 'positive', 'stubborn', 'negative', 'playful',
       'calm'], dtype=object)

Applying one hot encoding - as my attibutes are categories. 
Model accepts numerical representation. 
So ['health', 'behavior'] are converted into numerical representation

In [161]:
df = pd.get_dummies(df, columns=['health', 'behavior'])

In [162]:
# cheking after one-hot encoding
df.columns

Index(['age', 'health_excellent', 'health_fair', 'health_good', 'health_poor',
       'health_special_health', 'behavior_calm', 'behavior_negative',
       'behavior_playful', 'behavior_positive', 'behavior_special_behavior',
       'behavior_stubborn'],
      dtype='object')

In [163]:
df['age'] = df['age'].astype(float)

In [164]:
# MODEL INITIATION AND TRAINING
model = IsolationForest(n_estimators=100, contamination=0.13, random_state=42)
model.fit(df)

In [165]:
# anamoly predictions
predictions = model.predict(df)
df['anomaly'] = predictions

In [166]:
df

Unnamed: 0,age,health_excellent,health_fair,health_good,health_poor,health_special_health,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn,anomaly
0,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
1,5.9,True,False,False,False,False,False,False,False,True,False,False,1
2,18.2,False,True,False,False,False,False,False,False,False,False,True,1
3,23.3,False,True,False,False,False,False,False,False,False,False,True,1
4,26.8,False,False,False,True,False,False,True,False,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29.6,False,False,False,True,False,False,True,False,False,False,False,-1
96,15.8,False,True,False,False,False,False,False,False,False,False,True,1
97,22.0,False,True,False,False,False,False,False,False,False,False,True,1
98,10.0,False,False,True,False,False,True,False,False,False,False,False,-1


From the data_set, age = 100 are assigned with "special_health" and "special_behaviour", 
thus model has to detect rows where age = 100

In [167]:
df_anomalies = df[df['anomaly'] == -1]
print("No of anomalies: ", len(df_anomalies))
df_anomalies

No of anomalies:  13


Unnamed: 0,age,health_excellent,health_fair,health_good,health_poor,health_special_health,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn,anomaly
0,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
10,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
20,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
30,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
40,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
50,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
60,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
66,1.1,True,False,False,False,False,False,False,True,False,False,False,-1
70,100.0,False,False,False,False,True,False,False,False,False,True,False,-1
80,100.0,False,False,False,False,True,False,False,False,False,True,False,-1


Rows where data is normal

In [170]:
df_inliers = df[df['anomaly'] == 1]
print("No of normal data points (Inliers): ", len(df_inliers))
df_inliers 

No of normal data points (Inliers):  87


Unnamed: 0,age,health_excellent,health_fair,health_good,health_poor,health_special_health,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn,anomaly
1,5.9,True,False,False,False,False,False,False,False,True,False,False,1
2,18.2,False,True,False,False,False,False,False,False,False,False,True,1
3,23.3,False,True,False,False,False,False,False,False,False,False,True,1
4,26.8,False,False,False,True,False,False,True,False,False,False,False,1
5,22.7,False,True,False,False,False,False,False,False,False,False,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,25.9,False,False,False,True,False,False,True,False,False,False,False,1
94,3.7,True,False,False,False,False,False,False,True,False,False,False,1
96,15.8,False,True,False,False,False,False,False,False,False,False,True,1
97,22.0,False,True,False,False,False,False,False,False,False,False,True,1


In [173]:
outliers_in_data_inliers = df_inliers[df_inliers["age"] == 100]
outliers_in_data_inliers

Unnamed: 0,age,health_excellent,health_fair,health_good,health_poor,health_special_health,behavior_calm,behavior_negative,behavior_playful,behavior_positive,behavior_special_behavior,behavior_stubborn,anomaly


In [171]:
# output and evaluation
print(classification_report(df['anomaly'], predictions))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        87

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [172]:
# Calculating accuracy
correct_predictions = (df['anomaly'] == predictions).sum()
total_samples = len(df)
accuracy = correct_predictions / total_samples

# Calculating False Positive Rate (FPR)
# FPR = False Positives / (False Positives + True Negatives)
false_positives = ((df['anomaly'] == -1) & (predictions == 1)).sum()
true_negatives = ((df['anomaly'] == 1) & (predictions == 1)).sum()
fpr = false_positives / (false_positives + true_negatives)

print("Total Samples: ", total_samples)
print(f"Accuracy: {accuracy:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")

Total Samples:  100
Accuracy: 1.0000
False Positive Rate (FPR): 0.0000
