In [88]:
import json
import os
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Load the JSON dataset
path = "D:\Datasets\Yak_Webshop_1\json_files"
file_path = os.path.join(path, 'sample_diverse_dataset.json')

with open(file_path, 'r') as file:
    data = json.load(file)

# Convert JSON to DataFrame
df = pd.DataFrame(data['herd'])
df.head(6)

Unnamed: 0,name,age,health,behavior
0,Betty-1,100.0,special_health,special_behavior
1,Betty-2,5.9,excellent,positive
2,Betty-3,18.2,fair,stubborn
3,Betty-4,23.3,fair,stubborn
4,Betty-5,26.8,poor,negative
5,Betty-6,22.7,fair,stubborn


Data Preprocessing - Drop unwanted featrure

In [89]:
df.drop('name', axis=1, inplace=True)
df.head(5)

Unnamed: 0,age,health,behavior
0,100.0,special_health,special_behavior
1,5.9,excellent,positive
2,18.2,fair,stubborn
3,23.3,fair,stubborn
4,26.8,poor,negative


Data Preprocessing - Cheking unique categories of data columns 

In [90]:
print("Different health categories in Data: ", df['health'].unique())
print("\nDifferent Behaviour patterns in Data: ",df['behavior'].unique())

Different health categories in Data:  ['special_health' 'excellent' 'fair' 'poor' 'good']

Different Behaviour patterns in Data:  ['special_behavior' 'positive' 'stubborn' 'negative' 'playful' 'calm']


Data Preprocessing - Checking missing values

In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       100 non-null    float64
 1   health    100 non-null    object 
 2   behavior  100 non-null    object 
dtypes: float64(1), object(2)
memory usage: 2.5+ KB


Data Preprocessing - Raw Check for Actual_Anomalies

In [92]:
actual_anomalies = df[df['age'] == 100]
print("Total number of anomalies present: ", len(actual_anomalies))
actual_anomalies

Total number of anomalies present:  10


Unnamed: 0,age,health,behavior
0,100.0,special_health,special_behavior
10,100.0,special_health,special_behavior
20,100.0,special_health,special_behavior
30,100.0,special_health,special_behavior
40,100.0,special_health,special_behavior
50,100.0,special_health,special_behavior
60,100.0,special_health,special_behavior
70,100.0,special_health,special_behavior
80,100.0,special_health,special_behavior
90,100.0,special_health,special_behavior


Feature Engineering  - ['health', 'behavior'] are converted into numerical representation

In [93]:
df = pd.get_dummies(df, columns=['health', 'behavior'])
df.columns

Index(['age', 'health_excellent', 'health_fair', 'health_good', 'health_poor',
       'health_special_health', 'behavior_calm', 'behavior_negative',
       'behavior_playful', 'behavior_positive', 'behavior_special_behavior',
       'behavior_stubborn'],
      dtype='object')

In [94]:
df['age'] = df['age'].astype(float)

Model Training and HyperParameter Tuning

In [95]:
contamination_values = [0.01, 0.09, 0.11, 0.13, 0.15, 0.18, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

error_rates = {}  # Dictionary to store error rates for different contamination values
zero_error_rates = []  # List to store contamination values with 0 error rate
for contamination_value in contamination_values:
    model = IsolationForest(n_estimators=100, contamination=contamination_value, random_state=42)
    predictions = model.fit_predict(df.drop(columns=['model_prediction'], errors='ignore'))

    df['model_prediction'] = predictions
    df_anomalies = df[df['model_prediction'] == -1]
    df_inliers = df[df['model_prediction'] == 1]

    incorrect_predictions = max(len(df_anomalies) - len(actual_anomalies), 0)
    error_rate = incorrect_predictions / len(df)

    if error_rate > 0.00:  # Selecting error rates greater than 0.00%
        error_rates[contamination_value] = error_rate
        print(f"Contamination Value: {contamination_value}, Error Rate: {error_rate * 100:.2f}%")
    else:
        zero_error_rates.append(contamination_value)
        print(f"Contamination Value: {contamination_value}, Error Rate: {error_rate * 100:.2f}% (Error rate became 0, as coming negative)")

# Find the minimum error rate greater than 0.00% and its corresponding contamination value
if error_rates:
    min_error_rate = min(error_rates.values())
    best_contamination = [contamination for contamination, rate in error_rates.items() if rate == min_error_rate]
    print(f"\nBest Contamination Value(s) with Minimum Error Rate (> 0.00%): {best_contamination}")
    print(f"Minimum Error Rate (> 0.00%): {min_error_rate * 100:.2f}%")
else:
    print("No contamination value found with error rate greater than 0.00%.")


Contamination Value: 0.01, Error Rate: 0.00% (Error rate became 0, as coming negative)
Contamination Value: 0.09, Error Rate: 0.00% (Error rate became 0, as coming negative)
Contamination Value: 0.11, Error Rate: 0.00% (Error rate became 0, as coming negative)
Contamination Value: 0.13, Error Rate: 3.00%
Contamination Value: 0.15, Error Rate: 5.00%
Contamination Value: 0.18, Error Rate: 7.00%
Contamination Value: 0.2, Error Rate: 10.00%
Contamination Value: 0.25, Error Rate: 15.00%
Contamination Value: 0.3, Error Rate: 20.00%
Contamination Value: 0.35, Error Rate: 25.00%
Contamination Value: 0.4, Error Rate: 30.00%
Contamination Value: 0.45, Error Rate: 35.00%
Contamination Value: 0.5, Error Rate: 39.00%

Best Contamination Value(s) with Minimum Error Rate (> 0.00%): [0.13]
Minimum Error Rate (> 0.00%): 3.00%


In [None]:
'''
What is Isolation Forest: 
Assume - you are in forest having where every looks same except for the standalone  (odd ones) trees - anomalies

- Random Partitioning: It randomly selects features and values to separate data points into smaller groups

- Isolating Anomalies: Anomalies, being different from the majority, 
                       are easier to isolate and need fewer partitions to separate them from normal data

Depth as Anomaly Indicator: The algorithm identifies anomalies based on the fewer partitions (depth)
needed to isolate them, considering anomalies as points that stand out from the norm.
'''