In [37]:
import numpy as np
import pandas as pd

from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import IsolationForest

from xgboost import XGBClassifier

In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
DATA_PATH = "/content/drive/MyDrive/agriculture_anomaly_dataset_15000_final_realistic.csv"

In [44]:
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

(15000, 26)


Unnamed: 0,event_id,timestamp,asset_type,user_id,action_type,resource_type,auth_status,failed_auth_attempts_past_10min,ip_address,geo_lat,...,soil_moisture_percent,soil_temp_c,humidity_percent,weather_consistency_score,irrigation_water_volume_liters,sensor_reading_variance,config_change_flag,sensor_signal_strength,anomaly_score,reason
0,EVT_00001,2026-01-03 04:12,gateway,FARMER_084,transmit,sensor_data,success,1,70.63.188.112,11.12837,...,57.02,28.27,28.92,80.49,140.29,12.83,0,75.07,40.96,Normal agricultural operation
1,EVT_00002,2026-01-28 10:02,gateway,FARMER_024,transmit,sensor_data,success,1,83.52.40.84,20.68446,...,36.78,34.88,77.07,58.68,1641.49,26.74,0,64.35,33.34,Normal agricultural operation
2,EVT_00003,2026-01-20 19:04,weather_station,FARMER_082,read,sensor_data,success,0,20.207.186.132,21.24052,...,46.73,30.73,65.55,66.84,676.5,23.69,0,56.95,42.77,Normal agricultural operation
3,EVT_00004,2026-02-13 04:22,weather_station,FARMER_048,read,sensor_data,success,1,196.39.70.136,36.68598,...,29.98,35.7,64.56,49.21,1139.97,27.76,0,94.63,53.28,Normal agricultural operation
4,EVT_00005,2026-02-02 00:59,irrigation_controller,FARMER_028,update,irrigation_config,success,1,178.251.102.32,15.27967,...,65.71,29.72,100.0,71.68,795.49,1.32,0,63.4,32.19,Normal agricultural operation


In [45]:
ANOMALY_THRESHOLD = 60
df["label"] = (df["anomaly_score"] >= ANOMALY_THRESHOLD).astype(int)

print(df["label"].value_counts())

label
0    13656
1     1344
Name: count, dtype: int64


In [46]:
y = df["label"]

drop_cols = [
    "event_id",
    "timestamp",
    "reason",
    "anomaly_score",
    "label"
]

X = df.drop(columns=drop_cols)


In [47]:
categorical_cols = X.select_dtypes(include="object").columns.tolist()
numerical_cols = X.select_dtypes(exclude="object").columns.tolist()

In [48]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), categorical_cols),
        ("num", "passthrough", numerical_cols)
    ]
)


In [49]:
class IsolationForestFeature(BaseEstimator, TransformerMixin):
    def __init__(self, contamination=0.12, n_estimators=300, random_state=42):
        self.contamination = contamination
        self.n_estimators = n_estimators
        self.random_state = random_state

    def fit(self, X, y=None):
        self.model_ = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
            n_jobs=-1
        )
        self.model_.fit(X)
        return self

    def transform(self, X):
        scores = -self.model_.score_samples(X)
        return scores.reshape(-1, 1)


In [53]:
pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("iso_feature", IsolationForestFeature(contamination=0.12)),
    ("xgb", XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="logloss",
        scale_pos_weight=5,  # ðŸ”¥ recall booster
        random_state=42
    ))
])


In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [55]:
pipeline.fit(X_train, y_train)

In [56]:
y_proba = pipeline.predict_proba(X_test)[:, 1]

THRESHOLD = 0.25  # NOT 0.5 â€” THIS IS WHY YOUR RECALL WAS SHIT
y_pred = (y_proba >= THRESHOLD).astype(int)

In [58]:
print("Recall   :", recall_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Recall   : 0.7961538461538461
Precision: 0.33713355048859933
F1 Score : 0.47368421052631576

Confusion Matrix:
 [[2333  407]
 [  53  207]]


In [59]:
from sklearn.metrics import accuracy_score

In [60]:
print("Accuracy :", accuracy_score(y_test, y_pred))

Accuracy : 0.8466666666666667
