In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report)
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

import joblib
import time
from collections import defaultdict

# One seed shared by NumPy's global RNG and by every estimator configured below.
SEED = 42
np.random.seed(SEED)

# Section banner for the notebook output (rule, title, rule).
print("=" * 80, "CUSTOM RANDOM FOREST VARIANTS", "=" * 80, sep="\n")

class CascadeRandomForest(ClassifierMixin, BaseEstimator):
    """Cascade of random forests trained on progressively harder samples.

    Layer 1 is fitted on the full training set. Every following layer is
    fitted only on the training rows that the previous layer still
    misclassified. Training stops when ``n_layers`` forests exist, when a
    layer classifies all of its training rows correctly, or when fewer than
    10 misclassified rows remain.

    Predictions are a weighted average of the per-layer class probabilities;
    layer ``i`` (0-based) out of ``L`` fitted layers gets a weight
    proportional to ``2 ** (L - i - 1)``, so earlier layers count more.

    Parameters
    ----------
    n_layers : int
        Maximum number of forests in the cascade.
    n_estimators_per_layer : int
        Number of trees in each forest.
    max_depth : int
        Maximum tree depth passed to each forest.
    min_samples_split : int
        ``min_samples_split`` passed to each forest.
    random_state : int or None
        Base seed; layer ``k`` (0-based) uses ``random_state + k`` when the
        seed is an integer, otherwise the value is passed through unchanged.

    Attributes
    ----------
    classes_, n_classes_ : set by ``fit``.
    layers : list of fitted ``RandomForestClassifier``.
    feature_importances_ : mean of the per-layer importances (``None`` before fit).
    """

    def __init__(self, n_layers=3, n_estimators_per_layer=50, max_depth=15, min_samples_split=5, random_state=42):
        self.n_layers = n_layers
        self.n_estimators_per_layer = n_estimators_per_layer
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_state = random_state
        # Kept for backward compatibility with code that reads these before fit();
        # both are rebuilt from scratch on every fit().
        self.layers = []
        self.feature_importances_ = None

    def _make_forest(self, seed_offset):
        """Build an unfitted forest for one layer, offsetting integer seeds per layer."""
        seed = self.random_state
        if isinstance(seed, (int, np.integer)):
            seed = seed + seed_offset
        return RandomForestClassifier(
            n_estimators=self.n_estimators_per_layer,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            random_state=seed,
            n_jobs=-1
        )

    def fit(self, X, y):
        """Fit the cascade on ``X``/``y`` and return ``self``.

        Progress is reported with ``print``. Any state from a previous call
        to ``fit`` is discarded first.
        """
        X, y = check_X_y(X, y)
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        # Reset learned state: without this a second fit() would append new
        # layers after the old ones and average importances over both fits.
        self.layers = []
        self.feature_importances_ = np.zeros(X.shape[1])

        print("\n[Layer 1] Training Random Forest on all data...")
        first_forest = self._make_forest(0)
        first_forest.fit(X, y)
        self.layers.append(first_forest)
        self.feature_importances_ += first_forest.feature_importances_

        wrong = first_forest.predict(X) != y
        if not wrong.any():
            print("All instances correctly classified in first layer!")
            return self

        X_misclassified = X[wrong]
        y_misclassified = y[wrong]

        print(f"  Misclassified instances: {len(X_misclassified)}/{len(X)} ({len(X_misclassified)/len(X)*100:.1f}%)")

        for layer_idx in range(1, self.n_layers):
            if len(X_misclassified) < 10:
                print(f"\nStopping early: Only {len(X_misclassified)} misclassified instances remain")
                break

            print(f"\n[Layer {layer_idx+1}] Training Random Forest on misclassified instances...")
            print(f"  Training on {len(X_misclassified)} instances")

            forest = self._make_forest(layer_idx)
            forest.fit(X_misclassified, y_misclassified)
            self.layers.append(forest)
            self.feature_importances_ += forest.feature_importances_

            still_wrong = forest.predict(X_misclassified) != y_misclassified
            if not still_wrong.any():
                print("✓ All remaining instances correctly classified!")
                break

            X_misclassified = X_misclassified[still_wrong]
            y_misclassified = y_misclassified[still_wrong]

            print(f"  Still misclassified: {len(X_misclassified)} instances")

        # Report the mean importance over however many layers were actually trained.
        self.feature_importances_ /= len(self.layers)
        return self

    def predict_proba(self, X):
        """Return class probabilities of shape ``(n_samples, n_classes_)``.

        Columns follow ``self.classes_``. A class that a later layer never
        saw during training contributes zero probability from that layer.
        """
        # Name an attribute that only fit() creates; the bare check would pass
        # on an unfitted estimator because __init__ sets feature_importances_.
        check_is_fitted(self, "classes_")
        X = check_array(X)

        n_fitted = len(self.layers)
        raw_weights = 2.0 ** np.arange(n_fitted - 1, -1, -1)
        weights = raw_weights / raw_weights.sum()

        proba = np.zeros((X.shape[0], self.n_classes_))
        for weight, layer in zip(weights, self.layers):
            # Each layer's classes are a subset of the sorted self.classes_,
            # so searchsorted yields the exact target column of every class.
            columns = np.searchsorted(self.classes_, layer.classes_)
            proba[:, columns] += weight * layer.predict_proba(X)

        # Renormalise rows; guard against division by zero for all-zero rows.
        proba_sum = proba.sum(axis=1, keepdims=True)
        proba_sum[proba_sum == 0] = 1
        return proba / proba_sum

    def predict(self, X):
        """Return, for each row of ``X``, the class with the highest averaged probability."""
        proba = self.predict_proba(X)
        return self.classes_[np.argmax(proba, axis=1)]

class HierarchicalRandomForest(ClassifierMixin, BaseEstimator):
    """Global random forest blended with per-cluster specialist forests.

    ``fit`` partitions the feature space with KMeans, trains one forest on
    all rows and one additional forest on the rows of each cluster that has
    at least 10 training samples. At prediction time every row receives
    0.25 x the global probabilities plus 0.75 x the probabilities of the
    forest belonging to its cluster (if that cluster has one); rows are then
    renormalised, so rows in a cluster without a local forest end up with
    the global probabilities.

    Parameters
    ----------
    n_clusters : int
        Number of KMeans clusters.
    n_estimators_global : int
        Number of trees in the global forest.
    n_estimators_local : int
        Number of trees in each per-cluster forest.
    max_depth : int
        Maximum tree depth for all forests.
    random_state : int or None
        Seed for KMeans and the global forest; cluster ``k`` uses
        ``random_state + k`` when the seed is an integer, otherwise the
        value is passed through unchanged.

    Attributes
    ----------
    classes_, n_classes_ : set by ``fit``.
    kmeans : fitted ``KMeans`` (``None`` before fit).
    global_rf : fitted global forest (``None`` before fit).
    cluster_models : dict mapping cluster id -> fitted local forest.
    feature_importances_ : mean importance over the global and local forests.
    """

    def __init__(self, n_clusters=3, n_estimators_global=50, n_estimators_local=30, max_depth=12, random_state=42):
        self.n_clusters = n_clusters
        self.n_estimators_global = n_estimators_global
        self.n_estimators_local = n_estimators_local
        self.max_depth = max_depth
        self.random_state = random_state
        # Kept for backward compatibility with code that reads these before fit();
        # all of them are rebuilt on every fit().
        self.global_rf = None
        self.cluster_models = {}
        self.kmeans = None
        self.feature_importances_ = None

    def _seed_for(self, offset):
        """Return the seed for a sub-model: integer seeds are offset, others passed through."""
        seed = self.random_state
        if isinstance(seed, (int, np.integer)):
            return seed + offset
        return seed

    def fit(self, X, y):
        """Cluster ``X``, fit the global and per-cluster forests, and return ``self``.

        Progress is reported with ``print``. Any state from a previous call
        to ``fit`` is discarded first.
        """
        X, y = check_X_y(X, y)
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        # Drop local forests from an earlier fit(); otherwise a cluster that is
        # skipped this time would still be served by a stale model.
        self.cluster_models = {}

        print(f"\n[Hierarchical RF] Training with {self.n_clusters} clusters...")

        print("  Clustering feature space")
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state, n_init=10)
        clusters = self.kmeans.fit_predict(X)
        print(f"  Cluster distribution: {np.bincount(clusters)}")

        print("  Training global Random Forest")
        self.global_rf = RandomForestClassifier(
            n_estimators=self.n_estimators_global,
            max_depth=self.max_depth,
            random_state=self.random_state,
            n_jobs=-1
        )
        self.global_rf.fit(X, y)
        self.feature_importances_ = self.global_rf.feature_importances_.copy()

        print("  Training specialized forests per cluster")
        for cluster_id in range(self.n_clusters):
            cluster_mask = (clusters == cluster_id)
            cluster_size = cluster_mask.sum()

            # Too few rows to learn a useful specialist; the global forest covers them.
            if cluster_size < 10:
                print(f"    Cluster {cluster_id}: Skipped (only {cluster_size} samples)")
                continue

            X_cluster = X[cluster_mask]
            y_cluster = y[cluster_mask]

            print(f"    Cluster {cluster_id}: Training on {len(X_cluster)} samples ({len(np.unique(y_cluster))} classes)")

            cluster_rf = RandomForestClassifier(
                n_estimators=self.n_estimators_local,
                max_depth=self.max_depth,
                random_state=self._seed_for(cluster_id),
                n_jobs=-1
            )
            cluster_rf.fit(X_cluster, y_cluster)
            self.cluster_models[cluster_id] = cluster_rf
            self.feature_importances_ += cluster_rf.feature_importances_

        # Mean over the global forest plus every local forest that was trained.
        self.feature_importances_ /= (1 + len(self.cluster_models))
        return self

    def predict_proba(self, X):
        """Return blended class probabilities of shape ``(n_samples, n_classes_)``.

        Columns follow ``self.classes_``. A class missing from a cluster's
        training data gets zero probability from that cluster's forest.
        """
        # Name an attribute that only fit() creates; the bare check would pass
        # on an unfitted estimator because __init__ sets feature_importances_.
        check_is_fitted(self, "classes_")
        X = check_array(X)

        clusters = self.kmeans.predict(X)

        # Blend weights: the local specialist dominates wherever one exists.
        global_weight, local_weight = 0.25, 0.75

        proba = np.zeros((X.shape[0], self.n_classes_))
        # Sub-model classes are subsets of the sorted self.classes_, so
        # searchsorted yields the exact target column for every class.
        global_columns = np.searchsorted(self.classes_, self.global_rf.classes_)
        proba[:, global_columns] = global_weight * self.global_rf.predict_proba(X)

        for cluster_id, cluster_model in self.cluster_models.items():
            cluster_mask = (clusters == cluster_id)
            if not cluster_mask.any():
                continue

            local_columns = np.searchsorted(self.classes_, cluster_model.classes_)
            local_proba = np.zeros((int(cluster_mask.sum()), self.n_classes_))
            local_proba[:, local_columns] = local_weight * cluster_model.predict_proba(X[cluster_mask])
            proba[cluster_mask] += local_proba

        # Renormalise rows; guard against division by zero for all-zero rows.
        proba_sum = proba.sum(axis=1, keepdims=True)
        proba_sum[proba_sum == 0] = 1
        return proba / proba_sum

    def predict(self, X):
        """Return, for each row of ``X``, the class with the highest blended probability."""
        proba = self.predict_proba(X)
        return self.classes_[np.argmax(proba, axis=1)]

# Load dataset
data = pd.read_csv('/kaggle/input/crop-dataset/crop_dataset.csv')
print(f"Dataset loaded successfully")
print(f"  Shape: {data.shape[0]} rows × {data.shape[1]} columns")

# Data preprocessing
print("\nPreprocessing data...")

# Spreadsheet error markers that can appear as cell text in the CSV; they are
# treated as missing values.
error_values = ['#DIV/0!', '#N/A', '#VALUE!', '#REF!', '#NAME?', '#NUM!', '#NULL!']
for col in data.columns:
    # Covers both object and pandas string dtypes; numeric columns need no repair.
    if not pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].replace(error_values, np.nan)
        try:
            # A column that was text only because of error markers becomes
            # numeric here. errors='ignore' is deprecated in pandas, so the
            # "leave it alone" case is handled explicitly instead.
            data[col] = pd.to_numeric(data[col])
        except (ValueError, TypeError):
            # Genuinely categorical column: keep it as text for label encoding.
            pass

data_cleaned = data.dropna()
if len(data) != len(data_cleaned):
    print(f"  Removed {len(data) - len(data_cleaned)} rows with missing values/errors")
    # Copy so that later column assignments modify an independent frame
    # rather than a derived one.
    data = data_cleaned.copy()

# Integer-encode each categorical column that is present and keep the fitted
# encoder so the mapping can be reused or inverted later.
label_encoders = {}
categorical_cols = ['District', 'Season', 'Crop Name', 'Transplant', 'Growth', 'Harvest']

for col in categorical_cols:
    if col not in data.columns:
        continue
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
    print(f"  Encoded '{col}' with {len(le.classes_)} classes")

print("\nFeature engineering...")

# (new column, first source column, second source column, combiner).
# A feature is added only when both of its source columns exist.
derived_feature_specs = [
    ('Temp_Range', 'Max Temp', 'Min Temp', lambda hi, lo: hi - lo),
    ('Humidity_Range', 'Max Relative Humidity', 'Min Relative Humidity', lambda hi, lo: hi - lo),
    ('Temp_Humidity_Index', 'Avg Temp', 'Avg Humidity', lambda temp, hum: temp * hum / 100),
]
for new_col, first_col, second_col, combine in derived_feature_specs:
    if first_col in data.columns and second_col in data.columns:
        data[new_col] = combine(data[first_col], data[second_col])
        print(f"  Added: {new_col}")

X = data.drop(['Crop Name'], axis=1)
y = data['Crop Name']

# Season labels are split alongside X and y so they stay row-aligned.
season_data = None
if 'Season' in data.columns:
    season_data = data['Season'].values.copy()

# Split BEFORE scaling: fitting the scaler on all rows would let the
# held-out test rows influence the preprocessing statistics (data leakage).
# The row assignment depends only on y and the seed, so the same rows land
# in train and test as when the already-scaled matrix was split.
split_inputs = [X, y] if season_data is None else [X, y, season_data]
split_parts = train_test_split(
    *split_inputs, test_size=0.2, random_state=SEED, stratify=y
)
X_train_raw, X_test_raw, y_train, y_test = split_parts[:4]
if season_data is not None:
    season_train, season_test = split_parts[4:]
else:
    season_train, season_test = None, None

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole dataset transformed with the training-set statistics; kept because
# cross-validation further down consumes X_scaled.
X_scaled = scaler.transform(X)

print(f"\nData split:")
print(f"  Training set: {X_train.shape[0]} samples")
print(f"  Test set: {X_test.shape[0]} samples")
print(f"  Features: {X_train.shape[1]}")
print(f"  Classes: {len(np.unique(y))}")

print("\n" + "="*80)
print("TRAINING AND EVALUATING MODELS")
print("="*80)

# Candidate estimators keyed by display name; insertion order is the order in
# which they are trained and reported.
models = {}

# Baseline: a single stock random forest.
models['Standard Random Forest'] = RandomForestClassifier(
    n_estimators=100, max_depth=15, random_state=SEED, n_jobs=-1, verbose=0
)

# Up to four forests, each later one fitted on the previous layer's mistakes.
models['Cascade Random Forest'] = CascadeRandomForest(
    n_layers=4, n_estimators_per_layer=80, max_depth=18, random_state=SEED
)

# One global forest blended with a specialist forest per KMeans cluster.
models['Hierarchical Random Forest'] = HierarchicalRandomForest(
    n_clusters=5, n_estimators_global=80, n_estimators_local=60, max_depth=18, random_state=SEED
)

# Season labels exposed under their "temporal" names. None of the models
# defined in `models` takes temporal data, so the loop below does not use them.
temporal_data_train = season_train
temporal_data_test = season_test

results = {}        # model name -> dict of metrics
predictions = {}    # model name -> (predicted labels, predicted probabilities)
training_times = {} # model name -> seconds spent in fit()

# Every metric recorded per model. A model that fails gets NaN for each one,
# so it sorts last in every ranking instead of posing as a real score of 0
# (or as the "fastest" model with a training time of 0).
metric_names = [
    'Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1',
    'CV Accuracy Mean', 'CV Accuracy Std', 'Training Time'
]

print("\nTraining models...")
print("-" * 60)

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training: {name}")
    print(f"{'='*50}")

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()

    try:
        model.fit(X_train, y_train)

        train_time = time.perf_counter() - start_time
        training_times[name] = train_time

        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)
        predictions[name] = (y_pred, y_pred_proba)

        # Weighted averages account for class imbalance; zero_division=0
        # silences the warning for classes that were never predicted.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        # 3-fold CV on the full scaled dataset; cross_val_score clones the
        # estimator, so the fitted `model` above is left untouched.
        cv_scores = cross_val_score(model, X_scaled, y, cv=3, scoring='accuracy', n_jobs=-1)
        cv_accuracy_mean = cv_scores.mean()
        cv_accuracy_std = cv_scores.std()

        results[name] = {
            'Test Accuracy': accuracy,
            'Test Precision': precision,
            'Test Recall': recall,
            'Test F1': f1,
            'CV Accuracy Mean': cv_accuracy_mean,
            'CV Accuracy Std': cv_accuracy_std,
            'Training Time': train_time
        }

        print(f"  ✓ Training completed in {train_time:.2f}s")
        print(f"  Test Accuracy: {accuracy:.4f}")
        print(f"  CV Accuracy: {cv_accuracy_mean:.4f} ± {cv_accuracy_std:.4f}")

        if hasattr(model, 'feature_importances_'):
            # Column indices of the five most important features, best first.
            top_features = np.argsort(model.feature_importances_)[-5:][::-1]
            print(f"  Top 5 features by importance: {top_features}")

    except Exception as e:
        # Best effort: report the failure and keep evaluating the other models.
        print(f"  ✗ Error training {name}: {str(e)}")
        results[name] = dict.fromkeys(metric_names, np.nan)

# One row per model, one column per metric, best test accuracy first.
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('Test Accuracy', ascending=False)

print("\nPERFORMANCE COMPARISON:")
print("-" * 80)
print(results_df.round(4).to_string())

print("\nSUMMARY:")
print("-" * 80)
best_model_name = results_df.index[0]
best_accuracy = results_df.loc[best_model_name, 'Test Accuracy']
print(f"Best Model: {best_model_name}")
print(f"Best Test Accuracy: {best_accuracy:.4f}")

print("\nRANKING BY METRIC:")
print("-" * 80)

for metric in ['Test Accuracy', 'Test F1', 'Training Time']:
    # Training time is the only metric where a smaller value ranks higher.
    lower_is_better = (metric == 'Training Time')
    sorted_df = results_df.sort_values(metric, ascending=lower_is_better)
    print(f"{metric}:")
    # `ranked_name` rather than `model`, so the fitted estimator bound to
    # `model` by the training loop is not overwritten with a string.
    for rank, (ranked_name, value) in enumerate(sorted_df[metric].items(), start=1):
        print(f"  {rank}. {ranked_name}: {value:.4f}")
    print()

print("\nExporting models and preprocessing artifacts...")

# In-memory manifest of everything produced by this run; 'models' maps each
# model's display name to the file it was written to.
model_artifacts = dict(
    scaler=scaler,
    label_encoders=label_encoders,
    best_model_name=best_model_name,
    models={},
)

for name, model in models.items():
    try:
        # "Cascade Random Forest" -> "cascade_random_forest_model.joblib"
        filename = "_".join(name.split(' ')).lower() + "_model.joblib"
        joblib.dump(model, filename)
        model_artifacts['models'][name] = filename
        print(f"✓ Exported {name} to {filename}")
    except Exception as e:
        # Best effort: a model that cannot be serialised is reported and skipped.
        print(f"✗ Error exporting {name}: {str(e)}")

# Preprocessing objects are written next to the model files.
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(label_encoders, 'label_encoders.joblib')

# Column names when X carries them, positional indices otherwise.
if hasattr(X, 'columns'):
    feature_columns = list(X.columns)
else:
    feature_columns = list(range(X.shape[1]))

config = {
    'feature_columns': feature_columns,
    'target_column': 'Crop Name',
    'categorical_columns': categorical_cols,
    'model_files': model_artifacts['models']
}

joblib.dump(config, 'model_config.joblib')

print("\n" + "="*80)
print("RECOMMENDATIONS")

# Headline: the model with the highest test accuracy.
best_row = results_df.loc[best_model_name]
print(f"Primary Model for Deployment: {best_model_name}")
print(f"   • Accuracy: {best_row['Test Accuracy']:.4f}")
print(f"   • F1-Score: {best_row['Test F1']:.4f}")

# Every other model, in the order of results_df (best test accuracy first).
print(f"\nAlternative Models (if primary fails):")
for model_name, row in results_df.iterrows():
    if model_name == best_model_name:
        continue
    acc, f1 = row['Test Accuracy'], row['Test F1']
    print(f"   • {model_name}: Accuracy={acc:.4f}, F1={f1:.4f}")

# Files recorded in the export manifest.
print(f"\nFiles available for application use:")
for name, filename in model_artifacts['models'].items():
    print(f"   • {filename} - {name}")

print("\n" + "="*80)
print("MODELS TRAINED AND EXPORTED SUCCESSFULLY")

CUSTOM RANDOM FOREST VARIANTS
Dataset loaded successfully
  Shape: 4608 rows × 15 columns

Preprocessing data...
  Removed 418 rows with missing values/errors
  Encoded 'District' with 64 classes
  Encoded 'Season' with 3 classes
  Encoded 'Crop Name' with 72 classes
  Encoded 'Transplant' with 19 classes
  Encoded 'Growth' with 32 classes
  Encoded 'Harvest' with 34 classes

Feature engineering...
  Added: Temp_Range
  Added: Humidity_Range
  Added: Temp_Humidity_Index

Data split:
  Training set: 3352 samples
  Test set: 838 samples
  Features: 17
  Classes: 72

TRAINING AND EVALUATING MODELS

Training models...
------------------------------------------------------------

Training: Standard Random Forest
  ✓ Training completed in 0.49s
  Test Accuracy: 0.9642
  CV Accuracy: 0.9654 ± 0.0035
  Top 5 features by importance: [ 8  7 16  4  6]

Training: Cascade Random Forest

[Layer 1] Training Random Forest on all data...
  Misclassified instances: 30/3352 (0.9%)

[Layer 2] Training Ran

In [3]:
# Shell escape: recursively (-r) archive /kaggle/working into exported_files.zip.
!zip -r exported_files.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/cascade_random_forest_model.joblib (deflated 96%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/.virtual_documents/__notebook_source__.ipynb (deflated 79%)
  adding: kaggle/working/standard_random_forest_model.joblib (deflated 96%)
  adding: kaggle/working/scaler.joblib (deflated 28%)
  adding: kaggle/working/label_encoders.joblib (deflated 58%)
  adding: kaggle/working/model_config.joblib (deflated 49%)
  adding: kaggle/working/hierarchical_random_forest_model.joblib (deflated 95%)
