In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Load the crop-production records and discard rows with a missing Production value
df = pd.read_csv("../datasets/crop_production.csv").dropna(subset=["Production"])
# Preview the first few rows
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [3]:
# Fill any remaining missing numeric values with the median of their column.
# `df` is rebound instead of mutated with inplace=True so the step is an ordinary
# assignment and no other reference to the frame is silently modified.
df = df.fillna(df.median(numeric_only=True))

# Define categorical and numerical columns
categorical_cols = ["State_Name", "District_Name", "Season", "Crop"]
numerical_cols = ["Area", "Crop_Year"]  # Crop_Year is numerical, not categorical!

In [4]:
# Column-wise preprocessing:
#   - numeric columns are standardised,
#   - categorical columns are one-hot encoded; categories not seen while fitting are ignored.
numeric_transformer = ("num", StandardScaler(), numerical_cols)
categorical_transformer = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(
    transformers=[numeric_transformer, categorical_transformer]
)

In [5]:
# Separate the target (Production) from the predictor columns
y = df["Production"]
X = df.drop(columns=["Production"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Seeded random forest regressor
rf = RandomForestRegressor(random_state=42)

In [6]:
# Hyper-parameter candidates for the random forest.
# The full grid would be 4 * 4 * 3 * 3 = 144 combinations.
param_dist = dict(
    n_estimators=np.arange(40, 80, 10),  # 40, 50, 60, 70
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomised search: only n_iter=10 of the 144 candidate combinations are tried,
# each scored by R² with 3-fold cross-validation, using all available cores.
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)


In [7]:
# End-to-end model: preprocess -> keep features whose importance reaches the median
# importance -> randomised hyper-parameter search over a random forest.
#
# The forest used for feature selection is seeded with the same random_state (42) as
# every other stochastic step in this notebook; without it the selected feature set,
# and therefore the final R², changes from run to run.
#
# NOTE(review): the preprocessor and the feature selector are fitted once on the whole
# training set before the search splits it into folds, so the cross-validation scores
# reported by the search are computed on already-selected features — confirm this is
# acceptable or move the search around the whole pipeline.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    (
        "feature_selection",
        SelectFromModel(
            RandomForestRegressor(n_estimators=50, random_state=42),
            threshold="median",
        ),
    ),
    ("model", random_search),
])


In [8]:
# Re-create the 80/20 hold-out split; the arguments and seed match the earlier split,
# so the resulting partitions are the same.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train every pipeline step on the training partition
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict Production for the held-out test partition
y_pred = pipeline.predict(X=X_test)

In [None]:
# Score the held-out predictions with the coefficient of determination (R²)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² Score: {r2:.4f}")
