In [10]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [11]:
# Location of the crop-production CSV. The default is the path this notebook
# was originally written against; set the CROP_PRODUCTION_CSV environment
# variable to run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "CROP_PRODUCTION_CSV",
    "D:/RBL PROJECT/crop_production.csv/crop_production.csv",
)

# Read the CSV file and drop rows whose target value (Production) is missing,
# since they cannot be used to fit or score a supervised model.
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["Production"])

# Display the first few rows
df.head()


Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [12]:
# Fill any remaining gaps in numeric columns with that column's median.
# NOTE: the medians are taken over the whole frame, i.e. before any
# train/test split is made.
numeric_medians = df.median(numeric_only=True)
df.fillna(numeric_medians, inplace=True)

# Feature groups consumed by the preprocessing step.
# Crop_Year is deliberately treated as a number, not as a category.
numerical_cols = ["Area", "Crop_Year"]
categorical_cols = ["State_Name", "District_Name", "Season", "Crop"]

In [13]:
# Column-wise preprocessing:
#   * numeric features are standardised with StandardScaler;
#   * categorical features are one-hot encoded, and handle_unknown="ignore"
#     lets transform() accept categories that were not present during fit.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [14]:
# Target is the Production column; every other column is kept as a candidate feature.
y = df["Production"]
X = df.drop(columns=["Production"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Stand-alone random forest with default hyper-parameters.
# NOTE(review): not referenced by any other cell visible here.
rf = RandomForestRegressor(random_state=42)

In [15]:
# Search space for the random forest.
# 4 * 4 * 3 * 3 = 144 distinct combinations in total.
param_dist = dict(
    n_estimators=np.arange(10, 50, 10),  # 10, 20, 30, 40
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomized search: evaluates only n_iter sampled combinations instead of the
# full grid, scoring each with 3-fold cross-validated R².
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,        # 10 sampled candidates rather than all 144
    scoring="r2",
    cv=3,             # 3-fold cross-validation
    random_state=42,  # repeatable candidate sampling
    n_jobs=-1,        # use all available cores
    verbose=1,
)

In [16]:
# End-to-end model: preprocess -> select features -> tuned random forest.
#
# The forest that ranks features for SelectFromModel is seeded like every
# other stochastic component in this notebook; without a seed the set of
# selected features (and therefore the saved model and its scores) can change
# from one run to the next. threshold="median" keeps the features whose
# importance is at or above the median importance.
#
# NOTE: because the randomized search is the final pipeline step, the
# preprocessor and feature selector are fitted once on all rows passed to
# fit(), and cross-validation then runs on the already-transformed data.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    (
        "feature_selection",
        SelectFromModel(
            RandomForestRegressor(n_estimators=50, random_state=42),
            threshold="median",
        ),
    ),
    ("model", random_search),
])

In [17]:
# Same 80/20 split with the same seed as the earlier split cell; re-running it
# on unchanged X and y reproduces the identical partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fit every pipeline step on the training split only. The final step is the
# randomized hyper-parameter search, so this call also runs the cross-validated
# search (hence the "Fitting 3 folds ..." log line).
pipeline.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [19]:
# Make predictions on the held-out test split (rows not used by pipeline.fit)
y_pred = pipeline.predict(X_test)

In [20]:
# Score the test-set predictions with R² (coefficient of determination).
# The printed figure is R² * 100; it is a goodness-of-fit measure for the
# regression, not a classification accuracy.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f" Prediction Accuracy: {r2*100:.4f}")

 Prediction Accuracy: 93.0341


In [21]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Compare R² on the data the model was fitted on with R² on unseen data;
# a large gap between the two indicates over-fitting. (Best possible R² is 1.)
y_train_pred = pipeline.predict(X_train)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

y_test_pred = pipeline.predict(X_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

# Both values are reported as R² * 100.
print(f"Training R² Score: {train_r2*100:.4f}")
print(f"Testing R² Score: {test_r2*100:.4f}")

Training R² Score: 97.5454
Testing R² Score: 93.0341


In [22]:
import os
import joblib

# Where the fitted pipeline is stored. The folder is created up front so the
# dump below cannot fail on a missing directory.
joblib_filename = "models/crop_production_model.joblib"
os.makedirs("models", exist_ok=True)

# Persist the whole fitted pipeline (preprocessing, feature selection and the
# tuned model) in one file. compress=9 asks for the strongest compression,
# trading save/load speed for a smaller file.
joblib.dump(value=pipeline, filename=joblib_filename, compress=9)

['models/crop_production_model.joblib']

In [23]:
# from google.colab import files

# # Download the saved model file (only works when running in Google Colab)
# files.download("models/crop_production_model.joblib")