In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [43]:
df = pd.read_csv('data/final_data.csv')
df

Unnamed: 0,Crop,Season,State,Area,Production,Annual_Rainfall
0,Arecanut,Whole Year,Assam,73814.0,56708,High
1,Arhar/Tur,Kharif,Assam,6637.0,4685,High
2,Castor seed,Kharif,Assam,796.0,22,High
3,Cotton(lint),Kharif,Assam,1739.0,794,High
4,Dry chillies,Whole Year,Assam,13587.0,9073,High
...,...,...,...,...,...,...
18044,Small millets,Kharif,Nagaland,4000.0,2000,Mid
18045,Wheat,Rabi,Nagaland,1000.0,3000,Mid
18046,Maize,Kharif,Jammu and Kashmir,310883.0,440900,Mid
18047,Rice,Kharif,Jammu and Kashmir,275746.0,5488,Mid


In [44]:
X = df.drop('Production', axis=1)
y = df['Production']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train

Unnamed: 0,Crop,Season,State,Area,Annual_Rainfall
613,Garlic,Whole Year,Kerala,538.0,High
1282,Rapeseed &Mustard,Rabi,Karnataka,6048.0,Low
14323,Guar seed,Whole Year,Gujarat,199513.0,Low
6927,Maize,Rabi,Bihar,187399.0,Mid
11920,Maize,Kharif,Tripura,3948.0,High
...,...,...,...,...,...
11284,Linseed,Rabi,Madhya Pradesh,91173.0,Low
11964,Soyabean,Kharif,Uttar Pradesh,13679.0,Low
5390,Tobacco,Rabi,Andhra Pradesh,78051.0,Low
860,Arecanut,Whole Year,Kerala,93193.0,High


# PIPELINE

In [45]:
numeric_trf = Pipeline(steps=[
    ('log', FunctionTransformer(np.log1p, validate=True)),
    ('scaler', StandardScaler())
])

categoric_trf = Pipeline(steps=[
    ('ohe', OneHotEncoder(sparse_output=False, dtype=np.int32, drop='first', handle_unknown='ignore'))
])

ordinal_trf = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[['Low', 'Mid', 'High']], dtype=np.int32))
])

preprocessor = ColumnTransformer(transformers=[
    ('numeric_trf', numeric_trf, ['Area']), 
    ('categoric_trf', categoric_trf, ['Crop', 'Season', 'State']), 
    ('ordinal_trf', ordinal_trf, ['Annual_Rainfall'])
])

model = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('algo', RandomForestRegressor(
        n_estimators=620,
        max_depth=33,
        min_samples_split=8,
        min_samples_leaf=1,
        max_features=None,
        random_state=42,
        n_jobs=-1
    ))
])

final_model = TransformedTargetRegressor(
    regressor = model, 
    func = np.log1p, 
    inverse_func = np.expm1
)

In [46]:
final_model.fit(X_train, y_train)

0,1,2
,regressor,Pipeline(step...m_state=42))])
,transformer,
,func,<ufunc 'log1p'>
,inverse_func,<ufunc 'expm1'>
,check_inverse,True

0,1,2
,transformers,"[('numeric_trf', ...), ('categoric_trf', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,True
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.int32'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Low', 'Mid', ...]]"
,dtype,<class 'numpy.int32'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,620
,criterion,'squared_error'
,max_depth,33
,min_samples_split,8
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
y_pred = final_model.predict(X_test)
y_pred

array([2.85408712e+05, 2.87062256e+05, 6.02491092e+04, ...,
       2.12993915e+01, 7.30669114e+03, 1.72211829e+06], shape=(5957,))

In [54]:
r2_score(y_test, y_pred)

0.9373884323771414

In [48]:
# from sklearn.model_selection import cross_val_predict, KFold
# from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# import numpy as np

# def cross_val_regression_metrics(model, X, y, cv=10):
#     # KFold cross-validation
#     kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    
#     # Get cross-validated predictions
#     y_pred = cross_val_predict(model, X, y, cv=kf)
    
#     # Calculate metrics
#     r2 = r2_score(y, y_pred)
#     mae = mean_absolute_error(y, y_pred)
#     mse = mean_squared_error(y, y_pred)
#     rmse = np.sqrt(mse)
#     mape = np.mean(np.abs((y - y_pred) / y)) * 100
    
#     # Print results
#     print(f"Cross-Validated R² Score   : {r2:.4f}")
#     print(f"Cross-Validated MAE        : {mae:.4f}")
#     print(f"Cross-Validated MSE        : {mse:.4f}")
#     print(f"Cross-Validated RMSE       : {rmse:.4f}")
#     print(f"Cross-Validated MAPE (%)   : {mape:.2f}")
    
#     return {"R2": r2, "MAE": mae, "MSE": mse, "RMSE": rmse, "MAPE": mape}

# # Example usage:
# # results = cross_val_regression_metrics(final_model, X, y, cv=10)


In [49]:
# cross_val_regression_metrics(final_model, X, y)

In [50]:
# np.mean(cross_val_score(final_model, df.drop(['Production'], axis=1), df['Production'], cv = 10, scoring='r2'))

In [51]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint

# # Define parameter grid for Random Forest
# param_dist = {
#     'regressor__algo__n_estimators': randint(100, 1000),   # number of trees
#     'regressor__algo__max_depth': randint(5, 50),          # depth of each tree
#     'regressor__algo__min_samples_split': randint(2, 20),  # min samples to split node
#     'regressor__algo__min_samples_leaf': randint(1, 10),   # min samples per leaf
#     'regressor__algo__max_features': ['sqrt', 'log2', None] # features per split
# }

# # Setup RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=final_model,
#     param_distributions=param_dist,
#     n_iter=20,                  # number of random combinations
#     cv=5,                       # 5-fold CV
#     scoring='r2',               # optimize for R²
#     verbose=2,
#     n_jobs=-1,
#     random_state=42
# )

# # Fit search on data
# random_search.fit(df.drop(['Production'], axis=1), df['Production'])

# # Best parameters and score
# print("Best Parameters:", random_search.best_params_)
# print("Best CV R² Score:", random_search.best_score_)

# Export the model

In [53]:
# import joblib

# joblib.dump(final_model, 'models/crop_yield_model.pkl')

import joblib
joblib.dump(final_model, 'models/yield_model.joblib', compress=('lzma', 3))
print("Model saved successfully!")

Model saved successfully!
