# Fertilizer Recommendation

This notebook builds a machine learning model to recommend fertilizers based on soil and crop conditions.

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder,  OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Load dataset using relative path
df = pd.read_csv('../data/fertilizer_recommendation.csv')

# Rename columns to remove spaces and match code conventions
df.rename(columns={
    'Soil Type': 'Soil_Type', 
    'Crop Type': 'Crop_Type',
    'N': 'Nitrogen',
    'P': 'Phosphorous',
    'K': 'Potassium'
}, inplace=True)

df.head()

Unnamed: 0,Temperature,Humidity,Moisture,Soil_Type,Crop_Type,Nitrogen,Potassium,Phosphorous,Fertilizer
0,26.0,52.0,38.0,Sandy,Maize,37,0,0,Urea
1,29.0,52.0,45.0,Loamy,Sugarcane,12,0,36,DAP
2,34.0,65.0,62.0,Black,Cotton,7,9,30,14-35-14
3,32.0,62.0,34.0,Red,Tobacco,22,0,20,28-28
4,28.0,54.0,46.0,Clayey,Rice,35,0,0,Urea


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  8000 non-null   float64
 1   Humidity     8000 non-null   float64
 2   Moisture     8000 non-null   float64
 3   Soil_Type    8000 non-null   object 
 4   Crop_Type    8000 non-null   object 
 5   Nitrogen     8000 non-null   int64  
 6   Potassium    8000 non-null   int64  
 7   Phosphorous  8000 non-null   int64  
 8   Fertilizer   8000 non-null   object 
dtypes: float64(3), int64(3), object(3)
memory usage: 562.6+ KB


In [12]:
# Target Encoding
# We encode the target column 'Fertilizer' so the model can predict numerical classes.
# We will save this encoder later to decode predictions.
le_target = LabelEncoder()
df['Fertilizer'] = le_target.fit_transform(df['Fertilizer'])

# Show mapping
mapping = dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))
print("Target Mapping:", mapping)

Target Mapping: {'10-26-26': np.int64(0), '14-35-14': np.int64(1), '17-17-17': np.int64(2), '20-20': np.int64(3), '28-28': np.int64(4), 'DAP': np.int64(5), 'Urea': np.int64(6)}


In [13]:
# Split Data
X = df.drop('Fertilizer', axis=1)
y = df['Fertilizer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (6400, 8), Test shape: (1600, 8)


In [14]:
# Define Preprocessing Pipeline
# We explicitly identify categorical features to one-hot encode.
# This ensures the model can handle raw input strings for these columns.

categorical_cols = ['Soil_Type', 'Crop_Type']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

print("Categorical:", categorical_cols)
print("Numerical:", numerical_cols)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

Categorical: ['Soil_Type', 'Crop_Type']
Numerical: ['Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']


In [15]:
# Create Modeling Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [16]:
# Grid Search
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best params: {'classifier__max_depth': 20, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best score: 0.15359351339602764


In [17]:
# Evaluation
y_pred = grid_search.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.1525
              precision    recall  f1-score   support

           0       0.17      0.16      0.16       241
           1       0.16      0.15      0.16       241
           2       0.08      0.09      0.08       215
           3       0.19      0.14      0.16       236
           4       0.13      0.13      0.13       219
           5       0.18      0.18      0.18       239
           6       0.17      0.22      0.19       209

    accuracy                           0.15      1600
   macro avg       0.15      0.15      0.15      1600
weighted avg       0.15      0.15      0.15      1600



In [21]:
# Save Models
# 1. Save the trained pipeline (includes preprocessing and model)
pickle.dump(grid_search.best_estimator_, open('fertilizer_recommendation_pipeline.pkl', 'wb'))

# 2. Save the target label encoder (to decode predictions back to strings)
pickle.dump(le_target, open('fertilizer_target_encoder.pkl', 'wb'))

print("Models saved successfully.")

Models saved successfully.


In [38]:
# Example Inference
# Notice we can pass raw 'Sandy', 'Sugarcane' strings thanks to the pipeline!

input_data = pd.DataFrame([{
    'Temperature': 20,
    'Humidity': 80,
    'Moisture': 25,
    'Soil_Type': 'Sandy',
    'Crop_Type': 'Sugarcane',
    'Nitrogen': 90,
    'Potassium': 42,
    'Phosphorous': 43
}])

# Load mapping to print result
loaded_pipeline = pickle.load(open('../models/fertilizer_recommendation_pipeline.pkl', 'rb'))
loaded_encoder = pickle.load(open('../models/fertilizer_target_encoder.pkl', 'rb'))

prediction_idx = loaded_pipeline.predict(input_data)[0]
prediction_label = loaded_encoder.inverse_transform([prediction_idx])[0]

print(f"Predicted Fertilizer ID: {prediction_idx}")
print(f"Predicted Fertilizer Name: {prediction_label}")

Predicted Fertilizer ID: 2
Predicted Fertilizer Name: 17-17-17
