# Wheat Growth Suitability Prediction 

In [19]:
import pandas as pd

# Relative path: resolved against the kernel's current working directory.
file_path = 'data.csv'
# Load the whole dataset; later cells rebind and extend `df`.
df = pd.read_csv(file_path)

In [20]:
# Preview the first rows to sanity-check column names and value scales.
df.head()

Unnamed: 0,Region,Nitrogen_N_mg/kg,Phosphorus_P_mg/kg,Potassium_K_mg/kg,Temperature_C,Humidity_%,Rainfall_mm,Soil_Moisture_%,LDR
0,Punjab,750.965158,28.193154,191.991431,24.48,53.64,3.09,27.84436,452.35
1,Punjab,784.305016,27.764688,254.701099,21.31,53.2,16.07,34.519483,309.73
2,Punjab,794.938568,15.75859,149.447401,25.24,33.07,4.6,26.836535,453.07
3,Punjab,992.682541,32.693723,182.747921,29.62,55.05,23.1,35.164865,322.42
4,Punjab,686.475356,8.605565,201.999587,20.83,70.99,40.76,24.459244,477.59


In [21]:
# 'Region' is a text label that is neither a model feature nor part of the
# labelling rule, so it is removed up front.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['Region'], errors='ignore')

In [22]:
# Inclusive (low, high) bounds every reading must satisfy for a row to be
# labelled suitable. Checks run in this order and stop at the first failure.
WHEAT_OPTIMAL_RANGES = {
    'Soil_Moisture_%': (20, 40),
    'Temperature_C': (15, 25),
    'Humidity_%': (50, 70),
    'Rainfall_mm': (50, 200),
    'Nitrogen_N_mg/kg': (600, 1000),
    'Potassium_K_mg/kg': (150, 300),
    'Phosphorus_P_mg/kg': (30, 60),
}


def classify_suitability(row, ranges=None):
    """Label one observation as suitable (1) or not suitable (0) for wheat.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (a DataFrame row Series or a dict)
        that holds the readings named in ``ranges``.
    ranges : dict, optional
        Maps column name to an inclusive ``(low, high)`` pair. Defaults to
        ``WHEAT_OPTIMAL_RANGES``. An empty mapping imposes no constraint, so
        every row is then labelled 1.

    Returns
    -------
    int
        1 when every listed reading lies inside its range, otherwise 0.
        A NaN reading fails its comparison and therefore yields 0.

    Raises
    ------
    KeyError
        If a column is missing from ``row`` and no earlier check has already
        failed (evaluation short-circuits on the first out-of-range value).
    """
    bounds = WHEAT_OPTIMAL_RANGES if ranges is None else ranges
    within_all = all(
        low <= row[column] <= high
        for column, (low, high) in bounds.items()
    )
    return 1 if within_all else 0

# Build the binary target column by evaluating the rule on every row
# (axis=1 hands each row to classify_suitability, indexable by column name).
df['Wheat_Suitability'] = df.apply(classify_suitability, axis=1)

- A sample is labelled "suitable" (1) only if all seven environmental and soil readings fall within predefined optimal ranges (bounds inclusive): soil moisture 20–40 %, temperature 15–25 °C, humidity 50–70 %, rainfall 50–200 mm, nitrogen 600–1000 mg/kg, potassium 150–300 mg/kg, and phosphorus 30–60 mg/kg.
- Otherwise it is labelled "not suitable" (0), meaning at least one of these parameters falls outside its optimal range.

In [23]:
# Check the class balance of the rule-derived target before modelling.
df['Wheat_Suitability'].value_counts()

Wheat_Suitability
0    4977
1      23
Name: count, dtype: int64

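- The classes are heavily imbalanced (23 suitable vs. 4,977 not suitable, about 0.5 % positives), so SMOTE is used to oversample the minority class (1). It is applied to the training split only, leaving the test set at its original class ratio.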

In [24]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Split data
# Features: every remaining column except the target and 'LDR'
# (LDR is also not part of the labelling rule).
X = df.drop(columns=['LDR', 'Wheat_Suitability'])  
y = df['Wheat_Suitability']
# stratify=y keeps the rare positive class represented in both splits;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample only the training split so that no synthetic rows end up in the test set.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [25]:
# Verify the class counts of the training labels after resampling.
print(pd.Series(y_train_resampled).value_counts())

Wheat_Suitability
0    3982
1    3982
Name: count, dtype: int64


### Training Model:

In [26]:
# Train classifier on resampled data
clf = RandomForestClassifier(random_state=42)  # fixed seed for reproducible results
clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test split, which was never resampled and keeps the original class ratio.
y_pred = clf.predict(X_test)

In [27]:
# classification_report returns a multi-line string, so print() is needed to render its layout.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       995
           1       0.57      0.80      0.67         5

    accuracy                           1.00      1000
   macro avg       0.79      0.90      0.83      1000
weighted avg       1.00      1.00      1.00      1000



In [28]:
# Rows are true classes, columns are predicted classes (label order 0, 1).
print(confusion_matrix(y_test, y_pred))

[[992   3]
 [  1   4]]


- True Positives (TP) = 4 → Correctly predicted as suitable
- True Negatives (TN) = 992 → Correctly predicted as not suitable
- False Positives (FP) = 3 → Predicted suitable when actually not suitable
- False Negatives (FN) = 1 → Predicted not suitable when actually suitable

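- Precision (Class 1) = TP / (TP + FP) = 4 / (4 + 3) ≈ 0.57
- Recall (Class 1) = TP / (TP + FN) = 4 / (4 + 1) = 0.80
- F1 Score (Class 1) = 2 · Precision · Recall / (Precision + Recall) = 2 · 0.57 · 0.80 / (0.57 + 0.80) ≈ 0.67
- Accuracy = (992 + 4) / 1000 = 99.6% (dominated by the majority class: always predicting 0 would already score 995 / 1000 = 99.5%, so the class-1 metrics above are the informative ones, and they rest on only 5 positive test samples)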

### Making Predictions on Unseen Data:

In [31]:
import pandas as pd

# Hand-made observations used as a smoke test of the trained model.
# The comments state what the labelling rule (classify_suitability) would assign.
sample_data = pd.DataFrame([
    {
        # Inside every optimal range -> the rule would label this row 1.
        # NOTE(review): the recorded output shows the model predicting 0 here.
        'Soil_Moisture_%': 30.0,
        'Nitrogen_N_mg/kg': 800,
        'Phosphorus_P_mg/kg': 45,
        'Potassium_K_mg/kg': 160,
        'Temperature_C': 22,
        'Humidity_%': 60,
        'Rainfall_mm': 75
    },
    {
        # Outside the range on every parameter -> rule label 0.
        'Soil_Moisture_%': 18.0,
        'Nitrogen_N_mg/kg': 400,
        'Phosphorus_P_mg/kg': 25,
        'Potassium_K_mg/kg': 100,
        'Temperature_C': 27,
        'Humidity_%': 80,
        'Rainfall_mm': 15
    },
    {
        # Only potassium (140 < 150) is out of range -> rule label 0.
        'Soil_Moisture_%': 24.0,
        'Nitrogen_N_mg/kg': 620,
        'Phosphorus_P_mg/kg': 35,
        'Potassium_K_mg/kg': 140,
        'Temperature_C': 20,
        'Humidity_%': 55,
        'Rainfall_mm': 95
    },
    {
        # Outside the range on every parameter -> rule label 0.
        'Soil_Moisture_%': 10.0,
        'Nitrogen_N_mg/kg': 300,
        'Phosphorus_P_mg/kg': 10,
        'Potassium_K_mg/kg': 90,
        'Temperature_C': 35,
        'Humidity_%': 25,
        'Rainfall_mm': 5
    }
])

# Reorder to match training data. The order is taken from the training frame
# itself rather than retyped, so it cannot drift from what the model was fitted
# on; a missing feature fails here with a KeyError naming the column.
sample_data = sample_data[list(X_train.columns)]

# Predict before the result column is added, so only features reach the model.
predictions = clf.predict(sample_data)
sample_data['Predicted_Wheat_Suitability'] = predictions

In [33]:
# Display the samples together with the model's predicted label.
sample_data

Unnamed: 0,Nitrogen_N_mg/kg,Phosphorus_P_mg/kg,Potassium_K_mg/kg,Temperature_C,Humidity_%,Rainfall_mm,Soil_Moisture_%,Predicted_Wheat_Suitability
0,800,45,160,22,60,75,30.0,0
1,400,25,100,27,80,15,18.0,0
2,620,35,140,20,55,95,24.0,0
3,300,10,90,35,25,5,10.0,0


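- Note: sample 0 lies inside every optimal range used for labelling (the rule would mark it suitable), yet the model predicts 0. Together with the very small number of positive test cases, this suggests the classifier under-detects suitable conditions and should be validated further before it is relied on.

### Saving Model: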

In [34]:
import joblib

# Persist the fitted classifier to disk; joblib.dump returns the list of files written.
# NOTE(review): joblib files are pickle-based — only load this artifact from a trusted
# source, and preferably with the same scikit-learn version used to train it.
joblib.dump(clf, 'wheat_suitability_model.pkl')

['wheat_suitability_model.pkl']