In [1]:
import pandas as pd

# Load Dataset 3
df = pd.read_csv("Dataset3(2)(Sheet1).csv")

print("Shape:", df.shape)
print("\nColumns:", df.columns)
print("\nFirst 5 rows:")
print(df.head())

Shape: (27065, 8)

Columns: Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')

First 5 rows:
       N      P      K  temperature  humidity   ph  rainfall        label
0  200.0   9.00  103.5        28.13     74.94  7.9    773.64  Cauliflower
1  213.0  13.50   81.0        25.72     70.44  7.1    734.94       Potato
2  200.0   9.00  103.5        25.40     51.78  7.9    520.93      Spinach
3  188.0  15.75   76.5        27.36     53.06  7.4    535.04      Brinjal
4  188.0  15.75   76.5        29.06     70.07  7.4    692.30      Cabbage


In [2]:
# Check duplicate feature combinations
duplicate_features = df.duplicated(subset=['N','P','K','temperature','humidity','ph','rainfall'], keep=False)

df[duplicate_features].sort_values(['N','P','K']).head(20)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
17098,,,,,,,,
17099,,,,,,,,
17100,,,,,,,,
17101,,,,,,,,
17102,,,,,,,,
17103,,,,,,,,
17104,,,,,,,,
17105,,,,,,,,
17106,,,,,,,,
17107,,,,,,,,


In [3]:
df.groupby(['N','P','K','temperature','humidity','ph','rainfall'])['label'].nunique().sort_values(ascending=False).head()

N     P     K      temperature  humidity  ph   rainfall
0.00  0.0   0.0    21.74        51.33     0.0  621.39      1
0.96  13.5  166.5  22.06        62.65     7.7  605.88      1
                   23.64        50.04     7.7  600.47      1
                   24.49        72.30     7.7  628.86      1
                   26.12        68.17     7.7  564.70      1
Name: label, dtype: int64

In [4]:
# Clean labels
df['label'] = df['label'].str.lower().str.strip()

print("Unique crops:", df['label'].nunique())
print("\nTop 15 crops:\n")
print(df['label'].value_counts().head(15))

Unique crops: 28

Top 15 crops:

label
maize           3912
mustard         2424
rice            1769
pearl millet    1629
groundnut       1199
pea              914
potato           855
garlic           448
pigeon pea       440
barley           412
sesame           408
chilli           377
black gram       251
tomato           249
cauliflower      230
Name: count, dtype: int64


In [5]:
crop_counts = df['label'].value_counts()

valid_crops = crop_counts[crop_counts >= 500].index

df_clean = df[df['label'].isin(valid_crops)].copy()

print("New shape:", df_clean.shape)
print("Remaining crops:", df_clean['label'].nunique())
print("\nCrop distribution:\n")
print(df_clean['label'].value_counts())

New shape: (12702, 8)
Remaining crops: 7

Crop distribution:

label
maize           3912
mustard         2424
rice            1769
pearl millet    1629
groundnut       1199
pea              914
potato           855
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

X = df_clean.drop('label', axis=1)
y = df_clean['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (10161, 7)
Test shape: (2541, 7)


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling Done ✅")

Scaling Done ✅


In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

rf.fit(X_train_scaled, y_train)

print("Model Training Done ✅")

Model Training Done ✅


In [9]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = rf.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred)
print("Normal Accuracy (Top-1):", acc)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Normal Accuracy (Top-1): 0.2530499803227076

Classification Report:

              precision    recall  f1-score   support

   groundnut       0.06      0.03      0.03       240
       maize       0.34      0.52      0.41       782
     mustard       0.22      0.14      0.17       485
         pea       0.11      0.02      0.04       183
pearl millet       0.17      0.18      0.18       326
      potato       0.10      0.05      0.07       171
        rice       0.20      0.24      0.22       354

    accuracy                           0.25      2541
   macro avg       0.17      0.17      0.16      2541
weighted avg       0.22      0.25      0.22      2541



In [10]:
import numpy as np

# Get probability predictions
probs = rf.predict_proba(X_test_scaled)

# Get top 3 predicted class indices
top3_indices = np.argsort(probs, axis=1)[:, -3:]

# Convert indices to class labels
top3_predictions = []
for row in top3_indices:
    crops = rf.classes_[row]
    top3_predictions.append(crops)

# Calculate Top-3 accuracy
correct = 0

for i in range(len(y_test)):
    if y_test.iloc[i] in top3_predictions[i]:
        correct += 1

top3_accuracy = correct / len(y_test)

print("Top-3 Accuracy:", top3_accuracy)

Top-3 Accuracy: 0.6544667453758363


In [11]:
import pickle
pickle.dump(rf, open("dataset3_model.pkl", "wb"))
pickle.dump(scaler, open("dataset3_scaler.pkl", "wb"))