In [1]:
import pandas as pd

# Load the three crop datasets from CSV files (paths are relative to the
# notebook's working directory) and bind them to df1, df2 and df4.
df1, df2, df4 = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset1.csv", "Dataset2.csv", "Dataset4.csv")
)

# Report (rows, columns) for each frame as a quick sanity check on the load.
for caption, frame in (
    ("Dataset1 Shape:", df1),
    ("Dataset2 Shape:", df2),
    ("Dataset4 Shape:", df4),
):
    print(caption, frame.shape)

Dataset1 Shape: (2200, 8)
Dataset2 Shape: (100000, 8)
Dataset4 Shape: (20000, 8)


In [2]:
# Show the column index of every frame so their schemas can be compared.
for caption, frame in (
    ("DF1 Columns:", df1),
    ("DF2 Columns:", df2),
    ("DF4 Columns:", df4),
):
    print(caption, frame.columns)

# Peek at the first five values of the last column of each frame to see
# how the labels are spelled and cased.
print("\nSample Labels:")
for caption, frame in (
    ("DF1:", df1),
    ("DF2:", df2),
    ("DF4:", df4),
):
    print(caption, frame.iloc[:5, -1])

DF1 Columns: Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')
DF2 Columns: Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')
DF4 Columns: Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')

Sample Labels:
DF1: 0    rice
1    rice
2    rice
3    rice
4    rice
Name: label, dtype: object
DF2: 0    Ground Nuts
1         Cotton
2      Sugarcane
3    Ground Nuts
4    Ground Nuts
Name: label, dtype: object
DF4: 0        Wheat
1       Tomato
2    Sugarcane
3    Sugarcane
4        Maize
Name: label, dtype: object


In [3]:
# Normalize the label text in place on each source frame: lower-case it,
# then trim leading/trailing whitespace, so equal crops compare equal.
for df in (df1, df2, df4):
    lowered = df['label'].str.lower()
    df['label'] = lowered.str.strip()

In [4]:
# Number of distinct crop labels in each individual dataset.
for caption, frame in (
    ("DF1 unique crops:", df1),
    ("DF2 unique crops:", df2),
    ("DF4 unique crops:", df4),
):
    print(caption, frame['label'].nunique())

DF1 unique crops: 22
DF2 unique crops: 11
DF4 unique crops: 6


In [5]:
# Temporary merge for analysis: stack the three frames row-wise.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source frame's own row labels (each starting at 0) are carried over, so the
# merged frame would contain repeated index labels and label-based lookups
# would be ambiguous.
temp_df = pd.concat([df1, df2, df4], axis=0, ignore_index=True)

# Row/column count and number of distinct labels across all sources.
print("Total rows after merge:", temp_df.shape)
print("Total unique crops:", temp_df['label'].nunique())

Total rows after merge: (122200, 8)
Total unique crops: 33


In [6]:
# Rank the crop labels of the merged frame by frequency and display the
# 20 most common ones (the bare last expression is the cell's output).
label_frequencies = temp_df['label'].value_counts()
label_frequencies.head(20)

label
sugarcane      12551
maize          12465
wheat          12302
cotton          9337
tobacco         9224
millets         9154
paddy           9103
oil seeds       9096
pulses          9072
barley          9041
ground nuts     8881
rice            3368
potato          3362
tomato          3344
chickpea         100
kidneybeans      100
pigeonpeas       100
orange           100
apple            100
muskmelon        100
Name: count, dtype: int64

In [7]:
# Rename spelling variants of crop labels to a single-token form.
# Labels that are not keys of the mapping are left unchanged.
# NOTE(review): in the recorded run the unique-crop count is 33 both before
# and after this step, so these renames did not merge any classes; it may be
# worth confirming that every key here actually occurs in the data.
label_fixes = {
    'ground nuts': 'groundnut',
    'pigeon pea': 'pigeonpea',
    'pearl millet': 'pearlmillet',
    'oil seeds': 'oilseeds'
}
renamed_labels = temp_df['label'].replace(label_fixes)
temp_df['label'] = renamed_labels

print("Unique crops after cleaning:", temp_df['label'].nunique())

Unique crops after cleaning: 33


In [8]:
# Display the 20 most frequent crop labels of the merged frame again, to
# inspect the label spellings at this point in the notebook.
label_frequencies = temp_df['label'].value_counts()
label_frequencies.head(20)

label
sugarcane      12551
maize          12465
wheat          12302
cotton          9337
tobacco         9224
millets         9154
paddy           9103
oilseeds        9096
pulses          9072
barley          9041
groundnut       8881
rice            3368
potato          3362
tomato          3344
chickpea         100
kidneybeans      100
pigeonpeas       100
orange           100
apple            100
muskmelon        100
Name: count, dtype: int64

In [9]:
# Rows available per crop label in the merged frame.
crop_counts = temp_df['label'].value_counts()

# Only crops with at least this many rows are kept.
min_rows_per_crop = 1000
valid_crops = crop_counts.loc[crop_counts >= min_rows_per_crop].index

# Boolean mask of rows whose label is one of the retained crops.
keep_row = temp_df['label'].isin(valid_crops)
filtered_df = temp_df.loc[keep_row]

# Summary of what survived the filter; tail() shows the smallest classes.
print("New shape:", filtered_df.shape)
print("Remaining crops:", filtered_df['label'].nunique())
print(filtered_df['label'].value_counts().tail())

New shape: (120300, 8)
Remaining crops: 14
label
barley       9041
groundnut    8881
rice         3368
potato       3362
tomato       3344
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

# Target is the crop label; features are every other column.
y = filtered_df['label']
X = filtered_df.drop(columns='label')

# 80/20 split, stratified on the label so both partitions keep the same
# class proportions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (96240, 7)
Test shape: (24060, 7)


In [11]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, so that
# nothing about the test partition influences the fitted scaler.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling Done ✅")

Scaling Done ✅


In [12]:
from sklearn.ensemble import RandomForestClassifier
# Random forest configuration: 80 trees, depth capped at 12, fixed seed,
# and class weights set to 'balanced'.
rf_settings = dict(
    n_estimators=80,
    max_depth=12,
    random_state=42,
    class_weight='balanced',
)
rf = RandomForestClassifier(**rf_settings)

# Train on the scaled training features.
rf.fit(X_train_scaled, y_train)

import pickle

# Persist the fitted model and the fitted scaler as separate pickle files;
# both are needed to score new data the same way as in this notebook.
for artifact_path, artifact in (
    ("multidataset_model.pkl", rf),
    ("multidataset_scaler.pkl", scaler),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("Model saved successfully!")

Model saved successfully!


In [13]:
#Evaluation
from sklearn.metrics import accuracy_score, classification_report

# Predict a crop label for every row of the scaled test partition.
y_pred = rf.predict(X_test_scaled)

# Overall fraction of test rows whose predicted label equals the true one.
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Per-class precision / recall / F1 and support.
# zero_division=0 makes the handling of undefined metrics explicit: a class
# that is never predicted has no defined precision, and the default setting
# reports 0 for it while also raising a warning. The reported numbers are the
# same; only the warning is avoided.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.4160847880299252

Classification Report:

              precision    recall  f1-score   support

      barley       0.00      0.00      0.00      1808
      cotton       0.69      0.87      0.77      1867
   groundnut       0.50      0.05      0.08      1776
       maize       0.27      0.04      0.06      2493
     millets       0.33      1.00      0.49      1831
    oilseeds       0.34      0.53      0.42      1819
       paddy       0.63      1.00      0.77      1821
      potato       0.16      0.31      0.21       673
      pulses       0.27      0.02      0.03      1814
        rice       0.16      0.32      0.21       674
   sugarcane       1.00      0.43      0.61      2510
     tobacco       0.38      1.00      0.55      1845
      tomato       0.18      0.35      0.23       669
       wheat       0.00      0.00      0.00      2460

    accuracy                           0.42     24060
   macro avg       0.35      0.42      0.32     24060
weighted avg       0.39   

In [14]:
import numpy as np

# Class-probability estimates for every test row (one column per class,
# in the order given by rf.classes_).
probs = rf.predict_proba(X_test_scaled)

# Column indices of the three highest-probability classes per row.
# argsort is ascending, so the last three columns are the top three
# (ordered from third-best to best).
top3_indices = np.argsort(probs, axis=1)[:, -3:]

# Map the column indices to class labels in one indexing step; the result
# has one row of three labels per test sample.
top3_predictions = rf.classes_[top3_indices]

# A row is a hit when its true label appears among its three candidates.
# The true labels are reshaped to a column so the comparison broadcasts
# across the three candidate columns, replacing the per-row Python loop.
true_labels = np.asarray(y_test)[:, None]
top3_hits = (top3_predictions == true_labels).any(axis=1)

# Top-3 accuracy = hits / number of test rows.
correct = int(top3_hits.sum())
top3_accuracy = correct / len(y_test)

print("Top-3 Accuracy:", top3_accuracy)


Top-3 Accuracy: 0.7534081463009143
