In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import top_k_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin



In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [None]:
test

In [None]:
#Data Visualization

In [None]:
#Temperature Distribution
sns.histplot(train["Temparature"], kde=True, color="skyblue")
plt.title("Temperature Distribution")
plt.xlabel("Temperature")
plt.ylabel("Count")
#plt.savefig("Temperature Distribution.png")
plt.show()

In [None]:
#Boxplot of Moisture by soil type
sns.boxplot(x='Soil Type', y='Moisture', data=train, palette='pastel')
plt.title('Moisture by Soil Type')
plt.xlabel('Soil Type')
plt.ylabel('Moisture')
#plt.savefig("Moisture by Soil Type.png")
plt.show()

In [None]:
#Average Nitrogen
sns.barplot(x='Crop Type', y='Nitrogen', data=train, estimator=np.mean, palette='Set2')
plt.title('Average Nitrogen by Crop Type')
plt.xlabel('Crop Type')
plt.ylabel('Average Nitrogen')
#plt.savefig("Average Nitrogen by Crop Type.png")
plt.show()

In [None]:
#Count Plot of Fertilizer Name
sns.countplot(x='Fertilizer Name', data=train, palette='Set3')
plt.title('Fertilizer Frequency')
plt.xlabel('Fertilizer Name')
plt.ylabel('Count')
#plt.savefig("Countplot of Fertilizer Name.png")
plt.show()

In [None]:
#Temperature vs Humidity by Crop Type
sns.scatterplot(x='Temparature', y='Humidity', hue='Crop Type', data=train, palette='Dark2')
plt.title('Temperature vs Humidity by Crop Type')
plt.xlabel('Temperature')
plt.ylabel('Humidity')
plt.legend(title='Crop Type')
#plt.savefig("Temperature vs Humidity by Crop Type.png")
plt.show()

In [None]:
#Correlation of all training features
corr = train[['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
#plt.savefig("Feature Correlation Heatmap.png")
plt.show()

In [4]:
#lets Do Feature Engineering for train and testing
train = train.rename(columns={'Temparature': 'Temperature'})
test = test.rename(columns={'Temparature': 'Temperature'})

# Save target
y_train = train['Fertilizer Name']
train = train.drop(columns=['Fertilizer Name'])

# Combine for consistent processing
combined = pd.concat([train, test], axis=0)

In [5]:
combined['Temp_Humid'] = combined['Temperature'] * combined['Humidity']
combined['Moisture_Nitrogen'] = combined['Moisture'] * combined['Nitrogen']
combined['NPK_Sum'] = combined['Nitrogen'] + combined['Phosphorous'] + combined['Potassium']
combined['NPK_Mean'] = combined[['Nitrogen', 'Phosphorous', 'Potassium']].mean(axis=1)

combined['N_P_Ratio'] = combined['Nitrogen'] / (combined['Phosphorous'] + 1e-5)
combined['K_P_Ratio'] = combined['Potassium'] / (combined['Phosphorous'] + 1e-5)

combined['Temperature_sq'] = combined['Temperature']**2
combined['Moisture_cu'] = combined['Moisture']**3

combined['Temp_Level'] = pd.cut(combined['Temperature'], bins=[0, 15, 25, 40], labels=['Low', 'Medium', 'High'])
combined['Moisture_Level'] = pd.cut(combined['Moisture'], bins=3, labels=['Dry', 'Normal', 'Wet'])

combined = pd.get_dummies(combined, columns=['Soil Type', 'Crop Type', 'Temp_Level', 'Moisture_Level'], drop_first=True)

In [6]:
X_train = combined.iloc[:len(train)]
X_test = combined.iloc[len(train):]
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val_split)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Logistic Regression (L2 Regularization)
log_reg = LogisticRegression(max_iter=1000, solver='lbfgs', C=1.0)

# Random Forest with tuned hyperparameters
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

# CatBoost Classifier (silent=True to suppress logs)
cat_clf = CatBoostClassifier(
    iterations=200,
    depth=6,
    learning_rate=0.1,
    verbose=False,
    random_state=42
)

In [8]:
#Picking the best model
log_score = cross_val_score(log_reg, X_train_scaled, y_train_split, cv=5, scoring='accuracy')
rf_score = cross_val_score(rf_clf, X_train_scaled, y_train_split, cv=5, scoring='accuracy')
cat_score = cross_val_score(cat_clf, X_train_scaled, y_train_split, cv=5, scoring='accuracy')

# Mean scores
print("Logistic Regression Mean Accuracy:", np.mean(log_score))
print("Random Forest Mean Accuracy:", np.mean(rf_score))
print("CatBoost Mean Accuracy:", np.mean(cat_score))

Logistic Regression Mean Accuracy: 0.16012833333333332
Random Forest Mean Accuracy: 0.174885
CatBoost Mean Accuracy: 0.17734666666666668


In [9]:
combined.columns

Index(['id', 'Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium',
       'Phosphorous', 'Temp_Humid', 'Moisture_Nitrogen', 'NPK_Sum', 'NPK_Mean',
       'N_P_Ratio', 'K_P_Ratio', 'Temperature_sq', 'Moisture_cu',
       'Soil Type_Clayey', 'Soil Type_Loamy', 'Soil Type_Red',
       'Soil Type_Sandy', 'Crop Type_Cotton', 'Crop Type_Ground Nuts',
       'Crop Type_Maize', 'Crop Type_Millets', 'Crop Type_Oil seeds',
       'Crop Type_Paddy', 'Crop Type_Pulses', 'Crop Type_Sugarcane',
       'Crop Type_Tobacco', 'Crop Type_Wheat', 'Temp_Level_Medium',
       'Temp_Level_High', 'Moisture_Level_Normal', 'Moisture_Level_Wet'],
      dtype='object')

In [13]:
#Now we make the pipeline lols
# Custom transformer to drop 'id' column
class DropID(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X): return X.drop(columns=['id'])

numeric_features = [
    'Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous',
    'Temp_Humid', 'Moisture_Nitrogen', 'NPK_Sum', 'NPK_Mean', 'N_P_Ratio', 'K_P_Ratio',
    'Temperature_sq', 'Moisture_cu'
]

categorical_features = [col for col in [
    'Soil Type_Clayey', 'Soil Type_Loamy', 'Soil Type_Red', 'Soil Type_Sandy',
    'Crop Type_Cotton', 'Crop Type_Ground Nuts', 'Crop Type_Maize', 'Crop Type_Millets',
    'Crop Type_Oil seeds', 'Crop Type_Paddy', 'Crop Type_Pulses', 'Crop Type_Sugarcane',
    'Crop Type_Tobacco', 'Crop Type_Wheat',
    'Temp_Level_Medium', 'Temp_Level_High',
    'Moisture_Level_Normal', 'Moisture_Level_Wet'
] if col not in numeric_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

pipeline = Pipeline(steps=[
    ('drop_id', DropID()),
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(verbose=0, random_state=42))
])
pipeline.fit(X_train_split, y_train_split)

In [15]:
#To make sure our model is trained more with no bias and leakage
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y_train)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = []
val_preds = np.zeros((len(X_train), len(np.unique(y_encoded))))  # one row per sample, one col per class

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_encoded), 1):
    print(f"\n🔁 Training Fold {fold}...")

    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_encoded[train_idx], y_encoded[val_idx]

    # Clone a fresh pipeline (to avoid leakage)
    pipeline_fold = clone(pipeline)
    pipeline_fold.fit(X_train_fold, y_train_fold)

    print(f"Fold {fold} training complete.")
    models.append(pipeline_fold)

    # Predict probabilities for validation set
    val_preds[val_idx] = pipeline_fold.predict_proba(X_val_fold)



🔁 Training Fold 1...
Fold 1 training complete.

🔁 Training Fold 2...
Fold 2 training complete.

🔁 Training Fold 3...
Fold 3 training complete.

🔁 Training Fold 4...
Fold 4 training complete.

🔁 Training Fold 5...
Fold 5 training complete.


In [16]:
def mapk(true_labels, pred_labels, k=3):
    total_score = 0.0
    for true, preds in zip(true_labels, pred_labels):
        try:
            rank = preds.index(true) + 1
            total_score += 1.0 / rank
        except ValueError:
            pass
    return total_score / len(true_labels) #The Competition Metric

In [17]:
top3_preds = np.argsort(val_preds, axis=1)[:, -3:][:, ::-1]

# Top-3 accuracy
top3_acc = top_k_accuracy_score(y_encoded, val_preds, k=3)
map3 = mapk(y_encoded.tolist(), top3_preds.tolist(), k=3)

# Print metrics
print(f"OOF Top-3 Accuracy: {top3_acc:.4f}")
print(f"OOF MAP@3: {map3:.4f}")

OOF Top-3 Accuracy: 0.5081
OOF MAP@3: 0.3234


In [18]:
probs = np.array([model.predict_proba(X_test) for model in models])
avg_probs = np.mean(probs, axis=0)
top3_preds = np.argsort(avg_probs, axis=1)[:, -3:][:, ::-1]

top3_labels = le.inverse_transform(top3_preds.ravel()).reshape(top3_preds.shape)
fertilizer_preds = [' '.join(row) for row in top3_labels]
print(fertilizer_preds[:3])

['DAP 28-28 10-26-26', '17-17-17 10-26-26 20-20', '20-20 10-26-26 14-35-14']


In [19]:
submission = pd.DataFrame({
    'id': test['id'],
    'Fertilizer Name': fertilizer_preds
})

submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 28-28 10-26-26
1,750001,17-17-17 10-26-26 20-20
2,750002,20-20 10-26-26 14-35-14
3,750003,14-35-14 17-17-17 DAP
4,750004,20-20 10-26-26 17-17-17
