In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from joblib import dump
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import xgboost as xgb

In [49]:
# Read files
train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')

## EDA

In [50]:
print(train.shape)
print(train.dtypes)

(750000, 10)
id                  int64
Temparature         int64
Humidity            int64
Moisture            int64
Soil Type          object
Crop Type          object
Nitrogen            int64
Potassium           int64
Phosphorous         int64
Fertilizer Name    object
dtype: object


In [51]:
train.head()

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP


In [54]:
train['Fertilizer Name'].value_counts()

Unnamed: 0_level_0,count
Fertilizer Name,Unnamed: 1_level_1
14-35-14,114436
10-26-26,113887
17-17-17,112453
28-28,111158
20-20,110889
DAP,94860
Urea,92317


In [55]:
train.describe()

Unnamed: 0,id,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,374999.5,31.503565,61.038912,45.184147,23.093808,9.478296,21.073227
std,216506.495284,4.025574,6.647695,11.794594,11.216125,5.765622,12.346831
min,0.0,25.0,50.0,25.0,4.0,0.0,0.0
25%,187499.75,28.0,55.0,35.0,13.0,4.0,10.0
50%,374999.5,32.0,61.0,45.0,23.0,9.0,21.0
75%,562499.25,35.0,67.0,55.0,33.0,14.0,32.0
max,749999.0,38.0,72.0,65.0,42.0,19.0,42.0


## Preprocessing

In [57]:
# Preprocess the data
X = train.drop(columns=['id', 'Fertilizer Name'])
y = train['Fertilizer Name']
X_test = test.drop(columns=['id'])

In [60]:
# Encode labels to integers
le = LabelEncoder()
y_enc = le.fit_transform(y)

In [64]:
# One-hot encode categorical variables
X_full = pd.concat([X, X_test], axis=0)
X_full_encoded = pd.get_dummies(X_full)
X_enc = X_full_encoded.iloc[:len(X)]
X_test_enc = X_full_encoded.iloc[len(X):]

In [65]:
# Create folds
X_train, X_val, y_train, y_val = train_test_split(X_enc, y_enc, test_size=0.2, random_state=42)

## Compare models


### XGB Model

In [15]:
xgb_params = {
    'learning_rate': 0.0404,
    'max_depth': 20,
    'min_child_weight': 5.53,
    'gamma': 0.2846,
    'alpha': 2.95,
    'subsample': 0.6,
    'colsample_bytree': 0.42,
    'n_estimators': 893,
    'lambda': 0.0106,
    'verbosity': 0,
    'objective': 'multi:softprob',
    'enable_categorical': False,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'n_jobs': -1,
    'random_state': 42
}

xgb_model = xgb.XGBClassifier(**xgb_params)

xgb_model.fit(X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

[0]	validation_0-mlogloss:2.07589
[100]	validation_0-mlogloss:1.94721
[200]	validation_0-mlogloss:1.93505
[300]	validation_0-mlogloss:1.93700
[400]	validation_0-mlogloss:1.94215
[500]	validation_0-mlogloss:1.94643
[600]	validation_0-mlogloss:1.95171
[700]	validation_0-mlogloss:1.95635
[800]	validation_0-mlogloss:1.96086
[892]	validation_0-mlogloss:1.96488


In [16]:
# Evaluate XGB model
xgb_val_preds = xgb_model.predict(X_val)
xgb_val_probs = xgb_model.predict_proba(X_val)

xgb_acc = accuracy_score(y_val, xgb_val_preds)
print(f"XGB Accuracy: {xgb_acc:.4f}")

XGB Accuracy: 0.1738


## Finetune

In [17]:
# Set best model
best_model = xgb_model

In [67]:
# Create ensemble
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
ensemble_probs = np.zeros((X_test_enc.shape[0], len(le.classes_)))

for fold, (train_index, val_index) in enumerate(skf.split(X_enc, y_enc)):
    print(f"Fold {fold+1}")
    X_train_fold, X_val_fold = X_enc.iloc[train_index], X_enc.iloc[val_index]
    y_train_fold, y_val_fold = y_enc[train_index], y_enc[val_index]

    xgb_model_cv = xgb.XGBClassifier(**xgb_params)

    xgb_model_cv.fit(X_train_fold, y_train_fold,
                     eval_set=[(X_val_fold, y_val_fold)],
                     verbose=100)

    # Predict test set
    fold_test_probs = xgb_model_cv.predict_proba(X_test_enc)
    ensemble_probs += fold_test_probs / skf.n_splits

Fold 1
[0]	validation_0-mlogloss:1.94543
[100]	validation_0-mlogloss:1.92008
[200]	validation_0-mlogloss:1.91092
[300]	validation_0-mlogloss:1.90672
[400]	validation_0-mlogloss:1.90515
[500]	validation_0-mlogloss:1.90473
[600]	validation_0-mlogloss:1.90528
[700]	validation_0-mlogloss:1.90637
[800]	validation_0-mlogloss:1.90785
[892]	validation_0-mlogloss:1.90959
Fold 2
[0]	validation_0-mlogloss:1.94540
[100]	validation_0-mlogloss:1.91982
[200]	validation_0-mlogloss:1.91065
[300]	validation_0-mlogloss:1.90631
[400]	validation_0-mlogloss:1.90474
[500]	validation_0-mlogloss:1.90411
[600]	validation_0-mlogloss:1.90471
[700]	validation_0-mlogloss:1.90589
[800]	validation_0-mlogloss:1.90732
[892]	validation_0-mlogloss:1.90874
Fold 3
[0]	validation_0-mlogloss:1.94540
[100]	validation_0-mlogloss:1.91973
[200]	validation_0-mlogloss:1.91040
[300]	validation_0-mlogloss:1.90569
[400]	validation_0-mlogloss:1.90385
[500]	validation_0-mlogloss:1.90327
[600]	validation_0-mlogloss:1.90381
[700]	validat

## Create Submission

In [68]:
# Get top 3 predicted labels from averaged probabilities
test_top3_idx = ensemble_probs.argsort(axis=1)[:, -3:][:, ::-1]
test_top3_labels = le.inverse_transform(test_top3_idx.ravel()).reshape(test_top3_idx.shape)

In [69]:
# Save test predictions from ensemble
top3_str = [' '.join(row) for row in test_top3_labels]
submission = pd.DataFrame({
    'id': test['id'],
    'Fertilizer Name': top3_str
})
submission.to_csv("submission.csv", index=False)
display(submission.head())

Unnamed: 0,id,Fertilizer Name
0,750000,10-26-26 28-28 DAP
1,750001,17-17-17 Urea 10-26-26
2,750002,20-20 28-28 17-17-17
3,750003,14-35-14 DAP 17-17-17
4,750004,20-20 Urea 10-26-26
