In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from joblib import dump
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import xgboost as xgb

In [56]:
# Read files
train_df = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
original_df = pd.read_csv('/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv')

In [57]:
# Mark categorical columns
for col in test_df.select_dtypes(include=['object']).columns:
    train_df[col] = train_df[col].astype('category')
    original_df[col] = original_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

In [58]:
train_df.pop('id')
train_target = train_df.pop('Fertilizer Name')
train_df.head()

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,37,70,36,Clayey,Sugarcane,36,4,5
1,27,69,65,Sandy,Millets,30,6,18
2,29,63,32,Sandy,Millets,24,12,16
3,35,62,54,Sandy,Barley,39,12,4
4,35,58,43,Red,Paddy,37,2,16


In [59]:
original_target = original_df.pop('Fertilizer Name')
original_df.head()

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,32,51,41,Red,Ground Nuts,7,3,19
1,35,58,35,Black,Cotton,4,14,16
2,27,55,43,Sandy,Sugarcane,28,0,17
3,33,56,56,Loamy,Ground Nuts,37,5,24
4,32,70,60,Red,Ground Nuts,4,6,9


In [60]:
test_df.pop('id')
test_df.head()

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,31,70,52,Sandy,Wheat,34,11,24
1,27,62,45,Red,Sugarcane,30,14,15
2,28,72,28,Clayey,Ground Nuts,14,15,4
3,37,53,57,Black,Ground Nuts,18,17,36
4,31,55,32,Red,Pulses,13,19,14


## Preprocessing

In [61]:
X = pd.concat([train_df, original_df])
X

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,37,70,36,Clayey,Sugarcane,36,4,5
1,27,69,65,Sandy,Millets,30,6,18
2,29,63,32,Sandy,Millets,24,12,16
3,35,62,54,Sandy,Barley,39,12,4
4,35,58,43,Red,Paddy,37,2,16
...,...,...,...,...,...,...,...,...
99995,32,71,61,Black,Tobacco,23,1,25
99996,35,72,47,Loamy,Millets,38,1,32
99997,28,50,61,Sandy,Maize,10,11,14
99998,29,57,63,Loamy,Ground Nuts,7,10,4


In [62]:
y = pd.concat([train_target, original_target])
y

0           28-28
1           28-28
2        17-17-17
3        10-26-26
4             DAP
           ...   
99995       20-20
99996    17-17-17
99997    14-35-14
99998         DAP
99999    17-17-17
Name: Fertilizer Name, Length: 850000, dtype: object

In [63]:
X_test = test_df

In [64]:
# Encode target labels
le = LabelEncoder()
y = le.fit_transform(y)

## Finetune

In [65]:
params = {
        'objective': 'multi:softprob',
        'num_class': 7,
        'max_depth': 16,
        'learning_rate': 0.01,
        'n_estimators': 100_000,
        'reg_alpha': 3,
        'reg_lambda': 1.4,
        'gamma': 0.26,
        'max_delta_step': 5,
        'subsample': 0.86,
        'colsample_bytree': 0.4,
        'min_child_weight': 5,
        'random_state': 42,
        'n_jobs': -1,
        'eval_metric': 'mlogloss',
        'enable_categorical': True,
        'device': "cuda"   
}

In [66]:
# Create ensemble
folds = 5
skf = StratifiedKFold(n_splits= folds, shuffle=True, random_state=42)

test_probs = np.zeros((len(X_test), len(le.classes_)))

In [None]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    
    print(f"Fold {fold+1}:")
    
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold, enable_categorical=True)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold, enable_categorical=True)
    dtest = xgb.DMatrix(X_test, enable_categorical=True)

    xgb_model_fold = xgb.train(
        params, 
        dtrain, 
        num_boost_round=100, 
        evals=[(dtrain, 'train'), (dval, 'validation')], 
        early_stopping_rounds=30, 
        verbose_eval=100
    )

    # Predict test set
    test_probs_fold = xgb_model_fold.predict(dtest)
    test_probs += test_probs_fold / skf.n_splits

Fold 1:


Parameters: { "enable_categorical", "n_estimators" } are not used.



[0]	train-mlogloss:1.94569	validation-mlogloss:1.94579


## Create Submission

In [None]:
# Get top 3 predicted labels from averaged probabilities
test_top3_idx = test_probs.argsort(axis=1)[:, -3:][:, ::-1]
test_top3_labels = le.inverse_transform(test_top3_idx.ravel()).reshape(test_top3_idx.shape)

In [None]:
# Save test predictions from ensemble
top3_str = [' '.join(row) for row in test_top3_labels]
submission = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': top3_str
})
submission.to_csv("submission.csv", index=False)
display(submission.head())