In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from joblib import dump
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import xgboost as xgb

In [36]:
# Input CSV locations (paths as mounted under /kaggle/input)
TRAIN_CSV = '/kaggle/input/playground-series-s5e6/train.csv'
TEST_CSV = '/kaggle/input/playground-series-s5e6/test.csv'
ORIGINAL_CSV = '/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv'

# Load each file into its own DataFrame, in the order train -> test -> original
train_df, test_df, original_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV, ORIGINAL_CSV)
)

In [37]:
# Remove the row identifier in place; it is dropped before the feature matrix is built
del train_df['id']

# Detach the label column: keep it as the target Series, remove it from the features
train_target = train_df['Fertilizer Name']
del train_df['Fertilizer Name']

# Preview the remaining feature columns
train_df.head(5)

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,37,70,36,Clayey,Sugarcane,36,4,5
1,27,69,65,Sandy,Millets,30,6,18
2,29,63,32,Sandy,Millets,24,12,16
3,35,62,54,Sandy,Barley,39,12,4
4,35,58,43,Red,Paddy,37,2,16


In [38]:
# Detach the label column of the original dataset: keep it as a Series and
# remove it from the feature frame in place
original_target = original_df['Fertilizer Name']
del original_df['Fertilizer Name']

# Preview the remaining feature columns
original_df.head(5)

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,32,51,41,Red,Ground Nuts,7,3,19
1,35,58,35,Black,Cotton,4,14,16
2,27,55,43,Sandy,Sugarcane,28,0,17
3,33,56,56,Loamy,Ground Nuts,37,5,24
4,32,70,60,Red,Ground Nuts,4,6,9


In [39]:
# Remove the identifier column in place. The removed values are not kept in
# any variable here, so test_df no longer carries an 'id' column afterwards.
del test_df['id']

# Preview the test features
test_df.head(5)

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,31,70,52,Sandy,Wheat,34,11,24
1,27,62,45,Red,Sugarcane,30,14,15
2,28,72,28,Clayey,Ground Nuts,14,15,4
3,37,53,57,Black,Ground Nuts,18,17,36
4,31,55,32,Red,Pulses,13,19,14


## Preprocessing

In [40]:
# Stack the competition features on top of the original-dataset features.
# Row labels are kept as-is (no ignore_index), so index values repeat.
X = pd.concat((train_df, original_df), axis=0)
X

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,37,70,36,Clayey,Sugarcane,36,4,5
1,27,69,65,Sandy,Millets,30,6,18
2,29,63,32,Sandy,Millets,24,12,16
3,35,62,54,Sandy,Barley,39,12,4
4,35,58,43,Red,Paddy,37,2,16
...,...,...,...,...,...,...,...,...
99995,32,71,61,Black,Tobacco,23,1,25
99996,35,72,47,Loamy,Millets,38,1,32
99997,28,50,61,Sandy,Maize,10,11,14
99998,29,57,63,Loamy,Ground Nuts,7,10,4


In [41]:
# Stack the targets in the same order as the feature frames (train first,
# then original) so rows of X and y stay aligned by position
y = pd.concat((train_target, original_target), axis=0)
y

0           28-28
1           28-28
2        17-17-17
3        10-26-26
4             DAP
           ...   
99995       20-20
99996    17-17-17
99997    14-35-14
99998         DAP
99999    17-17-17
Name: Fertilizer Name, Length: 850000, dtype: object

In [42]:
# Alias only (no copy): X_test and test_df refer to the same DataFrame object
X_test = test_df

In [43]:
# Encode the fertilizer names as integer class ids.
# The fitted encoder `le` is kept so predictions can be decoded back to names.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [44]:
# One-hot encode the categorical columns on train and test together, so both
# parts end up with exactly the same dummy columns
n_train_rows = len(X)
X_full = pd.concat((X, X_test), axis=0)
X_full_encoded = pd.get_dummies(X_full)

# Cut the encoded frame back apart at the original train/test boundary
X = X_full_encoded.iloc[:n_train_rows]
X_test = X_full_encoded.iloc[n_train_rows:]

In [45]:
# Single hold-out split: 80% for training, 20% for validation (fixed seed)
holdout_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = holdout_parts

## Compare models


### XGB Model

In [None]:
# Hyperparameters passed to every xgb.XGBClassifier(**xgb_params) in this notebook.
xgb_params = {
    # --- boosting schedule ---
    'learning_rate': 0.0404,
    # Upper bound on boosting rounds; early stopping normally ends training sooner
    'n_estimators': 10000,
    # Stop once the eval_set metric has not improved for this many rounds.
    # Without it all 10000 rounds are always trained, even after the
    # validation loss has plateaued. Requires an eval_set in fit().
    'early_stopping_rounds': 100,

    # --- tree complexity / regularisation ---
    'max_depth': 20,
    'min_child_weight': 5.53,
    'gamma': 0.2846,
    'alpha': 2.95,
    'lambda': 0.0106,

    # --- row / column subsampling ---
    'subsample': 0.6,
    'colsample_bytree': 0.42,

    # --- task ---
    'objective': 'multi:softprob',
    'enable_categorical': False,

    # --- execution ---
    # 'gpu_hist' / 'predictor' are deprecated since XGBoost 2.0; the GPU is
    # selected with tree_method='hist' plus device='cuda' instead.
    'tree_method': 'hist',
    'device': 'cuda',
    'n_jobs': -1,
    'random_state': 42,
    'verbosity': 0
}

In [35]:
# Disabled: single fit on the hold-out split (X_train / X_val).
# The log kept under this cell ends in a KeyboardInterrupt, i.e. the last
# recorded run of this fit was interrupted before it finished.
#xgb_model = xgb.XGBClassifier(**xgb_params)

#xgb_model.fit(X_train, y_train,
#    eval_set=[(X_val, y_val)],
#    verbose=100
#)

[0]	validation_0-mlogloss:1.94545
[100]	validation_0-mlogloss:1.91792
[200]	validation_0-mlogloss:1.90602
[300]	validation_0-mlogloss:1.89958
[400]	validation_0-mlogloss:1.89615


KeyboardInterrupt: 

In [34]:
# Evaluate XGB model
# Disabled: depends on `xgb_model`, whose only assignment in this notebook is
# itself commented out, so this cannot run unless that fit is re-enabled.
#xgb_val_preds = xgb_model.predict(X_val)
#xgb_val_probs = xgb_model.predict_proba(X_val)

#xgb_acc = accuracy_score(y_val, xgb_val_preds)
#print(f"XGB Accuracy: {xgb_acc:.4f}")

XGB Accuracy: 0.2232


## Train the cross-validated ensemble

In [46]:
# Cross-validation setup for the fold ensemble: stratified, shuffled, fixed seed
folds = 5
skf = StratifiedKFold(n_splits=folds, random_state=42, shuffle=True)

# Accumulator for the class probabilities on the test set:
# one row per test sample, one column per encoded class
n_classes = len(le.classes_)
#val_probs = np.zeros((len(train_df), len(le.classes_)))
test_probs = np.zeros(shape=(len(X_test), n_classes))

In [None]:
# Train one XGBoost model per stratified fold and average their test-set
# class probabilities into `test_probs`.
#
# The accumulator is reset here, in the same cell as the `+=` below, so that
# re-running this cell (for example after an interrupted run) starts from
# zero instead of adding on top of a previous partial sum.
test_probs = np.zeros((len(X_test), len(le.classes_)))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1}:")

    # X is a DataFrame (positional selection via .iloc); y is the array of
    # encoded labels, indexed directly with the fold's row positions
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    # Fresh model for every fold, all sharing the same hyperparameters
    xgb_model_fold = xgb.XGBClassifier(**xgb_params)

    # The held-out part of the fold is passed as eval_set; its metric is
    # logged every 100 rounds
    xgb_model_fold.fit(X_train_fold, y_train_fold,
                     eval_set=[(X_val_fold, y_val_fold)],
                     verbose=100)

    # Predict test set: each fold contributes 1/n_splits of the final average
    test_probs_fold = xgb_model_fold.predict_proba(X_test)
    test_probs += test_probs_fold / skf.n_splits

Fold 1:
[0]	validation_0-mlogloss:1.94546
[100]	validation_0-mlogloss:1.91773
[200]	validation_0-mlogloss:1.90560
[300]	validation_0-mlogloss:1.89914
[400]	validation_0-mlogloss:1.89569
[500]	validation_0-mlogloss:1.89398
[600]	validation_0-mlogloss:1.89320


## Create Submission

In [68]:
# Pick the three most probable classes per test row, best first.
# argsort is ascending, so walking the last three columns backwards gives
# the indices in descending order of probability.
test_top3_idx = np.argsort(test_probs, axis=1)[:, :-4:-1]

# Decode the class ids back to fertilizer names, preserving the (rows, 3) layout
flat_top3_labels = le.inverse_transform(test_top3_idx.reshape(-1))
test_top3_labels = flat_top3_labels.reshape(test_top3_idx.shape)

In [69]:
# Save test predictions from ensemble
# One space-separated string holding the three predicted labels per test row
top3_str = [' '.join(row) for row in test_top3_labels]

# The 'id' column is popped from test_df earlier in the notebook, so
# test_df['id'] raises KeyError on a fresh Restart & Run All. Use the column
# if it is still present; otherwise re-read only the ids from the test CSV.
if 'id' in test_df.columns:
    submission_ids = test_df['id']
else:
    submission_ids = pd.read_csv(
        '/kaggle/input/playground-series-s5e6/test.csv', usecols=['id']
    )['id']

# Fail loudly rather than write a misaligned submission
assert len(submission_ids) == len(top3_str), "id / prediction row count mismatch"

submission = pd.DataFrame({
    'id': submission_ids,
    'Fertilizer Name': top3_str
})
submission.to_csv("submission.csv", index=False)
display(submission.head())

Unnamed: 0,id,Fertilizer Name
0,750000,10-26-26 28-28 DAP
1,750001,17-17-17 Urea 10-26-26
2,750002,20-20 28-28 17-17-17
3,750003,14-35-14 DAP 17-17-17
4,750004,20-20 Urea 10-26-26
