In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the competition's training and test tables from the Kaggle input mount.
train_data, test_data = (
    pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv"),
    pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv"),
)

In [3]:
# Keep independent snapshots of the raw frames. Plain assignment would only
# bind new names to the same objects, so the later in-place edits to
# train_data / test_data would silently change the "copies" as well.
train_copy, test_copy = train_data.copy(), test_data.copy()

In [4]:
# Peek at five randomly chosen training rows.
train_data.sample(n=5)

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
250413,250413,32,66,52,Loamy,Millets,42,3,4,10-26-26
690756,690756,27,60,47,Black,Tobacco,15,13,16,14-35-14
74795,74795,29,56,26,Clayey,Barley,6,9,20,20-20
58127,58127,26,55,53,Red,Paddy,36,14,42,28-28
598890,598890,36,71,55,Black,Tobacco,26,9,42,17-17-17


In [5]:
# Peek at five randomly chosen test rows (no target column here).
test_data.sample(n=5)

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
231804,981804,37,72,28,Sandy,Paddy,16,1,28
9737,759737,25,57,44,Sandy,Wheat,29,18,29
11016,761016,30,69,31,Loamy,Maize,23,4,31
211512,961512,28,53,27,Loamy,Paddy,29,12,23
23632,773632,37,60,51,Loamy,Paddy,6,7,37


In [6]:
# (rows, columns) of the training table.
train_data.shape

(750000, 10)

In [7]:
# (rows, columns) of the test table; it has one column fewer than train (no target).
test_data.shape

(250000, 9)

In [8]:
# Count missing values per column in the training table.
train_data.isna().sum()

id                 0
Temparature        0
Humidity           0
Moisture           0
Soil Type          0
Crop Type          0
Nitrogen           0
Potassium          0
Phosphorous        0
Fertilizer Name    0
dtype: int64

In [9]:
# Count missing values per column in the test table.
test_data.isna().sum()

id             0
Temparature    0
Humidity       0
Moisture       0
Soil Type      0
Crop Type      0
Nitrogen       0
Potassium      0
Phosphorous    0
dtype: int64

In [10]:
# Print per-column dtype and non-null counts plus the frame's memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


In [11]:
# String-valued columns (dtype `object` in the frame).
categorical_features = [
    'Soil Type',
    'Crop Type',
]

# Integer-valued columns. 'Temparature' is spelled exactly as in the dataset header.
numerical_features = [
    'Temparature',
    'Humidity',
    'Moisture',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
]

In [12]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
# Encoder for the target labels; it is fitted further down, on the target column.
le = LabelEncoder()

# "id" is a row identifier, not a feature. Dropping out-of-place with
# errors="ignore" keeps this cell safe to re-run (an in-place drop raises
# KeyError the second time) and leaves any other reference to the original
# frame untouched.
train_data = train_data.drop(columns="id", errors="ignore")
test_data = test_data.drop(columns="id", errors="ignore")

In [13]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Encode the fertilizer names as integer class codes in a separate Series
# rather than overwriting the column inside train_data. Overwriting made the
# cell non-idempotent: on a second run the encoder would be fitted on the
# integer codes, so le.inverse_transform could no longer return the names.
y = pd.Series(
    le.fit_transform(train_data["Fertilizer Name"]),
    index=train_data.index,
    name="Fertilizer Name",
)

# Feature matrix: everything except the target column.
X = train_data.drop(columns=["Fertilizer Name"])

# Column-wise preprocessing: standardise the numeric columns and map each
# categorical column to ordinal codes. The output column order is
# numerical_features followed by categorical_features.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OrdinalEncoder(), categorical_features)
    ]
)

In [14]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import time

In [15]:
# Stratified K-fold training of an XGBoost classifier.
#
# Produces:
#   oof_preds        - out-of-fold predicted class code for every training row
#   oof_proba        - out-of-fold class probabilities for every training row
#   test_preds_proba - test-set class probabilities averaged over the folds
FOLDS = 5
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

n_classes = len(np.unique(y))

# Hyperparameters are the same for every fold, so they are defined once
# outside the loop.
# NOTE(review): the preprocessor hands XGBoost a plain numeric array, so
# enable_categorical presumably has no effect here - confirm.
xgb_params = dict(
    max_depth=12,
    colsample_bytree=0.467,
    subsample=0.86,
    n_estimators=4000,
    learning_rate=0.03,
    gamma=0.26,
    max_delta_step=4,
    reg_alpha=2.7,
    reg_lambda=1.4,
    early_stopping_rounds=100,
    objective='multi:softprob',
    random_state=13,
    enable_categorical=True,
    tree_method='hist',
    device='cuda',
)

oof_preds = np.zeros(len(X), dtype=int)
oof_proba = np.zeros((len(X), n_classes))
test_preds_proba = np.zeros((len(test_data), n_classes))

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), 1):
    print(f"\nFold {fold}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The preprocessor is refitted on each fold's training part only, so the
    # validation and test data must be re-transformed inside the loop.
    X_train_scaled = preprocessor.fit_transform(X_train)
    X_val_scaled = preprocessor.transform(X_val)
    test_scaled = preprocessor.transform(test_data)

    model = XGBClassifier(**xgb_params)
    start = time.time()

    # The validation fold doubles as the early-stopping set.
    model.fit(
        X_train_scaled, y_train,
        eval_set=[(X_val_scaled, y_val)],
        verbose=100
    )

    # One inference pass on the validation fold: keep the probabilities for
    # the ranking metric and take the arg-max as the hard prediction. The
    # probability column index is used as the class code, the same assumption
    # the submission step makes when it feeds argsort indices to
    # le.inverse_transform.
    val_proba = model.predict_proba(X_val_scaled)
    val_preds = val_proba.argmax(axis=1)
    oof_proba[val_idx] = val_proba
    oof_preds[val_idx] = val_preds

    test_preds_proba += model.predict_proba(test_scaled)

    acc = accuracy_score(y_val, val_preds)
    print(f"Fold {fold} Accuracy: {acc:.4f}")
    print(f"Time: {time.time() - start:.1f} sec")

# Average the accumulated per-fold test probabilities.
test_preds_proba /= FOLDS

oof_acc = accuracy_score(y, oof_preds)
print(f"\n Final OOF Accuracy: {oof_acc:.4f}")

# The submission lists the three most probable classes per row, so top-1
# accuracy alone does not describe its quality. Also report MAP@3: a row
# scores 1, 1/2 or 1/3 when the true class is ranked 1st, 2nd or 3rd, else 0.
# NOTE(review): presumably this matches the competition metric - confirm.
oof_top3 = np.argsort(oof_proba, axis=1)[:, -3:][:, ::-1]
top3_hits = oof_top3 == y.to_numpy()[:, None]
oof_map3 = (top3_hits / np.arange(1, 4)).sum(axis=1).mean()
print(f" Final OOF MAP@3: {oof_map3:.4f}")


Fold 1




[0]	validation_0-mlogloss:1.94558
[100]	validation_0-mlogloss:1.92617
[200]	validation_0-mlogloss:1.91695
[300]	validation_0-mlogloss:1.91164
[400]	validation_0-mlogloss:1.90814
[500]	validation_0-mlogloss:1.90587
[600]	validation_0-mlogloss:1.90435
[700]	validation_0-mlogloss:1.90341
[800]	validation_0-mlogloss:1.90295
[900]	validation_0-mlogloss:1.90262
[1000]	validation_0-mlogloss:1.90242
[1100]	validation_0-mlogloss:1.90241
[1148]	validation_0-mlogloss:1.90244
Fold 1 Accuracy: 0.2144
Time: 559.1 sec

Fold 2




[0]	validation_0-mlogloss:1.94556
[100]	validation_0-mlogloss:1.92621
[200]	validation_0-mlogloss:1.91699
[300]	validation_0-mlogloss:1.91148
[400]	validation_0-mlogloss:1.90773
[500]	validation_0-mlogloss:1.90531
[600]	validation_0-mlogloss:1.90375
[700]	validation_0-mlogloss:1.90278
[800]	validation_0-mlogloss:1.90218
[900]	validation_0-mlogloss:1.90176
[1000]	validation_0-mlogloss:1.90152
[1100]	validation_0-mlogloss:1.90141
[1200]	validation_0-mlogloss:1.90135
[1300]	validation_0-mlogloss:1.90135
[1356]	validation_0-mlogloss:1.90141
Fold 2 Accuracy: 0.2152
Time: 640.2 sec

Fold 3




[0]	validation_0-mlogloss:1.94557
[100]	validation_0-mlogloss:1.92593
[200]	validation_0-mlogloss:1.91636
[300]	validation_0-mlogloss:1.91069
[400]	validation_0-mlogloss:1.90687
[500]	validation_0-mlogloss:1.90439
[600]	validation_0-mlogloss:1.90268
[700]	validation_0-mlogloss:1.90162
[800]	validation_0-mlogloss:1.90098
[900]	validation_0-mlogloss:1.90056
[1000]	validation_0-mlogloss:1.90021
[1100]	validation_0-mlogloss:1.90013
[1200]	validation_0-mlogloss:1.90006
[1300]	validation_0-mlogloss:1.90005
[1400]	validation_0-mlogloss:1.90007
[1435]	validation_0-mlogloss:1.90009
Fold 3 Accuracy: 0.2170
Time: 664.8 sec

Fold 4




[0]	validation_0-mlogloss:1.94558
[100]	validation_0-mlogloss:1.92622
[200]	validation_0-mlogloss:1.91693
[300]	validation_0-mlogloss:1.91150
[400]	validation_0-mlogloss:1.90799
[500]	validation_0-mlogloss:1.90575
[600]	validation_0-mlogloss:1.90431
[700]	validation_0-mlogloss:1.90334
[800]	validation_0-mlogloss:1.90272
[900]	validation_0-mlogloss:1.90238
[1000]	validation_0-mlogloss:1.90215
[1100]	validation_0-mlogloss:1.90208
[1200]	validation_0-mlogloss:1.90207
[1271]	validation_0-mlogloss:1.90210
Fold 4 Accuracy: 0.2160
Time: 598.7 sec

Fold 5




[0]	validation_0-mlogloss:1.94558
[100]	validation_0-mlogloss:1.92607
[200]	validation_0-mlogloss:1.91682
[300]	validation_0-mlogloss:1.91136
[400]	validation_0-mlogloss:1.90774
[500]	validation_0-mlogloss:1.90545
[600]	validation_0-mlogloss:1.90383
[700]	validation_0-mlogloss:1.90298
[800]	validation_0-mlogloss:1.90239
[900]	validation_0-mlogloss:1.90208
[1000]	validation_0-mlogloss:1.90185
[1100]	validation_0-mlogloss:1.90178
[1200]	validation_0-mlogloss:1.90177
[1275]	validation_0-mlogloss:1.90182
Fold 5 Accuracy: 0.2162
Time: 602.8 sec

 Final OOF Accuracy: 0.2158


In [16]:
# For every test row take the column indices of the three highest averaged
# probabilities, most probable first.
top_3_preds = np.argsort(test_preds_proba, axis=1)[:, -3:][:, ::-1]

# Map the class codes back to fertilizer names with one vectorised lookup
# into the encoder's class array, instead of calling le.inverse_transform
# once per row in a Python loop.
top3_labels = le.classes_[top_3_preds]

# Each submission cell holds the three names separated by spaces.
top3_joined = [" ".join(row) for row in top3_labels]

# The sample submission supplies the id column; predictions are assigned
# positionally, so its row order is assumed to match the test file.
submission = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")
submission["Fertilizer Name"] = top3_joined
submission

Unnamed: 0,id,Fertilizer Name
0,750000,10-26-26 28-28 DAP
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 28-28 Urea
3,750003,14-35-14 17-17-17 DAP
4,750004,20-20 Urea 10-26-26
...,...,...
249995,999995,Urea 17-17-17 28-28
249996,999996,10-26-26 14-35-14 17-17-17
249997,999997,DAP Urea 14-35-14
249998,999998,10-26-26 28-28 17-17-17


In [17]:
# Write the submission file to the working directory without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)