In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.simplefilter('ignore')

# Every input file is resolved relative to the directory the notebook runs from.
PROJECT_HOME = os.getcwd()
DATA_DIR = f"{PROJECT_HOME}/data"

TRAIN_PATH = f"{DATA_DIR}/train_kaggle.csv"
TEST_PATH = f"{DATA_DIR}/test_kaggle.csv"
ORIGINAL_PATH = f"{DATA_DIR}/train_ieee.csv"

# Load the three CSV files in a fixed order: train, test, original.
train, test, original = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, ORIGINAL_PATH)
)

# Untouched copy of the test frame; its 'id' column is needed when the
# submission file is written, after 'id' has been dropped from `test`.
submission_df = test.copy()

# Cell output: the shape of each loaded frame.
tuple(frame.shape for frame in (train, test, original))

((750000, 10), (250000, 9), (100000, 9))

In [2]:
NFOLDS = 5

# Give every row of `train` and of `original` a validation-fold id in
# [0, NFOLDS), stratified on the target column. Each frame gets its own
# splitter built with the same settings, so the assignment is deterministic.
for frame in (train, original):
    skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    FOLDS = np.zeros(len(frame))  # float array, so the 'fold' column is float
    for i, (_, holdout_index) in enumerate(skf.split(frame, frame['Fertilizer Name'])):
        # Rows in the held-out part of split i belong to fold i.
        FOLDS[holdout_index] = i
    frame['fold'] = FOLDS

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name,fold
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28,3.0
1,1,27,69,65,Sandy,Millets,30,6,18,28-28,0.0
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17,0.0
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26,2.0
4,4,35,58,43,Red,Paddy,37,2,16,DAP,1.0


In [4]:
# Preview the first five rows of the `original` frame.
original.head(n=5)

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name,fold
0,32,51,41,Red,Ground Nuts,7,3,19,14-35-14,2.0
1,35,58,35,Black,Cotton,4,14,16,Urea,1.0
2,27,55,43,Sandy,Sugarcane,28,0,17,20-20,1.0
3,33,56,56,Loamy,Ground Nuts,37,5,24,28-28,3.0
4,32,70,60,Red,Ground Nuts,4,6,9,14-35-14,4.0


In [5]:
# Drop the row identifier from the modelling frames (submission_df keeps its
# own copy) and fix the misspelled temperature column in all three frames.
COLUMN_FIXES = {'Temparature': 'Temperature'}
train = train.drop(columns=['id']).rename(columns=COLUMN_FIXES)
test = test.drop(columns=['id']).rename(columns=COLUMN_FIXES)
original = original.rename(columns=COLUMN_FIXES)

# Integer-encode every object/category column found in `test`. Each encoder is
# fitted on `train` only and then reused for the other frames, so a value that
# is absent from `train` makes `transform` raise.
cat_cols = test.select_dtypes(include=['object', 'category']).columns.tolist()
for col in cat_cols:
    label_enc = LabelEncoder().fit(train[col])
    train[col] = label_enc.transform(train[col])
    test[col] = label_enc.transform(test[col])
    original[col] = label_enc.transform(original[col])

# The target gets its own encoder, kept around so predicted class ids can be
# mapped back to fertilizer names when the submission is written.
target_label_enc = LabelEncoder().fit(train["Fertilizer Name"])
train["Fertilizer Name"] = target_label_enc.transform(train["Fertilizer Name"])
original["Fertilizer Name"] = target_label_enc.transform(original["Fertilizer Name"])


In [6]:
# Preview the first five rows of the training frame after encoding.
train.head(n=5)


Unnamed: 0,Temperature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name,fold
0,37,70,36,1,8,36,4,5,4,3.0
1,27,69,65,4,4,30,6,18,4,0.0
2,29,63,32,4,4,24,12,16,2,0.0
3,35,62,54,4,0,39,12,4,0,2.0
4,35,58,43,3,6,37,2,16,5,1.0


In [7]:
# Tag each frame with a source id (0 = train, 1 = test, 2 = original) so the
# three can be stacked, converted together, and then split apart again.
train['comp_data'] = 0
test['comp_data'] = 1
original['comp_data'] = 2
raw = pd.concat([train, test, original]).reset_index(drop=True)
print(raw.shape)

# Add a categorical twin ("<name>_cat") of every numeric column. The cast is
# done on the stacked frame so train/test/original share one category set.
numerical_features = ['Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
for col in numerical_features:
    raw[col + '_cat'] = raw[col].astype(str).astype('category')

# The label-encoded soil/crop columns are converted in place to category dtype.
# They get their own list so `numerical_features` keeps naming only the numeric
# columns instead of being overwritten with categorical ones.
categorical_features = ['Soil Type', 'Crop Type']
for col in categorical_features:
    raw[col] = raw[col].astype(str).astype('category')

# Split back into the three frames using the source tag.
train = raw.loc[raw['comp_data'] == 0].reset_index(drop=True)
test = raw.loc[raw['comp_data'] == 1].reset_index(drop=True)
original = raw.loc[raw['comp_data'] == 2].reset_index(drop=True)
del raw

# 'comp_data' is not excluded from the model's feature list, so it is reset to
# the same constant in every frame and carries no source information.
test['comp_data'] = 0
original['comp_data'] = 0

(1100000, 11)


In [8]:
# Print (rows, columns) of the training frame, then display each column's dtype.
print((train.shape[0], train.shape[1]))
train.dtypes

(750000, 17)


Temperature           int64
Humidity              int64
Moisture              int64
Soil Type          category
Crop Type          category
Nitrogen              int64
Potassium             int64
Phosphorous           int64
Fertilizer Name     float64
fold                float64
comp_data             int64
Temperature_cat    category
Humidity_cat       category
Moisture_cat       category
Nitrogen_cat       category
Potassium_cat      category
Phosphorous_cat    category
dtype: object

Feature engineering to generate derived parameters and create randomness

In [9]:
# Integer multipliers used to weight the nutrient / soil / crop columns when
# building the derived features below.
RANDOM_1 = 51 * 19
RANDOM_2 = 33
RANDOM_3 = 37 * 29
RANDOM_4 = 23
RANDOM_5 = 33 * 21
RANDOM_6 = 16

RANDOM_7 = 7
RANDOM_8 = 11


def add_derived_features(frame):
    """Add the columns Derived_1 .. Derived_5 to `frame` in place.

    Derived_1..3 are weighted sums of Nitrogen, Potassium and Phosphorous, each
    emphasising a different nutrient. Derived_4 and Derived_5 add a weighted
    combination of the integer soil/crop codes to the mean of Derived_1..3.
    """
    nitrogen = frame['Nitrogen']
    potassium = frame['Potassium']
    phosphorous = frame['Phosphorous']

    frame["Derived_1"] = nitrogen * RANDOM_1 + potassium * RANDOM_2 + phosphorous
    frame["Derived_2"] = nitrogen + potassium * RANDOM_3 + phosphorous * RANDOM_4
    frame["Derived_3"] = nitrogen * RANDOM_6 + potassium + phosphorous * RANDOM_5

    # Soil/crop are category columns by now; cast back to plain integer arrays.
    soil_code = frame['Soil Type'].astype(int).values
    crop_code = frame['Crop Type'].astype(int).values
    nutrient_mean = (frame["Derived_1"] + frame["Derived_2"] + frame["Derived_3"]) / 3

    frame["Derived_4"] = soil_code + crop_code * RANDOM_7 + nutrient_mean
    frame["Derived_5"] = soil_code * RANDOM_8 + crop_code + nutrient_mean


# Apply the same feature recipe to all three frames.
for frame in (train, test, original):
    add_derived_features(frame)

In [10]:


def map3(predicted: np.ndarray, labels: np.ndarray, k: int = 3) -> float:
    """Mean Average Precision at k (default 3) for single-label classification.

    Each row scores 1/rank if its true label is among the `k` highest-scoring
    classes (rank 1 = highest score) and 0 otherwise; the result is the mean
    over rows.

    Args:
        predicted: 2-D array of per-class scores, one row per sample.
        labels: True class index per sample. Any shape holding one value per
            row is accepted (e.g. (n,) or (n, 1)); it is flattened first so a
            column vector cannot broadcast against the ranking.
        k: Number of ranked predictions to score. Capped at the number of
            classes, so inputs with fewer than `k` columns still work.

    Returns:
        The MAP@k score as a Python float; NaN when there are no rows.

    Raises:
        ValueError: if `k` < 1, `predicted` is not 2-D, or the number of labels
            does not match the number of rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    predicted = np.asarray(predicted)
    labels = np.asarray(labels).reshape(-1)

    if predicted.ndim != 2:
        raise ValueError("predicted must be a 2-D array of shape (n_samples, n_classes)")
    if predicted.shape[0] != labels.shape[0]:
        raise ValueError("predicted and labels must describe the same number of samples")
    if labels.size == 0:
        # The mean over zero samples is undefined.
        return float("nan")

    top_k = min(k, predicted.shape[1])

    # Class indices ordered from highest to lowest score, truncated to top_k.
    ranked = np.argsort(-1 * predicted, 1)[:, :top_k]

    # hits[i, j] is True when the (j+1)-th ranked class of row i is the label.
    hits = ranked == labels[:, None]
    rank_weights = 1.0 / np.arange(1, top_k + 1)

    # A row has at most one hit, so the row sum is just that hit's 1/rank.
    return float(np.mean((hits * rank_weights).sum(axis=1)))

In [11]:
# Columns that must never be fed to the model: the target, the fold id, and a
# few bookkeeping names. Every other column of `train` is a feature, in the
# frame's own column order.
NON_FEATURE_COLUMNS = ('fold', 'target', 'grp', 'Fertilizer Name', 'ids')
features = [column for column in train.columns if column not in NON_FEATURE_COLUMNS]

# Cell output: number of distinct values per feature.
train[features].nunique()

Temperature           14
Humidity              23
Moisture              41
Soil Type              5
Crop Type             11
Nitrogen              39
Potassium             20
Phosphorous           43
comp_data              1
Temperature_cat       14
Humidity_cat          23
Moisture_cat          41
Nitrogen_cat          39
Potassium_cat         20
Phosphorous_cat       43
Derived_1          26130
Derived_2          20100
Derived_3          27004
Derived_4          75218
Derived_5          75100
dtype: int64

In [12]:
# Display the dtype of every column in the training frame.
train.dtypes

Temperature           int64
Humidity              int64
Moisture              int64
Soil Type          category
Crop Type          category
Nitrogen              int64
Potassium             int64
Phosphorous           int64
Fertilizer Name     float64
fold                float64
comp_data             int64
Temperature_cat    category
Humidity_cat       category
Moisture_cat       category
Nitrogen_cat       category
Potassium_cat      category
Phosphorous_cat    category
Derived_1             int64
Derived_2             int64
Derived_3             int64
Derived_4           float64
Derived_5           float64
dtype: object

In [None]:
# Booster parameters for the native `xgb.train` API used in the training loop.
#
# Keys that belong only to the scikit-learn wrapper are left out on purpose:
#   * n_estimators       - the round limit is the `num_boost_round` argument of
#                          `xgb.train`, bounded further by early stopping.
#   * enable_categorical - passed to each `xgb.DMatrix` instead.
#   * random_state       - the training loop sets 'seed' per bag; keeping a
#                          second seed-like key here would compete with it.
#
# 'learning_rate', 'subsample' and 'colsample_bytree' are starting values only;
# the training loop re-draws them for every bag.
params = {
    "learning_rate": 0.02,
    "num_class": 7,
    "max_depth": 8,
    "min_child_weight": 0.00024,
    "subsample": 0.74,
    "colsample_bytree": 0.39,
    "gamma": 0.48,
    "reg_alpha": 0.027,
    "reg_lambda": 0.0002,
    "max_bin": 64,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "n_jobs": -1,
    "tree_method": "hist",  # Faster and memory-efficient
    "device": "cuda",
}

NBAGS = 2
NUM_CLASSES = params["num_class"]
NUM_BOOST_ROUND = 99999        # upper bound only; early stopping ends training
EARLY_STOPPING_ROUNDS = 100

# The per-bag learning rate is drawn from N(0.005, 0.01). Unclipped, that draw
# is zero or negative about 31% of the time, which is not a valid step size.
# It is therefore clipped to a positive range.
# NOTE(review): the bounds are a reviewer choice - tune to taste.
LEARNING_RATE_BOUNDS = (0.005, 0.03)
# Sampling fractions must stay within (0, 1].
FRACTION_BOUNDS = (0.05, 1.0)

# Seeded generator so the per-bag jitter (and thus the run) is reproducible.
rng = np.random.default_rng(42)

# Out-of-fold class probabilities for train, and averaged probabilities for test.
ytrain = np.zeros((len(train), NUM_CLASSES))
ytest = np.zeros((len(test), NUM_CLASSES))

# Features/labels of the `original` frame; appended to every fold's training
# data (never used for validation).
original_X = original[features]
original_y = original['Fertilizer Name']

# Pre-build test DMatrix (unchanged across folds/bags)
dtest = xgb.DMatrix(test[features], enable_categorical=True)

for fold in range(NFOLDS):
    print(f"Fold {fold}")

    # Plain NumPy boolean masks: safe for both DataFrame.loc and ndarray indexing.
    ind_train = (train['fold'] != fold).values
    ind_valid = (train['fold'] == fold).values

    X_valid = train.loc[ind_valid, features]
    y_valid = train.loc[ind_valid, 'Fertilizer Name']

    # Validation DMatrix created once per fold
    dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)

    # Training slice for this fold, reused by every bag
    fold_X = train.loc[ind_train, features]
    fold_y = train.loc[ind_train, 'Fertilizer Name']

    for bag in range(NBAGS):

        # Work on a copy so the shared `params` dict is never mutated.
        bag_params = dict(params)
        bag_params['seed'] = (bag + 1) * 11
        bag_params['learning_rate'] = float(
            np.clip(rng.normal(0.005, 0.01), *LEARNING_RATE_BOUNDS)
        )
        bag_params['colsample_bytree'] = float(
            np.clip(rng.normal(0.39, 0.005), *FRACTION_BOUNDS)
        )
        bag_params['subsample'] = float(
            np.clip(rng.normal(0.74, 0.005), *FRACTION_BOUNDS)
        )

        # Random oversampling: append K2 (5, 6 or 7) copies of the original data
        K2 = int(rng.integers(5, 8))

        X_train = pd.concat(
            [fold_X] + [original_X] * K2,
            axis=0,
            ignore_index=True
        )
        y_train_label = pd.concat(
            [fold_y] + [original_y] * K2,
            axis=0,
            ignore_index=True
        )

        # Create training DMatrix
        dtrain = xgb.DMatrix(X_train, label=y_train_label, enable_categorical=True)

        # Train until the validation metric stops improving for
        # EARLY_STOPPING_ROUNDS rounds; save_best=True asks the callback to
        # return the best model rather than the last one.
        model = xgb.train(
            bag_params,
            dtrain,
            NUM_BOOST_ROUND,
            evals=[(dvalid, "validation")],
            verbose_eval=False,
            callbacks=[
                xgb.callback.EarlyStopping(rounds=EARLY_STOPPING_ROUNDS, save_best=True)
            ],
        )

        # OOF predictions, averaged over the bags of this fold
        preds_valid = model.predict(dvalid)
        ytrain[ind_valid] += preds_valid / NBAGS

        # Print score for this bag
        print(f"Fold {fold} | Bag {bag} | MAP@3 = {map3(preds_valid, y_valid.values):.6f}")

        # Test predictions, averaged over every (fold, bag) model
        ytest += model.predict(dtest) / (NFOLDS * NBAGS)


# Overall out-of-fold MAP@3 across the whole training set.
score = map3(ytrain, train['Fertilizer Name'].values)
print(score)

# For every test row take the three highest-probability classes, best first,
# and map the class ids back to fertilizer names.
top_3_preds = np.argsort(ytest, axis=1)[:, -3:][:, ::-1]
top_3_labels = target_label_enc.inverse_transform(top_3_preds.ravel()).reshape(top_3_preds.shape)

# One row per test id; the three predicted names are space-separated.
submission = pd.DataFrame({
    'id': submission_df['id'],
    'Fertilizer Name': [' '.join(row) for row in top_3_labels]
})

# Create the output directory first so a missing folder cannot discard the
# result of the whole training run at the very last step.
OUTPUT_DIR = os.path.join(PROJECT_HOME, "output_dir")
os.makedirs(OUTPUT_DIR, exist_ok=True)
submission.to_csv(os.path.join(OUTPUT_DIR, "submission_Approach2.csv"), index=False)
print("✅ Submission file saved as 'submission_Approach2.csv'")

Fold 0
