# Logistic Regression Model for PL-S5E6

Feature Engineering idea : [EDA Notebook](https://www.kaggle.com/code/suhyukchoi/pl-s5e6-eda-notebook)

# Setup for Training

## Loading Libraries

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from itertools import combinations
import gc
import optuna

  from .autonotebook import tqdm as notebook_tqdm


## Loading Datasets

In [2]:
train = pd.read_csv("data/train.csv", index_col = 0)
test = pd.read_csv("data/test.csv", index_col = 0)
org_train = pd.read_csv("data/Fertilizer Prediction.csv")

train = pd.concat([train, org_train], axis=0, ignore_index=True)

train.head()


Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,27,69,65,Sandy,Millets,30,6,18,28-28
2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,35,58,43,Red,Paddy,37,2,16,DAP


## Check Dataset

### Train Dataset

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850000 entries, 0 to 849999
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Temparature      850000 non-null  int64 
 1   Humidity         850000 non-null  int64 
 2   Moisture         850000 non-null  int64 
 3   Soil Type        850000 non-null  object
 4   Crop Type        850000 non-null  object
 5   Nitrogen         850000 non-null  int64 
 6   Potassium        850000 non-null  int64 
 7   Phosphorous      850000 non-null  int64 
 8   Fertilizer Name  850000 non-null  object
dtypes: int64(6), object(3)
memory usage: 58.4+ MB


In [4]:
train.describe()

Unnamed: 0,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous
count,850000.0,850000.0,850000.0,850000.0,850000.0,850000.0
mean,31.503534,61.032665,45.162887,23.081215,9.477581,21.066198
std,4.024909,6.648149,11.799929,11.219842,5.765965,12.352064
min,25.0,50.0,25.0,4.0,0.0,0.0
25%,28.0,55.0,35.0,13.0,4.0,10.0
50%,32.0,61.0,45.0,23.0,9.0,21.0
75%,35.0,67.0,55.0,33.0,14.0,32.0
max,38.0,72.0,65.0,42.0,19.0,42.0


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 750000 to 999999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Temparature  250000 non-null  int64 
 1   Humidity     250000 non-null  int64 
 2   Moisture     250000 non-null  int64 
 3   Soil Type    250000 non-null  object
 4   Crop Type    250000 non-null  object
 5   Nitrogen     250000 non-null  int64 
 6   Potassium    250000 non-null  int64 
 7   Phosphorous  250000 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 17.2+ MB


In [6]:
test.describe()

Unnamed: 0,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,31.491648,61.04512,45.190444,23.139612,9.487764,21.12206
std,4.024093,6.636372,11.793167,11.215956,5.76686,12.38087
min,25.0,50.0,25.0,4.0,0.0,0.0
25%,28.0,55.0,35.0,13.0,4.0,10.0
50%,31.0,61.0,45.0,23.0,10.0,21.0
75%,35.0,67.0,55.0,33.0,14.0,32.0
max,38.0,72.0,65.0,42.0,19.0,42.0


- No Missing Values. Don't need to impute.

## Define Useful Methods/Variables

### Variables

In [7]:
TARGET = 'Fertilizer Name'
COLUMNS = train.columns.tolist()
QUAN_COLUMNS = [col for col in train.columns if train[col].dtype != 'object' and col != TARGET]
CAT_COLUMNS = [col for col in train.columns if train[col].dtype == 'object' and col != TARGET]

print("Total Columns:" ,COLUMNS)
print('Target column:', TARGET)
print('Quantitative columns:', QUAN_COLUMNS)
print('Categorical columns:', CAT_COLUMNS)

Total Columns: ['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name']
Target column: Fertilizer Name
Quantitative columns: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
Categorical columns: ['Soil Type', 'Crop Type']


### Methods

In [8]:
def FE_for_gbdt(train, test):
    dfs = [train, test]
    
    # 1. Categorical Columns Encoding.
    
    for df in dfs:
        new_CAT_COLUMNS = set() # For Uniqueness.
        '''
        1.  Add New Cateogical Column Here!
            You need to add new categorical columns into new_CAT_COLUMNS list to successfully conduct Interaction terms.
        '''
        # ========================================
        

        # =========================================
        '''
        2.  Add 2-level Interaction term between Categorical Columns.
            Automatically conducted.
        '''

        comb = list(combinations(CAT_COLUMNS + list(new_CAT_COLUMNS) + ["Temparature"], 2))
        
        for c1, c2 in comb:
            new_col = f"{c1}_{c2}"
            df[new_col] = df[c1].astype(str) + "_" + df[c2].astype(str)
            new_CAT_COLUMNS.add(new_col)

    # Update CAT_COLUMNS.
    CAT_COLUMNS.extend(list(new_CAT_COLUMNS))

    # Label Encoding for Categorical Columns.
    for col in CAT_COLUMNS:
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))

    # 2. Quantitative Columns Encoding.
    for df in dfs:
        new_QUAN_COLUMNS = set()  # For Uniqueness.
        new_RATIO_COLUMNS = set()  # To avoid duplicate columns in interaction feature step (ex : (c1/c2) * c2 == c1), we need to specify ratio columns.
        
        '''
        1.  Add New Quantitative Column Here!
            You need to add new quantitative columns into new_QUAN_COLUMNS list to successfully conduct Interaction terms.
        '''
        # ========================================
        # Add Total_Nutrients Feature.
        df["Total_Nutrients"] = df["Nitrogen"] + df["Phosphorous"] + df["Potassium"]
        new_QUAN_COLUMNS.add("Total_Nutrients")
        
        # Add Nutrients Ratio Feature.
        comb = list(combinations(["Nitrogen", "Phosphorous", "Potassium"], 2))
        
        for c1, c2 in comb:
            new_col = f"{c1}/{c2}"
            k = df[c2].mean() # Smoothing Factor.
            df[new_col] = df[c1] / (df[c2] + k)  # Avoid division by zero.
            new_QUAN_COLUMNS.add(new_col)
            new_RATIO_COLUMNS.add(new_col)
        
        # Add Humidity to Moisture Ratio Feature.
        k = df['Moisture'].mean() # smoothing factor to 1. avoid division by zero 2. to avoid too extreme values
        df['Humidity/Moisture'] = df['Humidity'] / (df['Moisture'] + k)  # Avoid division by zero
        new_QUAN_COLUMNS.add("Humidity/Moisture")
        new_RATIO_COLUMNS.add("Humidity/Moisture")
        
        # =========================================
        '''
        2.  Add 2-level Interaction term between Quantitative Columns.
            Automatically conducted.
        '''
        comb = list(combinations(QUAN_COLUMNS + list(new_QUAN_COLUMNS), 2))
        
        for c1, c2 in comb:
            # Check ratio columns to avoid duplicate columns.
            if c1 in new_RATIO_COLUMNS:
                if c1.split('/')[1] == c2:
                    continue
            elif c2 in new_RATIO_COLUMNS:
                if c2.split('/')[1] == c1:
                    continue
            else:
                new_col = f"{c1}*{c2}"
                df[new_col] = df[c1] * df[c2]
                new_QUAN_COLUMNS.add(new_col)
            
    # Update QUAN_COLUMNS.
    QUAN_COLUMNS.extend(list(new_QUAN_COLUMNS))

    # 3. Target Label Encoding.
    le_target = LabelEncoder()
    train[TARGET] = le_target.fit_transform(train[TARGET])
    
    # Delete Unnecessary Variables for memory Efficiency.
    del new_CAT_COLUMNS, new_QUAN_COLUMNS, new_RATIO_COLUMNS, dfs
    gc.collect()
    
    return train, test, le_target

In [9]:
def prob_to_top_k_label(prob, k = 3):
    return np.argsort(prob, axis = 1)[:, -k:][:, ::-1]

In [10]:
def MAP3_score(y_true, y_pred, k = 3):
    """
    y_pred : 2D array of shape (n_samples, k_classes) Here, k = 3
    y_true : 1D array of shape (n_samples,)
    """
    weight = np.linspace(1, 0, num = k, endpoint=False)
    return np.mean(np.sum((y_true.reshape(-1,1) == y_pred) * weight, axis = 1))

In [11]:
def make_sub(top_k_preds, le_target):
    # Load Sample Submission
    sample_submission = pd.read_csv("data/sample_submission.csv")

    # Convert top_k_preds to original target labels
    org_y_pred = le_target.inverse_transform(top_k_preds.astype(int))
    sample_submission[TARGET] = [' '.join(x) for x in org_y_pred]

    # Save submission file
    sample_submission.to_csv('submission/logistic_sub.csv', index = False)

    # Return the sample submission DataFrame
    return sample_submission

# Tuning XGBoost Parameter 

- Hyperparameter tuning using `optuna`

In [12]:
# Setting up the data.
train, test, le_target = FE_for_gbdt(train, test)
train_X = train.drop(TARGET, axis=1)
train_y = train[TARGET]

In [13]:
train.head()

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name,Soil Type_Crop Type,...,Moisture*Nitrogen,Moisture*Potassium,Moisture*Phosphorous,Moisture*Total_Nutrients,Nitrogen*Potassium,Nitrogen*Phosphorous,Nitrogen*Total_Nutrients,Potassium*Phosphorous,Potassium*Total_Nutrients,Phosphorous*Total_Nutrients
0,37,70,36,1,8,36,4,5,4,19,...,1296,144,180,1620,144,180,1620,20,180,225
1,27,69,65,4,4,30,6,18,4,48,...,1950,390,1170,3510,180,540,1620,108,324,972
2,29,63,32,4,4,24,12,16,2,48,...,768,384,512,1664,288,384,1248,192,624,832
3,35,62,54,4,0,39,12,4,0,44,...,2106,648,216,2970,468,156,2145,48,660,220
4,35,58,43,3,6,37,2,16,5,39,...,1591,86,688,2365,74,592,2035,32,110,880


## Parameter Setting

Using 5 folds.

- `max_depth` : 5 ~ 10
- `learning_rate` : 0.005 ~ 0.01
- `subsample` : 0.7 ~ 1.0
- `colsample_bytree` : 0.7 ~ 1.0
- `reg_alpha` : 0 ~ 1
- `reg_lambda` : 0 ~ 1

In [14]:
def objective(trial):
    xgb_params = {
        'objective' : 'multi:softprob',
        'num_class' : len(le_target.classes_),
        'eval_metric' : 'mlogloss',
        'device' : 'cuda',
        'tree_method' : 'hist',
        'max_depth' : trial.suggest_int('max_depth', 5, 10),
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.05),
        'subsample' : trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha' : trial.suggest_float('reg_alpha', 0.5, 3.0),
        'reg_lambda' : trial.suggest_float('reg_lambda', 0.5, 3.0),
        'n_estimators' : 10000,
        'early_stopping_rounds' : 100,
        'random_state' : 42,
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_probs = np.zeros((train_X.shape[0], len(le_target.classes_)))
    y_pred_probs = np.zeros((test.shape[0], len(le_target.classes_)))

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_X, train_y)):

        X_train, X_val = train_X.iloc[train_idx], train_X.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        model = xgb.XGBClassifier(**xgb_params)
        model.fit(X_train, y_train, 
                  eval_set=[(X_val, y_val)], 
                  verbose=1000)
        oof_probs[val_idx] = model.predict_proba(X_val)
        y_pred_probs += model.predict_proba(test) / kf.n_splits
        del model, X_train, X_val, y_train, y_val
        gc.collect()

    oof_score = MAP3_score(train_y.values, prob_to_top_k_label(oof_probs))
    print(f"OOF MAP@3 Score: {oof_score:.4f}")

    return oof_score

In [None]:
# Conduct hyperparameter optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2025-06-15 15:59:17,626] A new study created in memory with name: no-name-58275505-6bcc-4233-b6af-74b75f4b0549


[0]	validation_0-mlogloss:1.94556
[1000]	validation_0-mlogloss:1.92023
[1384]	validation_0-mlogloss:1.91962


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[0]	validation_0-mlogloss:1.94558
[1000]	validation_0-mlogloss:1.92119
[1256]	validation_0-mlogloss:1.92086
[0]	validation_0-mlogloss:1.94557
[1000]	validation_0-mlogloss:1.92043
[1468]	validation_0-mlogloss:1.92008
[0]	validation_0-mlogloss:1.94557
[1000]	validation_0-mlogloss:1.92061
[1408]	validation_0-mlogloss:1.92029
[0]	validation_0-mlogloss:1.94557
[1000]	validation_0-mlogloss:1.92138
[1226]	validation_0-mlogloss:1.92127


[I 2025-06-15 16:07:53,357] Trial 0 finished with value: 0.35895215686274495 and parameters: {'max_depth': 7, 'learning_rate': 0.04364703885880492, 'subsample': 0.48918329411772327, 'colsample_bytree': 0.6524520703787724, 'reg_alpha': 0.994639505430434, 'reg_lambda': 2.1633843026902224}. Best is trial 0 with value: 0.35895215686274495.


OOF MAP@3 Score: 0.3590
[0]	validation_0-mlogloss:1.94580
[1000]	validation_0-mlogloss:1.92782
[2000]	validation_0-mlogloss:1.92331
[3000]	validation_0-mlogloss:1.92062
[4000]	validation_0-mlogloss:1.91880
[5000]	validation_0-mlogloss:1.91749
[6000]	validation_0-mlogloss:1.91652
[7000]	validation_0-mlogloss:1.91581
[8000]	validation_0-mlogloss:1.91532
[9000]	validation_0-mlogloss:1.91496
[9999]	validation_0-mlogloss:1.91469
[0]	validation_0-mlogloss:1.94580
[1000]	validation_0-mlogloss:1.92861
[2000]	validation_0-mlogloss:1.92422
[3000]	validation_0-mlogloss:1.92168
[4000]	validation_0-mlogloss:1.91989
[5000]	validation_0-mlogloss:1.91856
[6000]	validation_0-mlogloss:1.91753
[7000]	validation_0-mlogloss:1.91685
[8000]	validation_0-mlogloss:1.91629
[9000]	validation_0-mlogloss:1.91594
[9999]	validation_0-mlogloss:1.91570
[0]	validation_0-mlogloss:1.94580


In [None]:
# Best Parameters
best_params = study.best_params
print("Best Parameters:", best_params)