# Predicting Optimal Fertilizers

In [1]:
import os
import warnings

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Suppress every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv


Private-fertilizer-predic-playground-series-s5-e6 Version 1 notebook Full EDA

Public notebook: https://www.kaggle.com/code/les1781/optimal-fertilizer-predict-playground-series-s5-e6

# Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import (
    LabelEncoder,    
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler
)
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.inspection import permutation_importance

# Initial analysis

In [3]:
# We load the data

fertilizers_train = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv", index_col="id")

In [4]:
fertilizers_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,750000.0,31.503565,4.025574,25.0,28.0,32.0,35.0,38.0
Humidity,750000.0,61.038912,6.647695,50.0,55.0,61.0,67.0,72.0
Moisture,750000.0,45.184147,11.794594,25.0,35.0,45.0,55.0,65.0
Nitrogen,750000.0,23.093808,11.216125,4.0,13.0,23.0,33.0,42.0
Potassium,750000.0,9.478296,5.765622,0.0,4.0,9.0,14.0,19.0
Phosphorous,750000.0,21.073227,12.346831,0.0,10.0,21.0,32.0,42.0


In [5]:
fertilizers_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Temparature      750000 non-null  int64 
 1   Humidity         750000 non-null  int64 
 2   Moisture         750000 non-null  int64 
 3   Soil Type        750000 non-null  object
 4   Crop Type        750000 non-null  object
 5   Nitrogen         750000 non-null  int64 
 6   Potassium        750000 non-null  int64 
 7   Phosphorous      750000 non-null  int64 
 8   Fertilizer Name  750000 non-null  object
dtypes: int64(6), object(3)
memory usage: 57.2+ MB


# Data wrangling

In [6]:
# We make a copy of the original dataset

fertilizers_new = fertilizers_train.copy()

In [7]:
# We confirm that there are no null values

# Count the nulls once and derive the percentage from the same Series.
null_counts = fertilizers_new.isnull().sum()
null_values = pd.DataFrame({
    "Null Data": null_counts,
    "Percentage": null_counts / len(fertilizers_new) * 100,
})

null_values

Unnamed: 0,Null Data,Percentage
Temparature,0,0.0
Humidity,0,0.0
Moisture,0,0.0
Soil Type,0,0.0
Crop Type,0,0.0
Nitrogen,0,0.0
Potassium,0,0.0
Phosphorous,0,0.0
Fertilizer Name,0,0.0


In [8]:
# We check the duplicate data found

# Boolean mask with one entry per row; True marks a repeat of an earlier row.
duplicate_mask = fertilizers_new.duplicated()

print(f"Length: {len(duplicate_mask)}")
print(f"Duplicates: {duplicate_mask.sum()}")

Length: 750000
Duplicates: 0


In [9]:
'''
eval_out = sns.PairGrid(fertilizers_new, palette=sns.light_palette("seagreen"))
eval_out.map(sns.boxplot)
eval_out.tick_params(axis="both", labelbottom=False)
'''

'\neval_out = sns.PairGrid(fertilizers_new, palette=sns.light_palette("seagreen"))\neval_out.map(sns.boxplot)\neval_out.tick_params(axis="both", labelbottom=False)\n'

# Data Preprocessing

In [10]:
fertilizers_end = fertilizers_new.copy()

## Feature Engineering

In [11]:
# Bin edges for the climate variables (three bins each)

bins_tem = [0.0, 29.0, 33.0, 39.0]
bins_hum = [0.0, 57.0, 65.0, 73.0]
bins_moi = [0.0, 36.0, 51.0, 66.0]

labels_climate = ["Low", "Medium", "High"]

# Bin edges for the soil-element variables (three bins each); the -0.1 lower
# edge keeps observations equal to 0 inside the first bin

bins_n = [0.0, 15.0, 30.0, 43.0]
bins_k = [-0.1, 6.0, 12.0, 20.0]
bins_p = [-0.1, 15.0, 30.0, 43.0]

labels_elements = ["Small", "Medium", "Large"]

# We create one "<feature>_Bins" column per source feature, in this order

for source_col, edges, bin_labels in (
    ("Temparature", bins_tem, labels_climate),
    ("Humidity", bins_hum, labels_climate),
    ("Moisture", bins_moi, labels_climate),
    ("Nitrogen", bins_n, labels_elements),
    ("Potassium", bins_k, labels_elements),
    ("Phosphorous", bins_p, labels_elements),
):
    fertilizers_end[f"{source_col}_Bins"] = pd.cut(
        fertilizers_end[source_col], edges, labels=bin_labels
    )

In [12]:
'''
# Polynomial / transforms

fertilizers_end["tem_log"] = np.log1p(fertilizers_end["Temparature"])
fertilizers_end["tem_sq"]  = fertilizers_end["Temparature"] ** 2
fertilizers_end["tem_cubert"] = np.cbrt(fertilizers_end["Temparature"])

fertilizers_end["hum_log"] = np.log1p(fertilizers_end["Humidity"])
fertilizers_end["hum_sq"]  = fertilizers_end["Humidity"] ** 2
fertilizers_end["hum_cubert"] = np.cbrt(fertilizers_end["Humidity"])

fertilizers_end["moi_log"] = np.log1p(fertilizers_end["Moisture"])
fertilizers_end["moi_sq"]  = fertilizers_end["Moisture"] ** 2
fertilizers_end["moi_cubert"] = np.cbrt(fertilizers_end["Moisture"])

fertilizers_end["n_log"] = np.log1p(fertilizers_end["Nitrogen"])
fertilizers_end["n_sq"]  = fertilizers_end["Nitrogen"] ** 2
fertilizers_end["n_cubert"] = np.cbrt(fertilizers_end["Nitrogen"])

fertilizers_end["k_log"] = np.log1p(fertilizers_end["Potassium"])
fertilizers_end["k_sq"]  = fertilizers_end["Potassium"] ** 2
fertilizers_end["k_cubert"] = np.cbrt(fertilizers_end["Potassium"])

fertilizers_end["p_log"] = np.log1p(fertilizers_end["Phosphorous"])
fertilizers_end["p_sq"]  = fertilizers_end["Phosphorous"] ** 2
fertilizers_end["p_cubert"] = np.cbrt(fertilizers_end["Phosphorous"])
'''

'\n# Polynomial / transforms\n\nfertilizers_end["tem_log"] = np.log1p(fertilizers_end["Temparature"])\nfertilizers_end["tem_sq"]  = fertilizers_end["Temparature"] ** 2\nfertilizers_end["tem_cubert"] = np.cbrt(fertilizers_end["Temparature"])\n\nfertilizers_end["hum_log"] = np.log1p(fertilizers_end["Humidity"])\nfertilizers_end["hum_sq"]  = fertilizers_end["Humidity"] ** 2\nfertilizers_end["hum_cubert"] = np.cbrt(fertilizers_end["Humidity"])\n\nfertilizers_end["moi_log"] = np.log1p(fertilizers_end["Moisture"])\nfertilizers_end["moi_sq"]  = fertilizers_end["Moisture"] ** 2\nfertilizers_end["moi_cubert"] = np.cbrt(fertilizers_end["Moisture"])\n\nfertilizers_end["n_log"] = np.log1p(fertilizers_end["Nitrogen"])\nfertilizers_end["n_sq"]  = fertilizers_end["Nitrogen"] ** 2\nfertilizers_end["n_cubert"] = np.cbrt(fertilizers_end["Nitrogen"])\n\nfertilizers_end["k_log"] = np.log1p(fertilizers_end["Potassium"])\nfertilizers_end["k_sq"]  = fertilizers_end["Potassium"] ** 2\nfertilizers_end["k_cuber

In [13]:
fertilizers_end.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   Temparature       750000 non-null  int64   
 1   Humidity          750000 non-null  int64   
 2   Moisture          750000 non-null  int64   
 3   Soil Type         750000 non-null  object  
 4   Crop Type         750000 non-null  object  
 5   Nitrogen          750000 non-null  int64   
 6   Potassium         750000 non-null  int64   
 7   Phosphorous       750000 non-null  int64   
 8   Fertilizer Name   750000 non-null  object  
 9   Temparature_Bins  750000 non-null  category
 10  Humidity_Bins     750000 non-null  category
 11  Moisture_Bins     750000 non-null  category
 12  Nitrogen_Bins     750000 non-null  category
 13  Potassium_Bins    750000 non-null  category
 14  Phosphorous_Bins  750000 non-null  category
dtypes: category(6), int64(6), object(3)
memory usage: 61.5+ 

## Encoding

In [14]:
# We encode the target variable as integers; `le` is kept so that
# predictions can be decoded back to fertilizer names later on

le = LabelEncoder()
le.fit(fertilizers_end["Fertilizer Name"])

fertilizers_end["Fertilizer Name"] = le.transform(fertilizers_end["Fertilizer Name"])

In [15]:
le_values = le.classes_

In [16]:
# We create a df with the encoded categorical variables

cat_cols = fertilizers_end[["Soil Type", "Crop Type"]]
rest_cols = fertilizers_end.drop(columns=["Soil Type", "Crop Type"])

# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4. A dense result is required for
# set_output(transform="pandas").
encoder = OneHotEncoder(sparse_output=False, drop="if_binary").set_output(transform="pandas")
cat_enc = encoder.fit_transform(cat_cols)

# Column-wise concat: the untouched columns first, then the one-hot columns.
df_encoded = pd.concat([rest_cols, cat_enc], axis=1)

In [17]:
# We create a useful function

def mapper(data, column, order):
    """Replace the values of ``data[column]`` in place using ``order``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is modified in place.
    column : str
        Name of the column to re-encode.
    order : dict
        Mapping from the current values to their numeric codes.

    The column is stored as float64 afterwards and its value counts are
    printed. Nothing is returned.
    """
    data[column] = data[column].map(order).astype("float64")

    counts = data[column].value_counts()
    print(counts)

In [18]:
# We map the binned variables to ordinal codes and change the format

climate_order = {"Low" : 0, "Medium" : 1, "High" : 2}
elements_order = {"Small" : 0, "Medium" : 1, "Large" : 2}

# Climate bins first, then element bins (same call order as the printed output).
for bin_col in ("Temparature_Bins", "Humidity_Bins", "Moisture_Bins"):
    mapper(df_encoded, bin_col, climate_order)

for bin_col in ("Nitrogen_Bins", "Potassium_Bins", "Phosphorous_Bins"):
    mapper(df_encoded, bin_col, elements_order)

Temparature_Bins
2.0    268332
0.0    267043
1.0    214625
Name: count, dtype: int64
Humidity_Bins
0.0    260326
1.0    259289
2.0    230385
Name: count, dtype: int64
Moisture_Bins
1.0    276824
2.0    259353
0.0    213823
Name: count, dtype: int64
Nitrogen_Bins
1.0    292080
2.0    231010
0.0    226910
Name: count, dtype: int64
Potassium_Bins
0.0    262455
2.0    262427
1.0    225118
Name: count, dtype: int64
Phosphorous_Bins
0.0    275369
1.0    267395
2.0    207236
Name: count, dtype: int64


In [19]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 29 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Temparature            750000 non-null  int64  
 1   Humidity               750000 non-null  int64  
 2   Moisture               750000 non-null  int64  
 3   Nitrogen               750000 non-null  int64  
 4   Potassium              750000 non-null  int64  
 5   Phosphorous            750000 non-null  int64  
 6   Fertilizer Name        750000 non-null  int64  
 7   Temparature_Bins       750000 non-null  float64
 8   Humidity_Bins          750000 non-null  float64
 9   Moisture_Bins          750000 non-null  float64
 10  Nitrogen_Bins          750000 non-null  float64
 11  Potassium_Bins         750000 non-null  float64
 12  Phosphorous_Bins       750000 non-null  float64
 13  Soil Type_Black        750000 non-null  float64
 14  Soil Type_Clayey       750000 non-null  f

In [20]:
#df_encoded.corr(numeric_only=True).round(2)

## Scaling

In [21]:
df_encoded.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,750000.0,31.503565,4.025574,25.0,28.0,32.0,35.0,38.0
Humidity,750000.0,61.038912,6.647695,50.0,55.0,61.0,67.0,72.0
Moisture,750000.0,45.184147,11.794594,25.0,35.0,45.0,55.0,65.0
Nitrogen,750000.0,23.093808,11.216125,4.0,13.0,23.0,33.0,42.0
Potassium,750000.0,9.478296,5.765622,0.0,4.0,9.0,14.0,19.0
Phosphorous,750000.0,21.073227,12.346831,0.0,10.0,21.0,32.0,42.0
Fertilizer Name,750000.0,2.859791,1.967025,0.0,1.0,3.0,4.0,6.0
Temparature_Bins,750000.0,1.001719,0.844885,0.0,0.0,1.0,2.0,2.0
Humidity_Bins,750000.0,0.960079,0.807891,0.0,0.0,1.0,2.0,2.0
Moisture_Bins,750000.0,1.060707,0.79197,0.0,0.0,1.0,2.0,2.0


In [22]:
# We separate the target variable (y) from the features (x)

y_fertilizers = df_encoded["Fertilizer Name"]
x_fertilizers = df_encoded.drop(columns=["Fertilizer Name"])

In [23]:
'''
# Numerical variables to scale

fertilizer_numeric = x_fertilizers[[
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "tem_sq",
    "hum_sq",
    "moi_sq",
    "n_sq",
    "k_sq",
    "p_sq"
]]

scaler = MinMaxScaler().set_output(transform="pandas")
scale_num = scaler.fit_transform(fertilizer_numeric)

# We create a df with the remaining variables

fertilizer_rest = x_fertilizers.drop(columns=[
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "tem_sq",
    "hum_sq",
    "moi_sq",
    "n_sq",
    "k_sq",
    "p_sq"
])

# We concatenate the dataframes

x_end = pd.concat([scale_num, fertilizer_rest], axis=1)
'''

'\n# Numerical variables to scale\n\nfertilizer_numeric = x_fertilizers[[\n    "Temparature",\n    "Humidity",\n    "Moisture",\n    "Nitrogen",\n    "Potassium",\n    "Phosphorous",\n    "tem_sq",\n    "hum_sq",\n    "moi_sq",\n    "n_sq",\n    "k_sq",\n    "p_sq"\n]]\n\nscaler = MinMaxScaler().set_output(transform="pandas")\nscale_num = scaler.fit_transform(fertilizer_numeric)\n\n# We create a df with the remaining variables\n\nfertilizer_rest = x_fertilizers.drop(columns=[\n    "Temparature",\n    "Humidity",\n    "Moisture",\n    "Nitrogen",\n    "Potassium",\n    "Phosphorous",\n    "tem_sq",\n    "hum_sq",\n    "moi_sq",\n    "n_sq",\n    "k_sq",\n    "p_sq"\n])\n\n# We concatenate the dataframes\n\nx_end = pd.concat([scale_num, fertilizer_rest], axis=1)\n'

In [24]:
# Numerical variables to scale (listed once and reused below)

numeric_cols = [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
]

fertilizer_numeric = x_fertilizers[numeric_cols]

scaler = MinMaxScaler().set_output(transform="pandas")
scale_num = scaler.fit_transform(fertilizer_numeric)

# We create a df with the remaining (unscaled) variables

fertilizer_rest = x_fertilizers.drop(columns=numeric_cols)

# We concatenate the dataframes: scaled columns first, then the rest

x_end = pd.concat([scale_num, fertilizer_rest], axis=1)

In [25]:
x_end.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 28 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Temparature            750000 non-null  float64
 1   Humidity               750000 non-null  float64
 2   Moisture               750000 non-null  float64
 3   Nitrogen               750000 non-null  float64
 4   Potassium              750000 non-null  float64
 5   Phosphorous            750000 non-null  float64
 6   Temparature_Bins       750000 non-null  float64
 7   Humidity_Bins          750000 non-null  float64
 8   Moisture_Bins          750000 non-null  float64
 9   Nitrogen_Bins          750000 non-null  float64
 10  Potassium_Bins         750000 non-null  float64
 11  Phosphorous_Bins       750000 non-null  float64
 12  Soil Type_Black        750000 non-null  float64
 13  Soil Type_Clayey       750000 non-null  float64
 14  Soil Type_Loamy        750000 non-null  f

In [26]:
x_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,750000.0,0.500274,0.30966,0.0,0.230769,0.538462,0.769231,1.0
Humidity,750000.0,0.501769,0.302168,0.0,0.227273,0.5,0.772727,1.0
Moisture,750000.0,0.504604,0.294865,0.0,0.25,0.5,0.75,1.0
Nitrogen,750000.0,0.502469,0.295161,0.0,0.236842,0.5,0.763158,1.0
Potassium,750000.0,0.498858,0.303454,0.0,0.210526,0.473684,0.736842,1.0
Phosphorous,750000.0,0.501743,0.293972,0.0,0.238095,0.5,0.761905,1.0
Temparature_Bins,750000.0,1.001719,0.844885,0.0,0.0,1.0,2.0,2.0
Humidity_Bins,750000.0,0.960079,0.807891,0.0,0.0,1.0,2.0,2.0
Moisture_Bins,750000.0,1.060707,0.79197,0.0,0.0,1.0,2.0,2.0
Nitrogen_Bins,750000.0,1.005467,0.781365,0.0,0.0,1.0,2.0,2.0


## Feature Selection

In [27]:
# Mutual information between every feature and the encoded target.
# The estimator adds random noise to continuous features, so the seed is
# fixed to make the ranking below reproducible across runs.
mi_values = mutual_info_classif(x_end, y_fertilizers, random_state=42)

# Label each score with its feature name and sort from most to least informative.
fertilizers_scores = pd.Series(
    mi_values, name="Fertilizers MI Scores", index=x_end.columns
).sort_values(ascending=False)
fertilizers_scores

Moisture_Bins            0.010182
Potassium_Bins           0.010178
Nitrogen_Bins            0.010081
Temparature_Bins         0.009796
Humidity_Bins            0.008730
Phosphorous_Bins         0.008328
Soil Type_Clayey         0.003656
Moisture                 0.003411
Soil Type_Black          0.003398
Soil Type_Sandy          0.003335
Soil Type_Loamy          0.003314
Temparature              0.002990
Potassium                0.002913
Phosphorous              0.002868
Nitrogen                 0.002473
Soil Type_Red            0.002008
Crop Type_Pulses         0.001631
Crop Type_Sugarcane      0.001362
Crop Type_Wheat          0.001280
Crop Type_Ground Nuts    0.001029
Crop Type_Barley         0.000866
Crop Type_Cotton         0.000487
Crop Type_Maize          0.000442
Crop Type_Paddy          0.000430
Crop Type_Millets        0.000379
Humidity                 0.000333
Crop Type_Tobacco        0.000201
Crop Type_Oil seeds      0.000000
Name: Fertilizers MI Scores, dtype: float64

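First feature analysis (earlier run with all features, including the log / square / cube-root transforms that are now commented out in `In [12]`):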

    Moisture_Bins            0.010722
    Potassium_Bins           0.010263
    Nitrogen_Bins            0.009758
    Temparature_Bins         0.009301
    Humidity_Bins            0.009003
    Phosphorous_Bins         0.008652
    p_sq                     0.004939
    Soil Type_Red            0.004646
    moi_sq                   0.004591
    Soil Type_Clayey         0.004016
    tem_sq                   0.003965
    p_log                    0.003829
    Phosphorous              0.003596
    moi_cubert               0.003573
    p_cubert                 0.003512
    k_log                    0.003362
    Soil Type_Loamy          0.003089
    Moisture                 0.002972
    moi_log                  0.002962
    tem_log                  0.002932
    Temparature              0.002869
    Crop Type_Tobacco        0.002838
    Potassium                0.002815
    Crop Type_Cotton         0.002729
    k_sq                     0.002702
    Nitrogen                 0.002574
    Soil Type_Sandy          0.002465
    hum_log                  0.002205
    Humidity                 0.002096
    n_cubert                 0.002027
    Crop Type_Paddy          0.002016
    k_cubert                 0.001883
    n_sq                     0.001818
    Crop Type_Oil seeds      0.001802
    hum_cubert               0.001699
    n_log                    0.001666
    Soil Type_Black          0.001601
    hum_sq                   0.001590
    Crop Type_Pulses         0.001401
    tem_cubert               0.001277
    Crop Type_Wheat          0.001178
    Crop Type_Millets        0.000953
    Crop Type_Ground Nuts    0.000540
    Crop Type_Sugarcane      0.000244
    Crop Type_Maize          0.000215
    Crop Type_Barley         0.000000

In [28]:
'''
scores = fertilizers_scores.sort_values(ascending=True)
width = np.arange(len(fertilizers_scores))
ticks = list(fertilizers_scores.index)
plt.barh(width, fertilizers_scores)
plt.yticks(width, ticks)
plt.title("Mutual Information Scores")
plt.figure(dpi=100, figsize=(8, 5))
plt.show()
'''

'\nscores = fertilizers_scores.sort_values(ascending=True)\nwidth = np.arange(len(fertilizers_scores))\nticks = list(fertilizers_scores.index)\nplt.barh(width, fertilizers_scores)\nplt.yticks(width, ticks)\nplt.title("Mutual Information Scores")\nplt.figure(dpi=100, figsize=(8, 5))\nplt.show()\n'

In [29]:
x_end.columns

Index(['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium',
       'Phosphorous', 'Temparature_Bins', 'Humidity_Bins', 'Moisture_Bins',
       'Nitrogen_Bins', 'Potassium_Bins', 'Phosphorous_Bins',
       'Soil Type_Black', 'Soil Type_Clayey', 'Soil Type_Loamy',
       'Soil Type_Red', 'Soil Type_Sandy', 'Crop Type_Barley',
       'Crop Type_Cotton', 'Crop Type_Ground Nuts', 'Crop Type_Maize',
       'Crop Type_Millets', 'Crop Type_Oil seeds', 'Crop Type_Paddy',
       'Crop Type_Pulses', 'Crop Type_Sugarcane', 'Crop Type_Tobacco',
       'Crop Type_Wheat'],
      dtype='object')

In [30]:
# One-hot encoded soil and crop columns that are compressed with PCA.
x_comp = x_end[['Soil Type_Black',
       'Soil Type_Clayey', 'Soil Type_Loamy', 'Soil Type_Red',
       'Soil Type_Sandy', 'Crop Type_Barley', 'Crop Type_Cotton',
       'Crop Type_Ground Nuts', 'Crop Type_Maize', 'Crop Type_Millets',
       'Crop Type_Oil seeds', 'Crop Type_Paddy', 'Crop Type_Pulses',
       'Crop Type_Sugarcane', 'Crop Type_Tobacco', 'Crop Type_Wheat']]

# With the default svd_solver="auto", scikit-learn may choose the randomized
# solver for an input of this size; fixing random_state keeps the two
# components identical from run to run.
pca = PCA(n_components=2, random_state=42).set_output(transform="pandas")
x_pca = pca.fit_transform(x_comp)

In [31]:
# Keep every feature except the one-hot columns that were selected into x_comp.
x_keep = x_end.drop(columns=x_comp.columns)

In [32]:
x_end_new = pd.concat([x_keep, x_pca], axis=1)

In [33]:
#x_end_new = x_end.drop(columns=[])

In [34]:
x_end_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Temparature       750000 non-null  float64
 1   Humidity          750000 non-null  float64
 2   Moisture          750000 non-null  float64
 3   Nitrogen          750000 non-null  float64
 4   Potassium         750000 non-null  float64
 5   Phosphorous       750000 non-null  float64
 6   Temparature_Bins  750000 non-null  float64
 7   Humidity_Bins     750000 non-null  float64
 8   Moisture_Bins     750000 non-null  float64
 9   Nitrogen_Bins     750000 non-null  float64
 10  Potassium_Bins    750000 non-null  float64
 11  Phosphorous_Bins  750000 non-null  float64
 12  pca0              750000 non-null  float64
 13  pca1              750000 non-null  float64
dtypes: float64(14)
memory usage: 85.8 MB


# Model Selection

In [35]:
# We hold out 20% of the rows for validation (fixed seed; the split is not stratified)

x_train, x_val, y_train, y_val = train_test_split(
    x_end_new,
    y_fertilizers,
    test_size=0.2,
    random_state=42,
)

In [36]:
# We review the balance of the target variable in the training split

# Row 0 holds the encoded class labels, row 1 the number of rows per class.
class_labels, class_counts = np.unique(y_train, return_counts=True)
values_counts = np.asarray((class_labels, class_counts))

print(values_counts)

[[    0     1     2     3     4     5     6]
 [91046 91797 89689 88879 88774 75712 74103]]


In [37]:
# Common function to calculate AP@K and then MAP@K

def apk(actual, predicted, k):
    """Average precision at k for a single observation.

    Parameters
    ----------
    actual : sequence
        The relevant (true) items for this observation.
    predicted : sequence
        Ranked predictions, best first. Only the first ``k`` are scored.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision@i over the ranks i where a new relevant item is
        hit, divided by ``min(len(actual), k)``. Returns 0.0 when ``actual``
        is empty or ``k`` is not positive.
    """
    # Nothing can be scored without a ground truth or with an empty cut-off.
    if len(actual) == 0 or k <= 0:
        return 0.0

    top_k = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for rank, candidate in enumerate(top_k, start=1):
        # A repeated prediction must not be rewarded a second time.
        if candidate in actual and candidate not in top_k[:rank - 1]:
            num_hits += 1.0
            score += num_hits / rank

    # Normalise by the best achievable number of hits so the result stays in
    # [0, 1] when several items are relevant. For a single relevant item the
    # divisor is 1, so the score is the plain reciprocal rank of the hit.
    return score / min(len(actual), k)

def mapk(actual, predicted, k):
    """Mean of ``apk`` over paired observations.

    ``actual`` and ``predicted`` are iterated together with ``zip``; each
    pair is scored with ``apk(a, p, k)`` and the scores are averaged with
    ``np.mean``.
    """
    per_row_scores = []

    for truth, ranked in zip(actual, predicted):
        per_row_scores.append(apk(truth, ranked, k))

    return np.mean(per_row_scores)

In [38]:
# Function for an initial evaluation of the model

def evaluator(model, val_x, val_y, enc, k, model_name):
    """Print the MAP@k of a fitted classifier on a validation set.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    val_x : validation features passed to ``model.predict_proba``.
    val_y : iterable of encoded validation labels.
    enc : fitted label encoder exposing ``inverse_transform``; used to turn
        encoded labels back into names.
    k : int
        Number of ranked predictions kept per row.
    model_name : str
        Name shown in the printed report.

    Assumes column ``j`` of ``predict_proba`` is the probability of encoded
    label ``j`` (true when the model was trained on labels produced by
    ``enc``). Nothing is returned; the score is printed.
    """
    y_pred_prob = np.asarray(model.predict_proba(val_x))

    # Rank the classes of every row by descending probability and keep the
    # best k. The cut-off used to be hard-coded to 3 regardless of `k`.
    top_k_indices_val = np.argsort(-y_pred_prob, axis=1)[:, :k]

    # Decode all predictions with a single inverse_transform call instead of
    # one call per row, then restore the (rows, k) layout.
    predicted_val = (
        enc.inverse_transform(top_k_indices_val.ravel())
        .reshape(top_k_indices_val.shape)
        .tolist()
    )

    # Each ground truth is wrapped in a one-element list, as mapk expects.
    actual_val = [[name] for name in enc.inverse_transform(np.asarray(val_y))]

    map_k_score_val = mapk(actual_val, predicted_val, k=k)

    print(f"\n{model_name} initial MAP@{k} Score: {map_k_score_val:.4f}")

In [39]:
# We create the StratifiedKFold splitter used by cv_evaluator
# (shuffled folds with a fixed seed)

n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [40]:
# Function to evaluate models with a cross-validation method

def cv_evaluator(model_class, model_params, x, y, enc, k_map, model_name, cv=None):
    """Cross-validate a classifier and report MAP@k for every fold.

    Parameters
    ----------
    model_class : callable
        Estimator class; a new instance is built per fold with
        ``model_class(**model_params)``.
    model_params : dict
        Keyword arguments for ``model_class``.
    x, y : pandas objects
        Features and encoded labels; both are indexed with ``.iloc``.
    enc : fitted label encoder exposing ``classes_`` and ``inverse_transform``.
    k_map : int
        Number of ranked predictions kept per row.
    model_name : str
        Name shown in the printed summary.
    cv : splitter, optional
        Object exposing ``split(x, y)`` and ``get_n_splits(x, y)``. Defaults
        to the notebook-level ``skf``.

    Returns
    -------
    list of float
        One MAP@k score per fold.
    """
    splitter = skf if cv is None else cv
    total_folds = splitter.get_n_splits(x, y)
    n_known = len(enc.classes_)

    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(x, y)):

        print(f"\nFold {fold + 1}/{total_folds}")
        x_train_fold, x_val_fold = x.iloc[train_idx], x.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model = model_class(**model_params)
        model.fit(x_train_fold, y_train_fold)

        y_pred_prob = np.asarray(model.predict_proba(x_val_fold))
        top_k_indices_val = np.argsort(-y_pred_prob, axis=1)[:, :k_map]

        if y_pred_prob.shape[1] <= n_known:
            # Usual case: every column index is a known encoded label, so the
            # whole fold is decoded with one inverse_transform call.
            predicted_fold = (
                enc.inverse_transform(top_k_indices_val.ravel())
                .reshape(top_k_indices_val.shape)
                .tolist()
            )
        else:
            # More probability columns than known labels: drop the unknown
            # indices row by row, exactly as the per-row version did.
            predicted_fold = [
                list(enc.inverse_transform([idx for idx in row_indices if idx < n_known]))
                for row_indices in top_k_indices_val
            ]

        # Each ground truth is wrapped in a one-element list, as mapk expects.
        actual_fold = [[name] for name in enc.inverse_transform(np.asarray(y_val_fold))]

        fold_map_score = mapk(actual_fold, predicted_fold, k=k_map)
        fold_scores.append(fold_map_score)
        print(f"MAP@{k_map} Fold {fold + 1}: {fold_map_score:.4f}")

    print(f"\n--- CV Summary for {model_name} ---\n")
    print(f"MAP@{k_map} Scores for fold: {np.round(fold_scores, 4)}")
    print(f"MAP@{k_map} average: {np.mean(fold_scores):.4f}")
    print(f"Standard deviation of MAP@{k_map}: {np.std(fold_scores):.4f}")

    return fold_scores

## First model: LogisticRegression

In [41]:
# We create the model instance

#lrc = LogisticRegression(multi_class="multinomial")

# Train the model with the data

#lrc.fit(x_train, y_train)

In [42]:
#evaluator(lrc, x_val, y_val, le, 3, "LogisticRegression")

LogisticRegression initial MAP@3 Score: 0.2870

LogisticRegression initial MAP@3 Score: 0.2894 (post feature engineering)

In [43]:
'''
# We apply the function for CV

lrc_params = {"multi_class": "multinomial", "random_state": 42}

lrc_cv_scores = cv_evaluator(
    model_class=LogisticRegression,
    model_params=lrc_params,
    x=x_end_new,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="Logistic Regression"
)
'''

'\n# We apply the function for CV\n\nlrc_params = {"multi_class": "multinomial", "random_state": 42}\n\nlrc_cv_scores = cv_evaluator(\n    model_class=LogisticRegression,\n    model_params=lrc_params,\n    x=x_end_new,\n    y=y_fertilizers, \n    enc=le,\n    k_map=3,\n    model_name="Logistic Regression"\n)\n'


--- CV Summary for Logistic Regression ---


MAP@3 Scores for fold: [0.2871 0.2875 0.2861 0.2856 0.2871]

MAP@3 average: 0.2867

Standard deviation of MAP@3: 0.0007

## Second model: XGBClassifier

In [44]:
# XGBoost classifier with the library's default hyper-parameters

xgbc = XGBClassifier()

# We fit it on the training split

xgbc.fit(X=x_train, y=y_train)

In [45]:
evaluator(xgbc, x_val, y_val, le, 3, "XGBClassifier")


XGBClassifier initial MAP@3 Score: 0.3276


XGBClassifier initial MAP@3 Score: 0.3307

In [46]:
'''
# We apply the function for CV

xgbc_params = xgbc.get_params()

xgbc_cv_scores = cv_evaluator(
    model_class=XGBClassifier,
    model_params=xgbc_params,
    x=x_end_new,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="XGBClassifier"
)
'''

'\n# We apply the function for CV\n\nxgbc_params = xgbc.get_params()\n\nxgbc_cv_scores = cv_evaluator(\n    model_class=XGBClassifier,\n    model_params=xgbc_params,\n    x=x_end_new,\n    y=y_fertilizers, \n    enc=le,\n    k_map=3,\n    model_name="XGBClassifier"\n)\n'

--- CV Summary for XGBClassifier ---

MAP@3 Scores for fold: [0.3294 0.3306 0.3304 0.3308 0.3305]

MAP@3 average: 0.3303

Standard deviation of MAP@3: 0.0005

## Third model: LGBMClassifier

In [47]:
# LightGBM classifier with default hyper-parameters (verbose=0 is passed through unchanged)

lgbmc = LGBMClassifier(verbose=0)

# We fit it on the training split

lgbmc.fit(X=x_train, y=y_train)

In [48]:
evaluator(lgbmc, x_val, y_val, le, 3, "LGBMClassifier")


LGBMClassifier initial MAP@3 Score: 0.3203


LGBMClassifier initial MAP@3 Score: 0.3230

In [49]:
'''
# We apply the function for CV

lgbmc_params = lgbmc.get_params()

lgbmc_cv_scores = cv_evaluator(
    model_class=LGBMClassifier,
    model_params=lgbmc_params,
    x=x_end_new,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="LGBMClassifier"
)
'''

'\n# We apply the function for CV\n\nlgbmc_params = lgbmc.get_params()\n\nlgbmc_cv_scores = cv_evaluator(\n    model_class=LGBMClassifier,\n    model_params=lgbmc_params,\n    x=x_end_new,\n    y=y_fertilizers, \n    enc=le,\n    k_map=3,\n    model_name="LGBMClassifier"\n)\n'

--- CV Summary for LGBMClassifier ---

MAP@3 Scores for fold: [0.3219 0.3236 0.3228 0.3225 0.3226]

MAP@3 average: 0.3227

Standard deviation of MAP@3: 0.0005

## Fourth model: CatBoostClassifier

In [50]:
# CatBoost classifier with default hyper-parameters and silent=True

cbc = CatBoostClassifier(silent=True)

# We fit it on the training split

cbc.fit(X=x_train, y=y_train)

<catboost.core.CatBoostClassifier at 0x7b73289e4dd0>

In [51]:
evaluator(cbc, x_val, y_val, le, 3, "CatBoostClassifier")


CatBoostClassifier initial MAP@3 Score: 0.3231


CatBoostClassifier initial MAP@3 Score: 0.3275

In [52]:
'''
# We apply the function for CV

cbc_params = cbc.get_all_params()

cbc_cv_scores = cv_evaluator(
    model_class=CatBoostClassifier,
    model_params=cbc_params,
    x=x_end_new,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="CatBoostClassifier"
)
'''

'\n# We apply the function for CV\n\ncbc_params = cbc.get_all_params()\n\ncbc_cv_scores = cv_evaluator(\n    model_class=CatBoostClassifier,\n    model_params=cbc_params,\n    x=x_end_new,\n    y=y_fertilizers, \n    enc=le,\n    k_map=3,\n    model_name="CatBoostClassifier"\n)\n'

## Ensemble Method

AdaBoostClassifier initial MAP@3 Score: 0.2783 - estimator=xgbc

BaggingClassifier initial MAP@3 Score: 0.3342 - estimator=xgbc

VotingClassifier initial MAP@3 Score: 0.3315 - estimators=[("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)]

StackingClassifier initial MAP@3 Score: 0.3303 - [("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], final_estimator=LogisticRegression()

### VotingClassifier

In [53]:
'''
# We create the model instance

voc = VotingClassifier(
    estimators=[("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], voting="soft"
)

# Train the model with the data

voc.fit(x_train, y_train)
'''

'\n# We create the model instance\n\nvoc = VotingClassifier(\n    estimators=[("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], voting="soft"\n)\n\n# Train the model with the data\n\nvoc.fit(x_train, y_train)\n'

In [54]:
#evaluator(voc, x_val, y_val, le, 3, "VotingClassifier")

VotingClassifier initial MAP@3 Score: 0.3299 - estimators=[("lrc", lrc), ("xgbc", xgbc), ("cbc", cbc)]

VotingClassifier initial MAP@3 Score: 0.3315 - estimators=[("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)]

In [55]:
# Disabled cell: cross-validation of the soft-voting ensemble, wrapped in a
# string literal so that it does not execute. `voc_params` is therefore NOT
# defined unless this cell is re-enabled.
'''
# We apply the function for CV

voc_params = {
    "estimators" : [("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], 
    "voting" : "soft"
}

voc_cv_scores = cv_evaluator(
    model_class=VotingClassifier,
    model_params=voc_params,
    x=x_end_new,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="VotingClassifier"
)
'''

'\n# We apply the function for CV\n\nvoc_params = {\n    "estimators" : [("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], \n    "voting" : "soft"\n}\n\nvoc_cv_scores = cv_evaluator(\n    model_class=VotingClassifier,\n    model_params=voc_params,\n    x=x_end_new,\n    y=y_fertilizers, \n    enc=le,\n    k_map=3,\n    model_name="VotingClassifier"\n)\n'

--- CV Summary for VotingClassifier ---

MAP@3 Scores for fold: [0.3308 0.3316 0.3321 0.3309 0.3312]

MAP@3 average: 0.3313

Standard deviation of MAP@3: 0.0005

### StackingClassifier

In [56]:
# Stacking ensemble: the three boosted models are the base learners and a
# logistic regression is the final estimator that combines their predictions

stacking_base_learners = [
    ("xgbc", xgbc),
    ("lgbmc", lgbmc),
    ("cbc", cbc),
]

stc = StackingClassifier(
    estimators=stacking_base_learners,
    final_estimator=LogisticRegression(),
)

# Fit the ensemble on the training split

stc.fit(x_train, y_train)

In [57]:
# Score the fitted stacking ensemble on the validation split; the helper reports
# the result as "<model name> initial MAP@3 Score" (3 is presumably the MAP cutoff - TODO confirm)
evaluator(stc, x_val, y_val, le, 3, "StackingClassifier")


StackingClassifier initial MAP@3 Score: 0.3287


StackingClassifier initial MAP@3 Score: 0.3303 - [("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], final_estimator=LogisticRegression()

In [58]:
# Disabled cell: cross-validation of the stacking ensemble, wrapped in a string
# literal so that it does not execute. The parameters handed to cv_evaluator are
# the stacking ones (stc_params); the voting parameters do not apply to
# StackingClassifier (it has no "voting" argument).
'''
# We apply the function for CV

stc_params = {
    "estimators" : [("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], 
    "final_estimator" : LogisticRegression()
}

stc_cv_scores = cv_evaluator(
    model_class=StackingClassifier,
    model_params=stc_params,
    x=x_end_new,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="StackingClassifier"
)
'''

'\n# We apply the function for CV\n\nstc_params = {\n    "estimators" : [("xgbc", xgbc), ("lgbmc", lgbmc), ("cbc", cbc)], \n    "final_estimator" : LogisticRegression()\n}\n\nstc_cv_scores = cv_evaluator(\n    model_class=StackingClassifier,\n    model_params=voc_params,\n    x=x_end_new,\n    y=y_fertilizers, \n    enc=le,\n    k_map=3,\n    model_name="StackingClassifier"\n)\n'

# Modeling

In [59]:
# The stacking ensemble is selected as the model used for the test predictions
final_model = stc

# Display the complete parameter set of the selected model
final_model.get_params()

{'cv': None,
 'estimators': [('xgbc',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=None, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric=None, feature_types=None,
                 gamma=None, grow_policy=None, importance_type=None,
                 interaction_constraints=None, learning_rate=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 multi_strategy=None, n_estimators=None, n_jobs=None,
                 num_parallel_tree=None, objective='multi:softprob', ...)),
  ('lgbmc', LGBMClassifier(verbose=0)),
  ('cbc', <catboost.core.CatBoostClassifier at 0x7b73289e4dd0>)],
 'final_estimator__C': 1.0,
 'final_estimator__class_w

In [60]:
# Disabled cell: permutation importance of the final model on the validation
# split, wrapped in a string literal so that it does not execute.
# The feature names are taken from x_val itself: permutation_importance returns
# one value per column of the frame it is given, so the names must come from
# that same frame to guarantee matching length and order.
'''
# Permutation Importance

perm_importance = permutation_importance(final_model, x_val, y_val, n_repeats=30, random_state=42, n_jobs=-1)
perm_importance_df = pd.DataFrame({
    'Feature': x_val.columns,
    'Importance Mean': perm_importance.importances_mean,
    'Importance Std': perm_importance.importances_std
})
print("\nPermutation Importance:\n")
print(perm_importance_df.sort_values(by='Importance Mean', ascending=False))
'''

'\n# Permutation Importance\n\nperm_importance = permutation_importance(final_model, x_val, y_val, n_repeats=30, random_state=42, n_jobs=-1)\nperm_importance_df = pd.DataFrame({\n    \'Feature\': x_end.columns,\n    \'Importance Mean\': perm_importance.importances_mean,\n    \'Importance Std\': perm_importance.importances_std\n})\nprint("\nPermutation Importance:\n")\nprint(perm_importance_df.sort_values(by=\'Importance Mean\', ascending=False))\n'

# Test data

In [61]:
# We load the test data and the sample submission.
# Unlike the training load, no index_col is given, so "id" stays a regular
# column of df_test (it is dropped from the features and reused for the submission).

df_test = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")

fertilizer_sample = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")

In [62]:
# We check the shape of the test set as (rows, columns)

print(f"Shape: {df_test.shape}")

Shape: (250000, 9)


In [63]:
# Preview the first rows of the raw test data
df_test.head()

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14


In [64]:
# Column dtypes, non-null counts and memory usage of the raw test data
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           250000 non-null  int64 
 1   Temparature  250000 non-null  int64 
 2   Humidity     250000 non-null  int64 
 3   Moisture     250000 non-null  int64 
 4   Soil Type    250000 non-null  object
 5   Crop Type    250000 non-null  object
 6   Nitrogen     250000 non-null  int64 
 7   Potassium    250000 non-null  int64 
 8   Phosphorous  250000 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 17.2+ MB


In [65]:
# Summary statistics of the numeric columns, transposed to one row per column
df_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,250000.0,874999.5,72168.927986,750000.0,812499.75,874999.5,937499.25,999999.0
Temparature,250000.0,31.491648,4.024093,25.0,28.0,31.0,35.0,38.0
Humidity,250000.0,61.04512,6.636372,50.0,55.0,61.0,67.0,72.0
Moisture,250000.0,45.190444,11.793167,25.0,35.0,45.0,55.0,65.0
Nitrogen,250000.0,23.139612,11.215956,4.0,13.0,23.0,33.0,42.0
Potassium,250000.0,9.487764,5.76686,0.0,4.0,10.0,14.0,19.0
Phosphorous,250000.0,21.12206,12.38087,0.0,10.0,21.0,32.0,42.0


In [66]:
# Summary of the non-numeric columns (count, unique values, most frequent value)
df_test.describe(exclude = np.number)

Unnamed: 0,Soil Type,Crop Type
count,250000,250000
unique,5,11
top,Sandy,Paddy
freq,52323,28332


## Data wrangling & Preprocessing

In [67]:
# Duplicate check: flag every row that repeats an earlier one, then report how
# many rows were inspected and how many of them are duplicates

duplicate_flags = df_test.duplicated()

print(f"Length: {len(duplicate_flags)}")

print(f"Duplicates: {duplicate_flags.sum()}")

Length: 250000
Duplicates: 0


In [68]:
# We confirm that there are no null values: per column, the number of nulls
# and the share of rows they represent (in percent)

null_counts_test = df_test.isnull().sum()

null_values_test = pd.DataFrame({
    "Null Data": null_counts_test,
    "Percentage": null_counts_test / len(df_test) * 100,
})

null_values_test

Unnamed: 0,Null Data,Percentage
id,0,0.0
Temparature,0,0.0
Humidity,0,0.0
Moisture,0,0.0
Soil Type,0,0.0
Crop Type,0,0.0
Nitrogen,0,0.0
Potassium,0,0.0
Phosphorous,0,0.0


In [69]:
# The row identifier carries no predictive information, so it is left out of
# the feature frame (df_test keeps it for the submission file)

test_new = df_test.drop(columns="id")

In [70]:
# Binned versions of the climate and element variables. Each entry gives the
# source column, the bin edges and the bin labels; the result is stored in
# "<column>_Bins". The edges/labels are the ones defined earlier in the notebook.

test_bin_specs = [
    # climate variables
    ("Temparature", bins_tem, labels_climate),
    ("Humidity", bins_hum, labels_climate),
    ("Moisture", bins_moi, labels_climate),
    # element variables
    ("Nitrogen", bins_n, labels_elements),
    ("Potassium", bins_k, labels_elements),
    ("Phosphorous", bins_p, labels_elements),
]

for bin_source, bin_edges, bin_labels in test_bin_specs:
    test_new[f"{bin_source}_Bins"] = pd.cut(test_new[bin_source], bin_edges, labels=bin_labels)

In [71]:
# Disabled cell: log / square / cube-root transforms of the six numeric
# variables, wrapped in a string literal so that it does not execute. None of
# the *_log, *_sq or *_cubert columns exist in test_new unless it is re-enabled.
'''
# Polynomial / transforms

test_new["tem_log"] = np.log1p(test_new["Temparature"])
test_new["tem_sq"]  = test_new["Temparature"] ** 2
test_new["tem_cubert"] = np.cbrt(test_new["Temparature"])

test_new["hum_log"] = np.log1p(test_new["Humidity"])
test_new["hum_sq"]  = test_new["Humidity"] ** 2
test_new["hum_cubert"] = np.cbrt(test_new["Humidity"])

test_new["moi_log"] = np.log1p(test_new["Moisture"])
test_new["moi_sq"]  = test_new["Moisture"] ** 2
test_new["moi_cubert"] = np.cbrt(test_new["Moisture"])

test_new["n_log"] = np.log1p(test_new["Nitrogen"])
test_new["n_sq"]  = test_new["Nitrogen"] ** 2
test_new["n_cubert"] = np.cbrt(test_new["Nitrogen"])

test_new["k_log"] = np.log1p(test_new["Potassium"])
test_new["k_sq"]  = test_new["Potassium"] ** 2
test_new["k_cubert"] = np.cbrt(test_new["Potassium"])

test_new["p_log"] = np.log1p(test_new["Phosphorous"])
test_new["p_sq"]  = test_new["Phosphorous"] ** 2
test_new["p_cubert"] = np.cbrt(test_new["Phosphorous"])
'''

'\n# Polynomial / transforms\n\ntest_new["tem_log"] = np.log1p(test_new["Temparature"])\ntest_new["tem_sq"]  = test_new["Temparature"] ** 2\ntest_new["tem_cubert"] = np.cbrt(test_new["Temparature"])\n\ntest_new["hum_log"] = np.log1p(test_new["Humidity"])\ntest_new["hum_sq"]  = test_new["Humidity"] ** 2\ntest_new["hum_cubert"] = np.cbrt(test_new["Humidity"])\n\ntest_new["moi_log"] = np.log1p(test_new["Moisture"])\ntest_new["moi_sq"]  = test_new["Moisture"] ** 2\ntest_new["moi_cubert"] = np.cbrt(test_new["Moisture"])\n\ntest_new["n_log"] = np.log1p(test_new["Nitrogen"])\ntest_new["n_sq"]  = test_new["Nitrogen"] ** 2\ntest_new["n_cubert"] = np.cbrt(test_new["Nitrogen"])\n\ntest_new["k_log"] = np.log1p(test_new["Potassium"])\ntest_new["k_sq"]  = test_new["Potassium"] ** 2\ntest_new["k_cubert"] = np.cbrt(test_new["Potassium"])\n\ntest_new["p_log"] = np.log1p(test_new["Phosphorous"])\ntest_new["p_sq"]  = test_new["Phosphorous"] ** 2\ntest_new["p_cubert"] = np.cbrt(test_new["Phosphorous"])\n'

In [72]:
# We encode the categorical variables with the existing encoder.
# Only transform() is called: the test set must be encoded with the same
# category-to-column mapping the model was trained on, so the encoder is never
# refitted here (assumes `encoder` was fitted on the training "Soil Type" /
# "Crop Type" columns earlier in the notebook).

test_cat_cols = test_new[["Soil Type", "Crop Type"]]
test_rest_cols = test_new.drop(columns=["Soil Type", "Crop Type"])
test_cat_enc = encoder.transform(test_cat_cols)

# Column-wise join; both frames share the index of test_new
test_encoded = pd.concat([test_rest_cols, test_cat_enc], axis=1)

In [73]:
# Apply the notebook's mapper helper to every binned column with its matching
# category order (climate bins first, then element bins)
test_bin_orders = [
    ("Temparature_Bins", climate_order),
    ("Humidity_Bins", climate_order),
    ("Moisture_Bins", climate_order),
    ("Nitrogen_Bins", elements_order),
    ("Potassium_Bins", elements_order),
    ("Phosphorous_Bins", elements_order),
]

for bins_column, bins_order in test_bin_orders:
    mapper(test_encoded, bins_column, bins_order)

Temparature_Bins
0.0    89313
2.0    88986
1.0    71701
Name: count, dtype: int64
Humidity_Bins
1.0    86642
0.0    86495
2.0    76863
Name: count, dtype: int64
Moisture_Bins
1.0    92250
2.0    86522
0.0    71228
Name: count, dtype: int64
Nitrogen_Bins
1.0    97364
2.0    77401
0.0    75235
Name: count, dtype: int64
Potassium_Bins
2.0    87702
0.0    87318
1.0    74980
Name: count, dtype: int64
Phosphorous_Bins
0.0    91568
1.0    88700
2.0    69732
Name: count, dtype: int64


In [74]:
# Check the columns and dtypes after encoding and bin mapping
test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 28 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Temparature            250000 non-null  int64  
 1   Humidity               250000 non-null  int64  
 2   Moisture               250000 non-null  int64  
 3   Nitrogen               250000 non-null  int64  
 4   Potassium              250000 non-null  int64  
 5   Phosphorous            250000 non-null  int64  
 6   Temparature_Bins       250000 non-null  float64
 7   Humidity_Bins          250000 non-null  float64
 8   Moisture_Bins          250000 non-null  float64
 9   Nitrogen_Bins          250000 non-null  float64
 10  Potassium_Bins         250000 non-null  float64
 11  Phosphorous_Bins       250000 non-null  float64
 12  Soil Type_Black        250000 non-null  float64
 13  Soil Type_Clayey       250000 non-null  float64
 14  Soil Type_Loamy        250000 non-nu

In [75]:
# Disabled cell: scaling variant that also includes the squared features,
# wrapped in a string literal so that it does not execute. It depends on the
# *_sq columns, which are only created by the (also disabled) transforms cell.
'''
# Numerical variables to scale

test_numeric = test_encoded[[
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "tem_sq",
    "hum_sq",
    "moi_sq",
    "n_sq",
    "k_sq",
    "p_sq"
]]
test_scale_num = scaler.transform(test_numeric)
test_rest = test_encoded.drop(columns=[
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "tem_sq",
    "hum_sq",
    "moi_sq",
    "n_sq",
    "k_sq",
    "p_sq"
])
test_end = pd.concat([test_scale_num, test_rest], axis=1)
'''

'\n# Numerical variables to scale\n\ntest_numeric = test_encoded[[\n    "Temparature",\n    "Humidity",\n    "Moisture",\n    "Nitrogen",\n    "Potassium",\n    "Phosphorous",\n    "tem_sq",\n    "hum_sq",\n    "moi_sq",\n    "n_sq",\n    "k_sq",\n    "p_sq"\n]]\ntest_scale_num = scaler.transform(test_numeric)\ntest_rest = test_encoded.drop(columns=[\n    "Temparature",\n    "Humidity",\n    "Moisture",\n    "Nitrogen",\n    "Potassium",\n    "Phosphorous",\n    "tem_sq",\n    "hum_sq",\n    "moi_sq",\n    "n_sq",\n    "k_sq",\n    "p_sq"\n])\ntest_end = pd.concat([test_scale_num, test_rest], axis=1)\n'

In [76]:
# Scale the six raw numeric variables with the existing scaler (transform only,
# no refit) and put them back next to the untouched columns

test_scaled_columns = [
    "Temparature", "Humidity", "Moisture",
    "Nitrogen", "Potassium", "Phosphorous",
]

test_numeric = test_encoded[test_scaled_columns]
test_scale_num = scaler.transform(test_numeric)
test_rest = test_encoded.drop(columns=test_scaled_columns)

test_end = pd.concat([test_scale_num, test_rest], axis=1)

In [77]:
# Summary statistics after scaling, transposed to one row per column
test_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,250000.0,0.499358,0.309546,0.0,0.230769,0.461538,0.769231,1.0
Humidity,250000.0,0.502051,0.301653,0.0,0.227273,0.5,0.772727,1.0
Moisture,250000.0,0.504761,0.294829,0.0,0.25,0.5,0.75,1.0
Nitrogen,250000.0,0.503674,0.295157,0.0,0.236842,0.5,0.763158,1.0
Potassium,250000.0,0.499356,0.303519,0.0,0.210526,0.526316,0.736842,1.0
Phosphorous,250000.0,0.502906,0.294783,0.0,0.238095,0.5,0.761905,1.0
Temparature_Bins,250000.0,0.998692,0.84451,0.0,0.0,1.0,2.0,2.0
Humidity_Bins,250000.0,0.961472,0.807434,0.0,0.0,1.0,2.0,2.0
Moisture_Bins,250000.0,1.061176,0.791997,0.0,0.0,1.0,2.0,2.0
Nitrogen_Bins,250000.0,1.008664,0.781327,0.0,0.0,1.0,2.0,2.0


In [78]:
# One-hot columns that are compressed into PCA components
test_onehot_columns = [
    'Soil Type_Black', 'Soil Type_Clayey', 'Soil Type_Loamy', 'Soil Type_Red',
    'Soil Type_Sandy', 'Crop Type_Barley', 'Crop Type_Cotton',
    'Crop Type_Ground Nuts', 'Crop Type_Maize', 'Crop Type_Millets',
    'Crop Type_Oil seeds', 'Crop Type_Paddy', 'Crop Type_Pulses',
    'Crop Type_Sugarcane', 'Crop Type_Tobacco', 'Crop Type_Wheat',
]

test_x_comp = test_end[test_onehot_columns]

# Project onto the existing components with transform() only. Refitting the PCA
# on the test set would produce components that need not match (in direction,
# sign or order) the ones the model saw during training
# (assumes `pca` was fitted on the training one-hot columns earlier in the notebook).
test_x_pca = pca.transform(test_x_comp)

test_x_keep = test_end.drop(columns=test_onehot_columns)

# Final test features: remaining columns followed by the PCA components
test_end_new = pd.concat([test_x_keep, test_x_pca], axis=1)

In [79]:
# We remove the variables that we will not use

#test_end_new = test_end.drop(columns=[])

In [80]:
# Check the final feature set that is passed to the model
test_end_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Temparature       250000 non-null  float64
 1   Humidity          250000 non-null  float64
 2   Moisture          250000 non-null  float64
 3   Nitrogen          250000 non-null  float64
 4   Potassium         250000 non-null  float64
 5   Phosphorous       250000 non-null  float64
 6   Temparature_Bins  250000 non-null  float64
 7   Humidity_Bins     250000 non-null  float64
 8   Moisture_Bins     250000 non-null  float64
 9   Nitrogen_Bins     250000 non-null  float64
 10  Potassium_Bins    250000 non-null  float64
 11  Phosphorous_Bins  250000 non-null  float64
 12  pca0              250000 non-null  float64
 13  pca1              250000 non-null  float64
dtypes: float64(14)
memory usage: 26.7 MB


## Apply the Model & Submission File

In [81]:
# We apply the trained model: one probability per class for every test row.
# Column j of the result corresponds to final_model.classes_[j].

test_pred_prob = final_model.predict_proba(test_end_new)

# Column positions of the three most probable classes per row, best first
test_top_3 = np.argsort(-test_pred_prob, axis=1)[:, :3]

# Column position -> encoded label (via the model's own class order) -> fertilizer
# name. The whole matrix is decoded in one call instead of once per row, and an
# unknown label raises in inverse_transform instead of being dropped silently,
# so every row always carries exactly three names.
test_top_3_labels = final_model.classes_[test_top_3]
top_3_fertilizer_names = le.inverse_transform(
    test_top_3_labels.ravel()
).reshape(test_top_3_labels.shape)

# Submission format: the three names separated by a single space
formatted_predictions = [" ".join(names) for names in top_3_fertilizer_names]

In [82]:
# We review the result: the number of prediction strings should equal the
# number of test rows

print("Total predictions: ", len(formatted_predictions), "\n")

Total predictions:  250000 



In [83]:
# Submission frame: the test ids next to their space-separated top-3 predictions

submission_columns = {
    "id": df_test["id"],
    "Fertilizer Name": formatted_predictions,
}
fertilizer_submission = pd.DataFrame(submission_columns)

fertilizer_submission.head()

Unnamed: 0,id,Fertilizer Name
0,750000,28-28 10-26-26 DAP
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 10-26-26 Urea
3,750003,14-35-14 DAP Urea
4,750004,20-20 10-26-26 28-28


In [84]:
# Sanity check against the sample file: both frames should have the same shape

shape_report = (
    f"Shape Sample Submission: {fertilizer_sample.shape} "
    f"\nShape Fertilizer Submission: {fertilizer_submission.shape}"
)
print(shape_report)

# Preview of the expected submission layout
print("\n", fertilizer_sample.head())

Shape Sample Submission: (250000, 2) 
Shape Fertilizer Submission: (250000, 2)

        id         Fertilizer Name
0  750000  14-35-14 10-26-26 Urea
1  750001  14-35-14 10-26-26 Urea
2  750002  14-35-14 10-26-26 Urea
3  750003  14-35-14 10-26-26 Urea
4  750004  14-35-14 10-26-26 Urea


In [85]:
# We write the submission to a csv file in the working directory;
# index=False keeps the row index out of the file so only the two columns are written

fertilizer_submission.to_csv("submission.csv", index=False)