# Predicting Optimal Fertilizers

In [1]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

import warnings
warnings.filterwarnings("ignore")

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv


Private notebook: fertilizer-predic-playground-series-s5-e6, Version 1 (full EDA)

Public notebook: https://www.kaggle.com/code/les1781/optimal-fertilizer-predict-playground-series-s5-e6

# Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import (
    LabelEncoder,    
    OneHotEncoder,
    StandardScaler
)
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    confusion_matrix, 
    classification_report, 
    f1_score, 
    accuracy_score,
    make_scorer
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.inspection import permutation_importance

# Initial analysis

In [3]:
# Load the training data, using the "id" column as the row index

fertilizers_train = pd.read_csv(
    "/kaggle/input/playground-series-s5e6/train.csv",
    index_col="id",
)

In [4]:
fertilizers_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,750000.0,31.503565,4.025574,25.0,28.0,32.0,35.0,38.0
Humidity,750000.0,61.038912,6.647695,50.0,55.0,61.0,67.0,72.0
Moisture,750000.0,45.184147,11.794594,25.0,35.0,45.0,55.0,65.0
Nitrogen,750000.0,23.093808,11.216125,4.0,13.0,23.0,33.0,42.0
Potassium,750000.0,9.478296,5.765622,0.0,4.0,9.0,14.0,19.0
Phosphorous,750000.0,21.073227,12.346831,0.0,10.0,21.0,32.0,42.0


In [5]:
fertilizers_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Temparature      750000 non-null  int64 
 1   Humidity         750000 non-null  int64 
 2   Moisture         750000 non-null  int64 
 3   Soil Type        750000 non-null  object
 4   Crop Type        750000 non-null  object
 5   Nitrogen         750000 non-null  int64 
 6   Potassium        750000 non-null  int64 
 7   Phosphorous      750000 non-null  int64 
 8   Fertilizer Name  750000 non-null  object
dtypes: int64(6), object(3)
memory usage: 57.2+ MB


# Data wrangling

In [6]:
# We make a copy of the original dataset

fertilizers_new = fertilizers_train.copy()

In [7]:
# We confirm that there are no null values (count and percentage per column)

null_values = (
    fertilizers_new.isnull().sum()
    .to_frame(name="Null Data")
    .assign(Percentage=lambda counts: counts["Null Data"] / len(fertilizers_new) * 100)
)

null_values

Unnamed: 0,Null Data,Percentage
Temparature,0,0.0
Humidity,0,0.0
Moisture,0,0.0
Soil Type,0,0.0
Crop Type,0,0.0
Nitrogen,0,0.0
Potassium,0,0.0
Phosphorous,0,0.0
Fertilizer Name,0,0.0


In [8]:
# We check for duplicated rows: the mask is computed once and reported twice
# (its length equals the number of rows, its sum the number of duplicates)

duplicate_mask = fertilizers_new.duplicated()
print(f"Length: {len(duplicate_mask)}")
print(f"Duplicates: {duplicate_mask.sum()}")

Length: 750000
Duplicates: 0


In [9]:
'''
eval_out = sns.PairGrid(fertilizers_new, palette=sns.light_palette("seagreen"))
eval_out.map(sns.boxplot)
eval_out.tick_params(axis="both", labelbottom=False)
'''

'\neval_out = sns.PairGrid(fertilizers_new, palette=sns.light_palette("seagreen"))\neval_out.map(sns.boxplot)\neval_out.tick_params(axis="both", labelbottom=False)\n'

# Data Preprocessing

In [10]:
fertilizers_end = fertilizers_new.copy()

## Feature Engineering

First approach with original variables only.

## Encoding

In [11]:
# We apply LabelEncoder to the target variable.
# The encoder is fitted on the untouched copy (fertilizers_new) instead of the
# column that is being overwritten: re-running this cell therefore never re-encodes
# already encoded integers, and le.classes_ keeps holding the fertilizer names
# that are needed later to decode predictions.

le = LabelEncoder()

fertilizers_end["Fertilizer Name"] = le.fit_transform(fertilizers_new["Fertilizer Name"])

In [12]:
le_values = le.classes_

In [13]:
# We create a df with the encoded categorical variables

cat_cols = fertilizers_end[["Soil Type", "Crop Type"]]
rest_cols = fertilizers_end.drop(columns=["Soil Type", "Crop Type"])

# `sparse_output` replaces the deprecated `sparse` argument (renamed in
# scikit-learn 1.2, the same release that introduced `set_output`).
# Dense output is required so that set_output(transform="pandas") can return
# a DataFrame with one named column per category.
encoder = OneHotEncoder(sparse_output=False, drop="if_binary").set_output(transform="pandas")
cat_enc = encoder.fit_transform(cat_cols)

# Both frames share the original index, so a column-wise concat lines rows up
df_encoded = pd.concat([rest_cols, cat_enc], axis=1)

In [14]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Temparature            750000 non-null  int64  
 1   Humidity               750000 non-null  int64  
 2   Moisture               750000 non-null  int64  
 3   Nitrogen               750000 non-null  int64  
 4   Potassium              750000 non-null  int64  
 5   Phosphorous            750000 non-null  int64  
 6   Fertilizer Name        750000 non-null  int64  
 7   Soil Type_Black        750000 non-null  float64
 8   Soil Type_Clayey       750000 non-null  float64
 9   Soil Type_Loamy        750000 non-null  float64
 10  Soil Type_Red          750000 non-null  float64
 11  Soil Type_Sandy        750000 non-null  float64
 12  Crop Type_Barley       750000 non-null  float64
 13  Crop Type_Cotton       750000 non-null  float64
 14  Crop Type_Ground Nuts  750000 non-null  f

In [15]:
#df_encoded.corr(numeric_only=True).round(2)

## Scaling

In [16]:
df_encoded.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,750000.0,31.503565,4.025574,25.0,28.0,32.0,35.0,38.0
Humidity,750000.0,61.038912,6.647695,50.0,55.0,61.0,67.0,72.0
Moisture,750000.0,45.184147,11.794594,25.0,35.0,45.0,55.0,65.0
Nitrogen,750000.0,23.093808,11.216125,4.0,13.0,23.0,33.0,42.0
Potassium,750000.0,9.478296,5.765622,0.0,4.0,9.0,14.0,19.0
Phosphorous,750000.0,21.073227,12.346831,0.0,10.0,21.0,32.0,42.0
Fertilizer Name,750000.0,2.859791,1.967025,0.0,1.0,3.0,4.0,6.0
Soil Type_Black,750000.0,0.201275,0.400953,0.0,0.0,0.0,0.0,1.0
Soil Type_Clayey,750000.0,0.197843,0.398373,0.0,0.0,0.0,0.0,1.0
Soil Type_Loamy,750000.0,0.194467,0.39579,0.0,0.0,0.0,0.0,1.0


In [17]:
# Split the encoded frame into the target vector and the feature matrix

target_column = "Fertilizer Name"
y_fertilizers = df_encoded[target_column]
x_fertilizers = df_encoded.drop(columns=[target_column])

In [18]:
# Numerical variables to scale (listed once and reused below)

numeric_columns = [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
]
fertilizer_numeric = x_fertilizers[numeric_columns]

# Standardise the numeric features; pandas output keeps column names and index.
# NOTE(review): the scaler is fitted on every training row, including the rows
# that are later held out for validation.
scaler = StandardScaler().set_output(transform="pandas")
scale_num = scaler.fit_transform(fertilizer_numeric)

# The remaining (one-hot encoded) variables are passed through unchanged

fertilizer_rest = x_fertilizers.drop(columns=numeric_columns)

# Scaled numeric columns first, then the remaining columns

x_end = pd.concat([scale_num, fertilizer_rest], axis=1)

In [19]:
x_end.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Temparature            750000 non-null  float64
 1   Humidity               750000 non-null  float64
 2   Moisture               750000 non-null  float64
 3   Nitrogen               750000 non-null  float64
 4   Potassium              750000 non-null  float64
 5   Phosphorous            750000 non-null  float64
 6   Soil Type_Black        750000 non-null  float64
 7   Soil Type_Clayey       750000 non-null  float64
 8   Soil Type_Loamy        750000 non-null  float64
 9   Soil Type_Red          750000 non-null  float64
 10  Soil Type_Sandy        750000 non-null  float64
 11  Crop Type_Barley       750000 non-null  float64
 12  Crop Type_Cotton       750000 non-null  float64
 13  Crop Type_Ground Nuts  750000 non-null  float64
 14  Crop Type_Maize        750000 non-null  f

In [20]:
x_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,750000.0,-2.430625e-16,1.000001,-1.615563,-0.870328,0.12332,0.868556,1.613792
Humidity,750000.0,-5.286627e-16,1.000001,-1.660563,-0.908422,-0.005853,0.896716,1.648856
Moisture,750000.0,2.707263e-16,1.000001,-1.711306,-0.863459,-0.015613,0.832234,1.68008
Nitrogen,750000.0,3.8653520000000004e-17,1.000001,-1.702354,-0.899938,-0.008364,0.88321,1.685627
Potassium,750000.0,-5.2220150000000005e-17,1.000001,-1.643934,-0.950166,-0.082957,0.784253,1.651463
Phosphorous,750000.0,-2.046363e-18,1.000001,-1.706773,-0.896848,-0.005931,0.884987,1.694912
Soil Type_Black,750000.0,0.2012747,0.400953,0.0,0.0,0.0,0.0,1.0
Soil Type_Clayey,750000.0,0.1978427,0.398373,0.0,0.0,0.0,0.0,1.0
Soil Type_Loamy,750000.0,0.1944667,0.39579,0.0,0.0,0.0,0.0,1.0
Soil Type_Red,750000.0,0.1974693,0.39809,0.0,0.0,0.0,0.0,1.0


## Feature Selection

In [21]:
'''
fertilizers_scores = mutual_info_classif(x_end, y_fertilizers)
fertilizers_scores = pd.Series(fertilizers_scores, name="Fertilizers MI Scores", index=x_end.columns)
fertilizers_scores = fertilizers_scores.sort_values(ascending=False)
fertilizers_scores
'''

'\nfertilizers_scores = mutual_info_classif(x_end, y_fertilizers)\nfertilizers_scores = pd.Series(fertilizers_scores, name="Fertilizers MI Scores", index=x_end.columns)\nfertilizers_scores = fertilizers_scores.sort_values(ascending=False)\nfertilizers_scores\n'

First feature analysis:

    Moisture                 0.004975
    Soil Type_Black          0.004838
    Phosphorous              0.004139
    Soil Type_Clayey         0.003320
    Soil Type_Loamy          0.003201
    Nitrogen                 0.002672
    Soil Type_Sandy          0.002432
    Soil Type_Red            0.002427
    Crop Type_Oil seeds      0.002218
    Potassium                0.001786
    Temparature              0.001252
    Crop Type_Paddy          0.001237
    Crop Type_Tobacco        0.001203
    Crop Type_Pulses         0.001140
    Crop Type_Maize          0.001094
    Crop Type_Wheat          0.000915
    Crop Type_Barley         0.000832
    Crop Type_Cotton         0.000788
    Humidity                 0.000540
    Crop Type_Ground Nuts    0.000466
    Crop Type_Sugarcane      0.000300
    Crop Type_Millets        0.000000

In [22]:
'''
scores = fertilizers_scores.sort_values(ascending=True)
width = np.arange(len(fertilizers_scores))
ticks = list(fertilizers_scores.index)
plt.barh(width, fertilizers_scores)
plt.yticks(width, ticks)
plt.title("Mutual Information Scores")
plt.figure(dpi=100, figsize=(8, 5))
plt.show()
'''

'\nscores = fertilizers_scores.sort_values(ascending=True)\nwidth = np.arange(len(fertilizers_scores))\nticks = list(fertilizers_scores.index)\nplt.barh(width, fertilizers_scores)\nplt.yticks(width, ticks)\nplt.title("Mutual Information Scores")\nplt.figure(dpi=100, figsize=(8, 5))\nplt.show()\n'

In [23]:
#x_end = x_end.drop(columns=[])

# Model Selection

In [24]:
# Hold out 20% of the rows for validation (fixed seed for a repeatable split)

x_train, x_val, y_train, y_val = train_test_split(
    x_end,
    y_fertilizers,
    test_size=0.2,
    random_state=42,
)

In [25]:
# We review the balance of the target variable in the training split:
# first row = encoded class labels, second row = number of samples per class

class_labels, class_counts = np.unique(y_train, return_counts=True)
values_counts = np.asarray((class_labels, class_counts))

print(values_counts)

[[    0     1     2     3     4     5     6]
 [91046 91797 89689 88879 88774 75712 74103]]


In [26]:
# Helper functions to calculate AP@K for one sample and, from it, MAP@K over all samples

def apk(actual, predicted, k):
    """Compute the average precision at k (AP@k) for a single sample.

    Parameters
    ----------
    actual : list
        The relevant (true) labels for the sample.
    predicted : sequence
        Predicted labels ordered from most to least likely.
    k : int
        Maximum number of predictions that are taken into account.

    Returns
    -------
    float
        Sum of the precision values at every rank where a new relevant label
        is hit, divided by ``min(len(actual), k)``. Returns 0.0 when ``actual``
        is empty or ``k`` is not positive.

    Notes
    -----
    With exactly one relevant label the divisor is 1, so the result equals the
    un-normalised score. The normalisation only matters when several labels
    are relevant; without it the value could exceed 1.
    """
    # Nothing can be hit without relevant labels or with an empty cut-off.
    if not actual or k <= 0:
        return 0.0

    # Slicing is a no-op when fewer than k predictions were supplied.
    top_k = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for rank, label in enumerate(top_k):
        # A label repeated within the top-k is only credited at its first rank.
        if label in actual and label not in top_k[:rank]:
            num_hits += 1.0
            score += num_hits / (rank + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k):
    """Return the mean of the per-sample AP@k values.

    ``actual`` and ``predicted`` are iterated in parallel; each pair is scored
    with ``apk`` and the scores are averaged with ``np.mean``.
    """
    per_sample_scores = []
    for true_labels, ranked_labels in zip(actual, predicted):
        per_sample_scores.append(apk(true_labels, ranked_labels, k))

    return np.mean(per_sample_scores)

In [27]:
# Function for an initial evaluation of the model

def evaluator(model, val_x, val_y, enc, k, model_name):
    """Print the MAP@k of a fitted classifier on a hold-out set.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    val_x : validation features passed to ``model.predict_proba``.
    val_y : iterable of encoded true labels for ``val_x``.
    enc : fitted label encoder; its ``inverse_transform`` maps encoded labels
        back to class names.
    k : int
        Number of top predictions kept per row and cut-off used for MAP@k.
    model_name : str
        Name shown in the printed summary line.

    Returns
    -------
    None
        The score is only printed.

    Notes
    -----
    Column ``j`` of ``predict_proba`` is decoded as encoded label ``j``
    (assumes the model's class order matches the encoder — TODO confirm for
    models whose ``classes_`` are not ``0..n-1``).
    """
    y_pred_prob = model.predict_proba(val_x)

    # Indices of the k most probable classes per row, most probable first.
    # (Previously the cut-off was hard-coded to 3 and `k` was ignored here.)
    top_k_indices_val = np.argsort(-y_pred_prob, axis=1)[:, :k]

    # Decode every row with a single inverse_transform call instead of one
    # call per row, then restore the (n_rows, k) layout.
    predicted_val = (
        enc.inverse_transform(top_k_indices_val.ravel())
        .reshape(top_k_indices_val.shape)
        .tolist()
    )

    # mapk expects one list of relevant labels per sample.
    actual_val = [[name] for name in enc.inverse_transform(np.asarray(val_y))]

    map_k_score_val = mapk(actual_val, predicted_val, k=k)

    print(f"\n{model_name} initial MAP@{k} Score: {map_k_score_val:.4f}")

In [28]:
# We create the StratifiedKFold object used for cross-validation

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [29]:
# Function to evaluate models with a cross-validation method

def cv_evaluator(model_class, model_params, x, y, enc, k_map, model_name, cv=None):
    """Cross-validate a classifier and report MAP@k per fold and overall.

    A fresh ``model_class(**model_params)`` is fitted on the training part of
    every fold and scored on the held-out part.

    Parameters
    ----------
    model_class : callable
        Estimator class; instances must provide ``fit`` and ``predict_proba``.
    model_params : dict
        Keyword arguments used to build the estimator for every fold.
    x, y : pandas objects
        Features and encoded target; rows are selected with ``.iloc``.
    enc : fitted label encoder exposing ``classes_`` and ``inverse_transform``.
    k_map : int
        Number of top predictions kept per row and cut-off used for MAP@k.
    model_name : str
        Name shown in the printed summary.
    cv : cross-validation splitter, optional
        Object providing ``split(x, y)`` and ``get_n_splits(x, y)``. When
        omitted, the notebook-level ``skf`` splitter is used.

    Returns
    -------
    list of float
        The MAP@k score of each fold, in fold order.
    """
    # Fall back to the notebook-level splitter when none is supplied, and take
    # the fold count from the splitter itself so the log cannot disagree with it.
    splitter = skf if cv is None else cv
    total_folds = splitter.get_n_splits(x, y)
    n_known_classes = len(enc.classes_)

    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(x, y)):

        print(f"\nFold {fold + 1}/{total_folds}")
        x_train_fold, x_val_fold = x.iloc[train_idx], x.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model = model_class(**model_params)
        model.fit(x_train_fold, y_train_fold)

        y_pred_prob = model.predict_proba(x_val_fold)
        # Indices of the k_map most probable classes per row, best first.
        top_k_indices_val = np.argsort(-y_pred_prob, axis=1)[:, :k_map]

        # Indices the encoder does not know are dropped (best effort, as before).
        # The remaining ones are non-negative and in range, so indexing
        # enc.classes_ directly gives the same names as inverse_transform
        # without paying its validation cost once per row.
        predicted_fold = [
            list(enc.classes_[row_indices[row_indices < n_known_classes]])
            for row_indices in top_k_indices_val
        ]

        # One vectorised decode of the true labels; mapk expects a list of
        # relevant labels per sample.
        actual_fold = [
            [name] for name in enc.inverse_transform(np.asarray(y_val_fold))
        ]

        fold_map_score = mapk(actual_fold, predicted_fold, k=k_map)
        fold_scores.append(fold_map_score)
        print(f"MAP@{k_map} Fold {fold + 1}: {fold_map_score:.4f}")

    print(f"\n--- CV Summary for {model_name} ---\n")
    print(f"MAP@{k_map} Scores for fold: {np.round(fold_scores, 4)}")
    print(f"MAP@{k_map} average: {np.mean(fold_scores):.4f}")
    print(f"Standard deviation of MAP@{k_map}: {np.std(fold_scores):.4f}")

    return fold_scores

## First model: LogisticRegression

In [30]:
# We create the model instance

#lrc = LogisticRegression(multi_class="multinomial")

# Train the model with the data

#lrc.fit(x_train, y_train)

In [31]:
#evaluator(lrc, x_val, y_val, le, 3, "LogisticRegression")

LogisticRegression initial MAP@3 Score: 0.2870

In [32]:
'''
# We apply the function for CV

lrc_params = {"multi_class": "multinomial", "random_state": 42}

lrc_cv_scores = cv_evaluator(
    model_class=LogisticRegression,
    model_params=lrc_params,
    x=x_end,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="Logistic Regression"
)
'''

'\n# We apply the function for CV\n\nlrc_params = {"multi_class": "multinomial", "random_state": 42}\n\nlrc_cv_scores = cv_evaluator(\n    model_class=LogisticRegression,\n    model_params=lrc_params,\n    x=x_end,\n    y=y_fertilizers, \n    enc=le,\n    k_map=3,\n    model_name="Logistic Regression"\n)\n'


--- CV Summary for Logistic Regression ---


MAP@3 Scores for fold: [0.2871 0.2875 0.2861 0.2856 0.2871]

MAP@3 average: 0.2867

Standard deviation of MAP@3: 0.0007

## Second model: XGBClassifier

In [33]:
# We create the model instance

xgbc = XGBClassifier()

# Train the model with the data

xgbc.fit(x_train, y_train)

In [34]:
evaluator(xgbc, x_val, y_val, le, 3, "XGBClassifier")


XGBClassifier initial MAP@3 Score: 0.3307


In [35]:
# We apply the function for CV, reusing the parameters of the XGBClassifier fitted above

xgbc_params = xgbc.get_params()

# The scores belong to XGBClassifier, so they are stored under an XGBoost-specific
# name (they were previously assigned to `lrc_cv_scores`, the name used by the
# Logistic Regression experiment).
xgbc_cv_scores = cv_evaluator(
    model_class=XGBClassifier,
    model_params=xgbc_params,
    x=x_end,
    y=y_fertilizers, 
    enc=le,
    k_map=3,
    model_name="XGBClassifier"
)


Fold 1/5
MAP@3 Fold 1: 0.3294

Fold 2/5
MAP@3 Fold 2: 0.3306

Fold 3/5
MAP@3 Fold 3: 0.3304

Fold 4/5
MAP@3 Fold 4: 0.3308

Fold 5/5
MAP@3 Fold 5: 0.3305

--- CV Summary for XGBClassifier ---

MAP@3 Scores for fold: [0.3294 0.3306 0.3304 0.3308 0.3305]
MAP@3 average: 0.3303
Standard deviation of MAP@3: 0.0005


# Modeling

In [36]:
final_model = xgbc

final_model.get_params()

{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [37]:
# Permutation importance of every feature, measured on the validation split
# (30 shuffles per feature, fixed seed, all CPU cores)

perm_importance = permutation_importance(
    final_model,
    x_val,
    y_val,
    n_repeats=30,
    random_state=42,
    n_jobs=-1,
)

# One row per feature: mean and standard deviation over the repeats
perm_importance_df = pd.DataFrame(
    {
        'Feature': x_end.columns,
        'Importance Mean': perm_importance.importances_mean,
        'Importance Std': perm_importance.importances_std,
    }
)

print("\nPermutation Importance:\n")
print(perm_importance_df.sort_values(by='Importance Mean', ascending=False))


Permutation Importance:

                  Feature  Importance Mean  Importance Std
5             Phosphorous         0.014369        0.000756
2                Moisture         0.012838        0.000798
3                Nitrogen         0.012112        0.000791
4               Potassium         0.009043        0.000588
1                Humidity         0.006347        0.000624
0             Temparature         0.006166        0.000633
10        Soil Type_Sandy         0.001643        0.000340
8         Soil Type_Loamy         0.001471        0.000294
13  Crop Type_Ground Nuts         0.001244        0.000227
9           Soil Type_Red         0.001223        0.000210
7        Soil Type_Clayey         0.001222        0.000309
17        Crop Type_Paddy         0.001162        0.000200
18       Crop Type_Pulses         0.001140        0.000225
21        Crop Type_Wheat         0.001092        0.000254
6         Soil Type_Black         0.001082        0.000181
12       Crop Type_Cotton     

# Test data

In [38]:
# Load the test set and the sample submission provided by the competition

df_test = pd.read_csv(
    "/kaggle/input/playground-series-s5e6/test.csv"
)

fertilizer_sample = pd.read_csv(
    "/kaggle/input/playground-series-s5e6/sample_submission.csv"
)

In [39]:
# We check the shape

print(f"Shape: {df_test.shape}")

Shape: (250000, 9)


In [40]:
df_test.head()

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14


In [41]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           250000 non-null  int64 
 1   Temparature  250000 non-null  int64 
 2   Humidity     250000 non-null  int64 
 3   Moisture     250000 non-null  int64 
 4   Soil Type    250000 non-null  object
 5   Crop Type    250000 non-null  object
 6   Nitrogen     250000 non-null  int64 
 7   Potassium    250000 non-null  int64 
 8   Phosphorous  250000 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 17.2+ MB


In [42]:
df_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,250000.0,874999.5,72168.927986,750000.0,812499.75,874999.5,937499.25,999999.0
Temparature,250000.0,31.491648,4.024093,25.0,28.0,31.0,35.0,38.0
Humidity,250000.0,61.04512,6.636372,50.0,55.0,61.0,67.0,72.0
Moisture,250000.0,45.190444,11.793167,25.0,35.0,45.0,55.0,65.0
Nitrogen,250000.0,23.139612,11.215956,4.0,13.0,23.0,33.0,42.0
Potassium,250000.0,9.487764,5.76686,0.0,4.0,10.0,14.0,19.0
Phosphorous,250000.0,21.12206,12.38087,0.0,10.0,21.0,32.0,42.0


In [43]:
df_test.describe(exclude = np.number)

Unnamed: 0,Soil Type,Crop Type
count,250000,250000
unique,5,11
top,Sandy,Paddy
freq,52323,28332


## Data wrangling & Preprocessing

In [44]:
# We check that the test set has no duplicated rows: the mask is computed once,
# its length is the number of rows and its sum the number of duplicates

test_duplicate_mask = df_test.duplicated()

print(f"Length: {len(test_duplicate_mask)}")
print(f"Duplicates: {test_duplicate_mask.sum()}")

Length: 250000
Duplicates: 0


In [45]:
# We confirm that there are no null values in the test set (count and percentage per column)

null_values_test = (
    df_test.isnull().sum()
    .to_frame(name="Null Data")
    .assign(Percentage=lambda counts: counts["Null Data"] / len(df_test) * 100)
)

null_values_test

Unnamed: 0,Null Data,Percentage
id,0,0.0
Temparature,0,0.0
Humidity,0,0.0
Moisture,0,0.0
Soil Type,0,0.0
Crop Type,0,0.0
Nitrogen,0,0.0
Potassium,0,0.0
Phosphorous,0,0.0


In [46]:
# We start by removing the variables that we will not use

test_new = df_test.drop(columns=["id"])

In [47]:
# We encode categorical variables with the encoder already fitted on the training data.
# transform() (not fit_transform()) is used so the encoder is not refitted on the
# test set: the one-hot columns and their order stay identical to the features the
# model was trained on, even if a category were missing from the test data.

test_cat_cols = test_new[["Soil Type", "Crop Type"]]
test_rest_cols = test_new.drop(columns=["Soil Type", "Crop Type"])
test_cat_enc = encoder.transform(test_cat_cols)
test_encoded = pd.concat([test_rest_cols, test_cat_enc], axis=1)

In [48]:
test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Temparature            250000 non-null  int64  
 1   Humidity               250000 non-null  int64  
 2   Moisture               250000 non-null  int64  
 3   Nitrogen               250000 non-null  int64  
 4   Potassium              250000 non-null  int64  
 5   Phosphorous            250000 non-null  int64  
 6   Soil Type_Black        250000 non-null  float64
 7   Soil Type_Clayey       250000 non-null  float64
 8   Soil Type_Loamy        250000 non-null  float64
 9   Soil Type_Red          250000 non-null  float64
 10  Soil Type_Sandy        250000 non-null  float64
 11  Crop Type_Barley       250000 non-null  float64
 12  Crop Type_Cotton       250000 non-null  float64
 13  Crop Type_Ground Nuts  250000 non-null  float64
 14  Crop Type_Maize        250000 non-nu

In [49]:
# Numerical variables to scale (the same six columns the scaler was fitted on)

test_numeric_columns = [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
]
test_numeric = test_encoded[test_numeric_columns]

# Reuse the already fitted scaler: transform only, no refit on the test data
test_scale_num = scaler.transform(test_numeric)

# The remaining columns are kept as they are and appended after the scaled ones
test_rest = test_encoded.drop(columns=test_numeric_columns)
test_end = pd.concat([test_scale_num, test_rest], axis=1)

In [50]:
test_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,250000.0,-0.00296,0.999633,-1.615563,-0.870328,-0.125092,0.868556,1.613792
Humidity,250000.0,0.000934,0.998297,-1.660563,-0.908422,-0.005853,0.896716,1.648856
Moisture,250000.0,0.000534,0.99988,-1.711306,-0.863459,-0.015613,0.832234,1.68008
Nitrogen,250000.0,0.004084,0.999986,-1.702354,-0.899938,-0.008364,0.88321,1.685627
Potassium,250000.0,0.001642,1.000215,-1.643934,-0.950166,0.090485,0.784253,1.651463
Phosphorous,250000.0,0.003955,1.002758,-1.706773,-0.896848,-0.005931,0.884987,1.694912
Soil Type_Black,250000.0,0.200588,0.400441,0.0,0.0,0.0,0.0,1.0
Soil Type_Clayey,250000.0,0.199168,0.399375,0.0,0.0,0.0,0.0,1.0
Soil Type_Loamy,250000.0,0.193516,0.395055,0.0,0.0,0.0,0.0,1.0
Soil Type_Red,250000.0,0.197436,0.398065,0.0,0.0,0.0,0.0,1.0


In [51]:
# We remove the variables that we will not use

#test_end_new = test_end.drop(columns=[])

In [52]:
test_end.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Temparature            250000 non-null  float64
 1   Humidity               250000 non-null  float64
 2   Moisture               250000 non-null  float64
 3   Nitrogen               250000 non-null  float64
 4   Potassium              250000 non-null  float64
 5   Phosphorous            250000 non-null  float64
 6   Soil Type_Black        250000 non-null  float64
 7   Soil Type_Clayey       250000 non-null  float64
 8   Soil Type_Loamy        250000 non-null  float64
 9   Soil Type_Red          250000 non-null  float64
 10  Soil Type_Sandy        250000 non-null  float64
 11  Crop Type_Barley       250000 non-null  float64
 12  Crop Type_Cotton       250000 non-null  float64
 13  Crop Type_Ground Nuts  250000 non-null  float64
 14  Crop Type_Maize        250000 non-nu

## Apply the Model & Submission File

In [53]:
# We apply the trained model: class probabilities for every test row, then the
# column indices of the three most probable classes (most probable first)

test_pred_prob = final_model.predict_proba(test_end)
test_top_3 = np.argsort(-test_pred_prob, axis=1)[:, :3]

# Decode each row of indices back to fertilizer names; indices the encoder
# does not know are skipped
top_3_fertilizer_names = [
    le.inverse_transform([idx for idx in row_indices if idx < len(le.classes_)])
    for row_indices in test_top_3
]

# Submission format: the names of one row separated by single spaces
formatted_predictions = [" ".join(names) for names in top_3_fertilizer_names]

In [54]:
# We review the result

print("Total predictions: ", len(formatted_predictions), "\n")

Total predictions:  250000 



In [55]:
# We create the submission dataframe: the test ids plus the formatted top-3 predictions

fertilizer_submission = df_test[["id"]].assign(
    **{"Fertilizer Name": formatted_predictions}
)

fertilizer_submission.head()

Unnamed: 0,id,Fertilizer Name
0,750000,28-28 DAP Urea
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 28-28 10-26-26
3,750003,14-35-14 Urea 17-17-17
4,750004,20-20 28-28 10-26-26


In [56]:
# We compare the results with the sample

print(
    f"Shape Sample Submission: {fertilizer_sample.shape}",
    f"\nShape Fertilizer Submission: {fertilizer_submission.shape}"
)
print("\n", fertilizer_sample.head())

Shape Sample Submission: (250000, 2) 
Shape Fertilizer Submission: (250000, 2)

        id         Fertilizer Name
0  750000  14-35-14 10-26-26 Urea
1  750001  14-35-14 10-26-26 Urea
2  750002  14-35-14 10-26-26 Urea
3  750003  14-35-14 10-26-26 Urea
4  750004  14-35-14 10-26-26 Urea


In [57]:
# We convert the dataframe to a csv file

fertilizer_submission.to_csv("submission.csv", index=False)