# Model Validation

This notebook validates the performance of our fine-tuned CatBoost model by estimating its MAP@3 score using 4-fold cross-validation on the competition dataset. For each fold, we use a 75%/25% split of the training fold to determine the optimal number of boosting iterations via early stopping. We then retrain on the full fold using this optimal iteration count to generate predictions on the held-out validation set. The MAP@3 scores from each fold are averaged to produce a final performance estimate.

**Runtime:** > 2 hours

In [None]:
# pip3 install numpy pandas matplotlib scipy scikit-learn catboost optuna nbformat --upgrade
import time
import json

# Data Science Libraries
import numpy as np
import pandas as pd

# Data Preparation
from sklearn.preprocessing import LabelEncoder

# Modelling
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

# Custom Functions -> Function definitions can be found in utils.py file
from utils import MAP3, generate_model_rankings

In [2]:
sampler_seed = 346346
split_seed = 4326
model_seed = 36209436

In [3]:
# Read in the training data
train = pd.read_csv('Data/train.csv', index_col='id')
train = train.rename(columns={'Temparature':'Temperature'})
print(f'The training data has {train.shape[0]:,} rows and {train.shape[1]} columns.')
train.head()

The training data has 750,000 rows and 9 columns.


Unnamed: 0_level_0,Temperature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,27,69,65,Sandy,Millets,30,6,18,28-28
2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,35,58,43,Red,Paddy,37,2,16,DAP


In [4]:
# Store the numeric and categorical columns
numeric_cols = ['Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
categorical_cols = ['Soil Type', 'Crop Type']

# Extract feature columns and the target column
features = numeric_cols + categorical_cols
cat_cols_idxs = [int(inx) for inx in np.where(np.isin(features, categorical_cols))[0]]
target = 'Fertilizer Name'

In [5]:
# Fit LabelEncoder to the target
le = LabelEncoder()
le.fit(train[target])
class_mapping = list(le.classes_)
class_mapping

['10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP', 'Urea']

In [6]:
# Read the hyperparameters from the file in which they are stored
with open('Study_Results/Phase2/best_params.json', 'r') as file:
    hyperparameters = json.load(file)

for parameter, setting in hyperparameters.items():
    print(f'{parameter}: {setting}')

od_wait = hyperparameters.pop('od_wait')

learning_rate: 0.049745499889766426
depth: 10
colsample_bylevel: 0.46619923304859245
od_wait: 39
subsample: 0.06155900139866675
min_data_in_leaf: 90
l2_leaf_reg: 2.399719543149106
random_strength: 0.050324592372389315
bootstrap_type: Bernoulli
boosting_type: Ordered
loss_function: MultiClass
od_type: IncToDec


In [7]:
# Get the features and the target
X, y = train[features], train[target]

# Initialize the CatBoostClassifier model 
model = CatBoostClassifier(
    **hyperparameters,
    iterations = 5000, # Set the number of iterations high to allow the stopping criterion to kick in
    allow_writing_files = False,
    random_seed = model_seed                  
)

In [8]:
# Set up the folds
sk_folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=split_seed) # 4-folds, since the competition data split is 75/25

# Initialize a list to keep the MAP@3 scores in
scores = []

# Initialize a list to keep track of the number of iterations required per run
numIterations = []

# Count of the number of fold iterations
i = 0

# Run and time the cross-validation step
start = time.time()
for train_indices, valid_indices in sk_folds.split(X, y):
    i += 1

    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    y_train_encoded = le.transform(y_train)

    X_valid, y_valid = X.iloc[valid_indices], y.iloc[valid_indices]
    y_valid_encoded = le.transform(y_valid)

    model.fit(X_train,
              y_train_encoded,
              cat_features=cat_cols_idxs,
              eval_set=(X_valid, y_valid_encoded),
              early_stopping_rounds=od_wait,
              verbose=True)
    
    fold_rankings = generate_model_rankings(model, X_valid, class_mapping)

    fold_score = MAP3(labels=y_valid, rankings=fold_rankings)

    best_iteration = model.get_best_iteration()

    scores.append(fold_score)
    numIterations.append(best_iteration+1)

    print(f'Run {i}:')
    print(f'\t* Best iteration: {best_iteration}')
    print(f'\t* Total number of trees: {model.tree_count_}')    
    print(f'\t* MAP@3 Score: {fold_score:.6f}')
    print('---------------------------------------------\n')
end = time.time()

# Print the results
print(f'Estimated MAP@3 Score: {np.mean(scores):.6f} +/- {np.std(scores):.6f}')
print(f'Number of iterations needed: {np.mean(numIterations):.1f} +/- {np.std(numIterations):.1f}')
print('---------------------------------------------\n')
print(f'Model Validation Runtime: {(end-start)/60:.2f} minutes.')

0:	learn: 1.9455798	test: 1.9455845	best: 1.9455845 (0)	total: 471ms	remaining: 39m 14s
1:	learn: 1.9452478	test: 1.9452673	best: 1.9452673 (1)	total: 1.24s	remaining: 51m 32s
2:	learn: 1.9449857	test: 1.9450056	best: 1.9450056 (2)	total: 1.6s	remaining: 44m 23s
3:	learn: 1.9447368	test: 1.9447538	best: 1.9447538 (3)	total: 1.96s	remaining: 40m 46s
4:	learn: 1.9443744	test: 1.9443982	best: 1.9443982 (4)	total: 2.46s	remaining: 41m 3s
5:	learn: 1.9440579	test: 1.9440827	best: 1.9440827 (5)	total: 2.85s	remaining: 39m 36s
6:	learn: 1.9438640	test: 1.9438902	best: 1.9438902 (6)	total: 3.15s	remaining: 37m 25s
7:	learn: 1.9436058	test: 1.9436298	best: 1.9436298 (7)	total: 3.39s	remaining: 35m 18s
8:	learn: 1.9433066	test: 1.9433398	best: 1.9433398 (8)	total: 4.02s	remaining: 37m 9s
9:	learn: 1.9431383	test: 1.9431656	best: 1.9431656 (9)	total: 4.28s	remaining: 35m 34s
10:	learn: 1.9429250	test: 1.9429586	best: 1.9429586 (10)	total: 4.57s	remaining: 34m 34s
11:	learn: 1.9426980	test: 1.9427

In [9]:
# Save the average number of iterations used
iterations = int(round( np.mean(numIterations)))
print(iterations)

2458


In [10]:
# Save the approximate number of iterations to a file for later use in the main model
with open('average_iterations.txt', 'w') as file:
    file.write(str(iterations))