# Overview
This notebook was my first attempt at a Kaggle Competition. Upon reviewing it now (~6 months after the end of the competition) I can see numerous ways it could be improved, especially with regards to feature engineering. However, I did learn some useful skills such as nested cross-validation to improve model performance during this competition.

[Competition Link](https://www.kaggle.com/competitions/playground-series-s3e14)

In [3]:
# General Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# XGBoost Imports
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_validate

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/playground-series-s3e14/sample_submission.csv
/kaggle/input/playground-series-s3e14/train.csv
/kaggle/input/playground-series-s3e14/test.csv
/kaggle/input/wild-blueberry-yield-prediction-dataset/WildBlueberryPollinationSimulationData.csv


In [4]:
def output_to_csv(predictions):
    ids = [val for val in range(15289, 25483)]
    submission_df = pd.DataFrame({'id': ids, 'yield' : predictions})
    submission_df = submission_df.astype({"id": "int", "yield": "float64"})
    submission_df.to_csv(f"submission.csv", index=False)

In [5]:
data = pd.read_csv('/kaggle/input/playground-series-s3e14/train.csv')
data

Unnamed: 0,id,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,0,25.0,0.50,0.25,0.75,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146
1,1,25.0,0.50,0.25,0.50,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201
2,2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.77760
3,3,12.5,0.25,0.25,0.63,0.50,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.77590
4,4,25.0,0.50,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15284,15284,12.5,0.25,0.25,0.38,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.556302,0.476308,40.546480,7667.83619
15285,15285,12.5,0.25,0.25,0.25,0.50,86.0,52.0,71.9,62.0,30.0,50.8,34.0,0.56,0.354413,0.388145,29.467434,3680.56025
15286,15286,25.0,0.50,0.25,0.38,0.75,77.4,46.8,64.7,55.8,27.0,45.8,34.0,0.56,0.422548,0.416786,32.299059,4696.44394
15287,15287,25.0,0.50,0.25,0.63,0.63,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.542170,0.434133,36.674243,6772.93347


In [6]:
X_train = data.iloc[:, 1:17]
y_train = data['yield']

In [7]:
X_train

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.50,0.25,0.75,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887
1,25.0,0.50,0.25,0.50,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781
3,12.5,0.25,0.25,0.63,0.50,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561
4,25.0,0.50,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15284,12.5,0.25,0.25,0.38,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.556302,0.476308,40.546480
15285,12.5,0.25,0.25,0.25,0.50,86.0,52.0,71.9,62.0,30.0,50.8,34.0,0.56,0.354413,0.388145,29.467434
15286,25.0,0.50,0.25,0.38,0.75,77.4,46.8,64.7,55.8,27.0,45.8,34.0,0.56,0.422548,0.416786,32.299059
15287,25.0,0.50,0.25,0.63,0.63,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.542170,0.434133,36.674243


In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15289 entries, 0 to 15288
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   clonesize             15289 non-null  float64
 1   honeybee              15289 non-null  float64
 2   bumbles               15289 non-null  float64
 3   andrena               15289 non-null  float64
 4   osmia                 15289 non-null  float64
 5   MaxOfUpperTRange      15289 non-null  float64
 6   MinOfUpperTRange      15289 non-null  float64
 7   AverageOfUpperTRange  15289 non-null  float64
 8   MaxOfLowerTRange      15289 non-null  float64
 9   MinOfLowerTRange      15289 non-null  float64
 10  AverageOfLowerTRange  15289 non-null  float64
 11  RainingDays           15289 non-null  float64
 12  AverageRainingDays    15289 non-null  float64
 13  fruitset              15289 non-null  float64
 14  fruitmass             15289 non-null  float64
 15  seeds              

In [14]:
y_train

0        4476.81146
1        5548.12201
2        6869.77760
3        6880.77590
4        7479.93417
            ...    
15284    7667.83619
15285    3680.56025
15286    4696.44394
15287    6772.93347
15288    5867.99722
Name: yield, Length: 15289, dtype: float64

In [15]:
X_test = pd.read_csv("/kaggle/input/playground-series-s3e14/test.csv")
X_test.drop(columns='id', inplace=True)
X_test

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.10,0.488048,0.442866,36.846956
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,25.0,0.50,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.388860,29.558019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10189,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.474162,0.437923,34.525258
10190,25.0,0.50,0.25,0.50,0.75,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.482854,0.440676,35.648221
10191,25.0,0.50,0.38,0.50,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.568854,0.463065,37.724724
10192,12.5,0.25,0.25,0.38,0.50,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.407374,0.409261,31.881847


In [16]:
wild_blueberries_dmatrix = xgb.DMatrix(X_train, y_train)
wild_blueberries_dmatrix

<xgboost.core.DMatrix at 0x7a7b39502710>

In [17]:
params = {"objective":"reg:squarederror", "max_depth":3}
num_rounds = list(range(5,50,5))
final_mae_per_round = []

for curr_round in num_rounds:
    cv_results = xgb.cv(dtrain=wild_blueberries_dmatrix, params=params, nfold=3, early_stopping_rounds=10, num_boost_round=curr_round, metrics="mae", as_pandas=True, seed=123)
    final_mae_per_round.append(cv_results['test-mae-mean'].values[-1])
    
final_mae_per_round

[1049.3294229881383,
 392.3614867455091,
 358.34693020319173,
 356.9392538731042,
 356.34035450429343,
 356.50371861364056,
 356.34035450429343,
 356.34035450429343,
 356.34035450429343]

In [18]:
params = {
    'learning_rate' : [0.01, 0.1, 0.5, 0.9, 1]
    , 'n_estimators' : [25, 100]
    , 'subsample' : [0.3, 0.5, 0.9, 1] 
}

In [19]:
xgb_model = xgb.XGBRegressor()
grid_mae = GridSearchCV(estimator=xgb_model, param_grid=params, scoring='neg_mean_absolute_error', cv=5, verbose=True)
grid_mae.fit(X_train, y_train)
print(grid_mae.best_params_)
print(grid_mae.best_score_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 0.9}
-354.6334268788306


In [20]:
xgb_model = xgb.XGBRegressor()
rndm_mae = RandomizedSearchCV(estimator=xgb_model, param_distributions=params, scoring='neg_mean_absolute_error', n_iter=25, cv=5, verbose=True)
rndm_mae.fit(X_train, y_train)
print(rndm_mae.best_params_)
print(rndm_mae.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
{'subsample': 0.9, 'n_estimators': 100, 'learning_rate': 0.1}
-354.6334268788306


In [21]:
X_train.std()

clonesize                6.595211
honeybee                 0.361643
bumbles                  0.059917
andrena                  0.148115
osmia                    0.139489
MaxOfUpperTRange         9.146703
MinOfUpperTRange         5.546405
AverageOfUpperTRange     7.641807
MaxOfLowerTRange         6.610640
MinOfLowerTRange         3.195367
AverageOfLowerTRange     5.390545
RainingDays             11.657582
AverageRainingDays       0.163905
fruitset                 0.074390
fruitmass                0.037035
seeds                    4.031087
dtype: float64

In [22]:
y_train.std()

1337.0568497954757

In [23]:
X_test = pd.read_csv("/kaggle/input/playground-series-s3e14/test.csv")
X_test.drop(columns="id", inplace=True)
X_test

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.10,0.488048,0.442866,36.846956
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,25.0,0.50,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.388860,29.558019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10189,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.474162,0.437923,34.525258
10190,25.0,0.50,0.25,0.50,0.75,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.482854,0.440676,35.648221
10191,25.0,0.50,0.38,0.50,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.568854,0.463065,37.724724
10192,12.5,0.25,0.25,0.38,0.50,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.407374,0.409261,31.881847


In [24]:
original_data = pd.read_csv("/kaggle/input/wild-blueberry-yield-prediction-dataset/WildBlueberryPollinationSimulationData.csv")
original_data.drop(columns="Row#", inplace=True)
original_data

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,16.00,0.26,0.410652,0.408159,31.678898,3813.165795
1,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,1.00,0.10,0.444254,0.425458,33.449385,4947.605663
2,37.5,0.750,0.250,0.250,0.250,94.6,57.2,79.0,68.2,33.0,55.9,16.00,0.26,0.383787,0.399172,30.546306,3866.798965
3,37.5,0.750,0.250,0.250,0.250,94.6,57.2,79.0,68.2,33.0,55.9,1.00,0.10,0.407564,0.408789,31.562586,4303.943030
4,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,24.00,0.39,0.354413,0.382703,28.873714,3436.493543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,10.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.486815,0.428012,33.447471,5333.873335
773,40.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.342841,0.377915,28.462005,3373.436842
774,20.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,24.00,0.39,0.404617,0.401670,30.748240,4203.027624
775,20.0,0.537,0.117,0.409,0.058,89.0,39.0,65.6,66.0,28.0,45.3,3.77,0.06,0.401538,0.399935,30.582161,4166.299735


In [25]:
X_train.append(original_data[:, 1:17], inplace=True)
X_train

InvalidIndexError: (slice(None, None, None), slice(1, 17, None))

In [26]:
xgb_model = xgb.XGBRegressor()
gcv = GridSearchCV(estimator=xgb_model, param_grid=params, scoring='neg_mean_absolute_error', cv=3, n_jobs=1, return_train_score=False)
results = cross_validate(gcv, X_train, y_train, scoring='neg_mean_absolute_error', cv=10, return_train_score=False)

In [27]:
display(pd.DataFrame(results))

Unnamed: 0,fit_time,score_time,test_score
0,51.394625,0.005261,-360.846443
1,50.14551,0.005326,-346.702527
2,50.906357,0.005144,-362.328595
3,50.907925,0.005208,-345.071986
4,49.169035,0.005125,-341.980482
5,50.178354,0.005168,-361.151756
6,49.066548,0.005127,-349.203024
7,50.044523,0.00549,-345.485009
8,50.358106,0.005187,-361.125641
9,48.905806,0.005147,-348.50103


In [28]:
gcv.fit(X_train, y_train)

In [29]:
gcv.best_params_

{'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 0.9}

In [30]:
xgb_model = gcv.best_estimator_
xgb_model.set_params(eval_metric='mae')

In [31]:
xgb_model.fit(X_train, y_train, verbose=True)

In [32]:
y_pred = xgb_model.predict(X_test)
y_pred

array([4278.0103, 6267.155 , 7244.962 , ..., 6451.824 , 4399.286 ,
       7291.465 ], dtype=float32)

In [33]:
y_pred

array([4278.0103, 6267.155 , 7244.962 , ..., 6451.824 , 4399.286 ,
       7291.465 ], dtype=float32)

In [37]:
output_to_csv(y_pred)