In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'planttraits2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F65626%2F8046133%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240610%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240610T173557Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6734198f50f3d3cf40425e0b6fa5903d0cddf6803ccca1e565fb786fab35dc4b7190254ad657bd1a2f4ca4fd154db9bdb2aa918433734656d7552143f1c9ea072d4e9b744d5b76b07a42ed196a63489972b499341154fecdb7d9e74b5000bee259cd4112fc47da07f2a08d836fd1234920b6892254cf5f6f5982f46eda88e131a32146de151ee206a23f47ea6b7403a556ebfbde70a74e2e32d171d5d16a528b9babab23c3b04c63deaa00ee43c212b8890dfefd98e9df280e4a0dd166dd58871f1b5141366535584282262648e3bf9ee663b55f9cea69af0fb73bd72b9ebe5e4904bacb85dc99537dc157a7cf8db7bb5a1d2bdae5e323089e83c7854fdd5a9d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading planttraits2024, 3402840652 bytes compressed
Downloaded and uncompressed: planttraits2024
Data source import complete.


In [2]:
import os
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing import image

![image.png](attachment:4c8d425c-f3ea-46c4-b28d-3c85c879e2d3.png)


# Load data + clean

In [3]:
train = pd.read_csv('/kaggle/input/planttraits2024/train.csv')
test = pd.read_csv('/kaggle/input/planttraits2024/test.csv')

# Drop sd columns
sd_columns = [col for col in train.columns if col.endswith('_sd')]
train = train.drop(columns=sd_columns)

# Identify target variables
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Drop train items with outliers beyond percentile 0.98 (as per results of a prior test)
for column in mean_columns:
    upper_quantile = train[column].quantile(0.98)
    train = train[(train[column] < upper_quantile)]

# Load ImageNet

In [4]:
image_model = EfficientNetB3(weights='imagenet', include_top=False, pooling='avg')

# Resolution for ImageNet
image_model_x = 300
image_model_y = 300

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb3_notop.h5


# EfficientNet for image extraction

In [5]:
# Define function to create a TensorFlow image dataset
def create_dataset(image_paths, batch_size=128):
    def process_path(file_path):
        img = tf.io.read_file(file_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, [image_model_x, image_model_y])
        img = preprocess_input(img)
        return img
    path_ds = tf.data.Dataset.from_tensor_slices(image_paths)
    image_ds = path_ds.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    image_ds = image_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return image_ds

def extract_features_with_dataset(dataset, df):
    features_list = []
    for batch_imgs in dataset:
        print(".", end="")
        features = image_model.predict(batch_imgs, verbose=0)
        features_list.extend(features)
    features_array = np.array(features_list)

    # Convert features array into df
    features_df = pd.DataFrame(features_array)

    features_df.columns = [f'feature_{i}' for i in range(features_array.shape[1])]

    new_df = pd.concat([df.reset_index(drop=True), features_df.reset_index(drop=True)], axis=1)

    return new_df

# Extract image data for train

In [6]:
train_image_folder = '/kaggle/input/planttraits2024/train_images'

image_paths = [os.path.join(train_image_folder, f"{img_id}.jpeg") for img_id in train['id']]

# Create dataset
image_dataset = create_dataset(image_paths)

# Extract features and insert them into df
train = extract_features_with_dataset(image_dataset, train)

print(train.head())

................................................................................................................................................................................................................................................................................................................................................................................................          id  WORLDCLIM_BIO1_annual_mean_temperature  \
0  192027691                               12.235703   
1  195542235                               17.270555   
2  196639184                               14.254504   
3  195728812                               18.680834   
4  195251545                                0.673204   

   WORLDCLIM_BIO12_annual_precipitation  \
0                            374.466675   
1                             90.239998   
2                            902.071411   
3                           1473.933350   
4                            530.088867   

   WORLDCLIM_BIO13.B

# Train on tabular data

In [7]:
max_estimators = 1000
early_stopping_limit = 30
val_size = 0.05

X_full = train.drop(columns=mean_columns)
Y_full = train[mean_columns]

models = {}

for column in Y_full.columns:
    print("\nTraining for column:", column, "...")
    X_train, X_test, y_train, y_test = train_test_split(X_full, Y_full[column], test_size=val_size, random_state=42)

    # LightGBM parameters
    params = {
        'n_estimators': max_estimators,
        'max_depth': 8,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'regression',
        'metric': 'rmse',
        'random_state': 42,
        'learning_rate': 0.03,
        'verbose': -1,  # keep logs quiet
        'lambda_l1': 0.1,  # L1 regularization
        'lambda_l2': 0.1   # L2 regularization
    }

    # Create model
    model = lgb.LGBMRegressor(**params)

    def callback(env):
        if env.iteration % 10 == 0:
            print("Iteration:", env.iteration, "\tRMSE:", env.evaluation_result_list[0][2])

    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_limit), callback]
    )

    models[column] = model

    # Predict on test set
    y_pred = model.predict(X_test)

    # Calculate Rsq score
    r2 = r2_score(y_test, y_pred)
    print(f"R2 score for column {column}: {r2:.4f}")



Training for column: X4_mean ...
Iteration: 0 	RMSE: 0.13713577324074647
Training until validation scores don't improve for 30 rounds
Iteration: 10 	RMSE: 0.1314612132528413
Iteration: 20 	RMSE: 0.1277301183661913
Iteration: 30 	RMSE: 0.12510526502978683
Iteration: 40 	RMSE: 0.1232546660600135
Iteration: 50 	RMSE: 0.12189660959793264
Iteration: 60 	RMSE: 0.12086556154662204
Iteration: 70 	RMSE: 0.12010384949103241
Iteration: 80 	RMSE: 0.11950077896585222
Iteration: 90 	RMSE: 0.11896530813646142
Iteration: 100 	RMSE: 0.11853799177882071
Iteration: 110 	RMSE: 0.11829138947104981
Iteration: 120 	RMSE: 0.11801739771727338
Iteration: 130 	RMSE: 0.11779542391845992
Iteration: 140 	RMSE: 0.11758766583788971
Iteration: 150 	RMSE: 0.11739652641610406
Iteration: 160 	RMSE: 0.11728768416362084
Iteration: 170 	RMSE: 0.11714991567124244
Iteration: 180 	RMSE: 0.11700071596626281
Iteration: 190 	RMSE: 0.11687478745713988
Iteration: 200 	RMSE: 0.11680973910885907
Iteration: 210 	RMSE: 0.1166735914544

# Prepare for submission

In [8]:
mean_values = Y_full.mean()
submission = pd.DataFrame({'id': test['id']})
submission[Y_full.columns] = mean_values

#rename
submission.columns = submission.columns.str.replace('_mean', '')
submission.head()

Unnamed: 0,id,X4,X11,X18,X50,X26,X3112
0,201238668,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
1,202310319,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
2,202604412,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
3,201353439,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
4,195351745,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107


# Extract image data for test

In [9]:
test_image_folder = '/kaggle/input/planttraits2024/test_images'

image_paths = [os.path.join(test_image_folder, f"{img_id}.jpeg") for img_id in test['id']]

# Create dataset
image_dataset = create_dataset(image_paths)

# Extract features and insert into df
test = extract_features_with_dataset(image_dataset, test)

test.head()

....................................................

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,feature_1526,feature_1527,feature_1528,feature_1529,feature_1530,feature_1531,feature_1532,feature_1533,feature_1534,feature_1535
0,201238668,8.086756,2246.5,127.321426,20.423418,353.381042,17.535713,80,109,90,...,-0.07106,-0.068737,0.044932,-0.235093,0.041368,-0.178281,1.420579,0.095737,-0.052467,0.43753
1,202310319,10.844286,495.871429,28.023809,18.738306,786.554382,29.292856,130,155,142,...,-0.102929,2.028094,-0.102838,0.111633,0.086307,0.240147,-0.067976,0.681794,-0.058765,0.17743
2,202604412,8.105556,378.328583,39.92857,41.885647,722.071167,34.853809,133,134,139,...,0.226042,-0.065445,-0.128685,0.826448,-0.166941,-0.13688,0.219512,0.489472,0.25125,0.54536
3,201353439,7.077679,878.785706,70.428574,37.045235,669.389343,25.15,103,140,116,...,0.02,0.427104,0.193953,-0.159339,0.76523,-0.108503,0.37582,-0.193448,-0.113357,0.859642
4,195351745,4.790555,2299.366699,150.199997,24.136568,462.887695,22.516666,85,114,98,...,-0.145303,0.07119,-0.124988,-0.172981,-0.139534,-0.087718,-0.218766,0.195551,0.064251,-0.054057


# Predictions for test

In [10]:
submission['X4'] = models['X4_mean'].predict(test)
submission['X11'] = models['X11_mean'].predict(test)
submission['X18'] = models['X18_mean'].predict(test)
submission['X50'] = models['X50_mean'].predict(test)
submission['X26'] = models['X26_mean'].predict(test)
submission['X3112'] = models['X3112_mean'].predict(test)

submission.head()

Unnamed: 0,id,X4,X11,X18,X50,X26,X3112
0,201238668,0.593165,11.952044,1.882482,1.550535,8.167944,374.539593
1,202310319,0.415975,18.628869,0.817654,1.495307,5.396787,1514.162637
2,202604412,0.521911,14.762932,1.840625,1.68844,14.152941,1564.773119
3,201353439,0.429978,22.59645,-0.063633,1.302227,5.238432,1457.489072
4,195351745,0.495612,11.103205,0.688223,1.544372,4.080323,713.747735
