In [1]:
import os
import numpy as np
import pandas as pd

import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image

from tqdm.notebook import tqdm

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import optuna
import xgboost as xgb

tqdm.pandas()

In [None]:
class PlantDataset(Dataset):
    """Image dataset yielding ``(image, image_id)`` pairs for the rows of a DataFrame.

    The value in the first column of ``df`` is used as the image id and the
    picture is read from ``<image_dir>/<image_id>.jpeg``.
    """

    def __init__(self, image_dir, df, transform=None):
        """Store the image location, the id table and the optional transform.

        Args:
            image_dir: Directory that contains the ``<image_id>.jpeg`` files.
            df: DataFrame whose first column holds the image ids.
            transform: Optional callable applied to the loaded PIL image.
        """
        self.image_dir = image_dir
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of rows (and therefore images) in ``df``."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image of row ``idx``.

        Args:
            idx: Positional row index into ``df``.

        Returns:
            Tuple ``(image, image_id)``; ``image`` is the RGB PIL image, or the
            output of ``transform`` when one was given.
        """
        img_id = self.df.iloc[idx, 0]
        img_path = os.path.join(self.image_dir, f"{img_id}.jpeg")

        # Image.open is lazy and keeps the file open. convert() decodes the
        # pixels into a new image, so the context manager can release the file
        # handle right away. Forcing RGB guarantees three channels even for
        # grayscale or CMYK files, which a 3-channel Normalize requires.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        return image, img_id

In [None]:
class Config:
    """Container for the notebook-wide settings (seed, device, paths, targets, batch size)."""

    def __init__(self):
        # Reproducibility
        self.seed = 32

        # Prefer the GPU whenever torch can see one
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        # Tabular inputs
        self.train_csv_path = 'train.csv'
        self.test_csv_path = 'test.csv'

        # Image folders
        self.train_images_path = 'train_images'
        self.test_images_path = 'test_images'

        # Columns of the training table that are predicted
        self.target_columns = [
            'X4_mean',
            'X11_mean',
            'X18_mean',
            'X50_mean',
            'X26_mean',
            'X3112_mean',
        ]

        # Number of images per batch in the data loaders
        self.batch_size = 64


def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Covers Python's ``random`` module, NumPy's global generator, torch's CPU
    generator and the generators of all CUDA devices.

    Args:
        seed: Integer seed shared by all generators.
    """
    import random  # local import: the module is only needed here

    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device, not just the current one; silently ignored when
    # CUDA is not available.
    torch.cuda.manual_seed_all(seed)


# Single settings object shared by the rest of the notebook
CONFIG = Config()
# Seed the random number generators once, before any data is split or any model is built
seed_everything(CONFIG.seed)

In [None]:
# Load the pretrained 'dinov2_vitg14_reg' entry point of the
# 'facebookresearch/dinov2' hub repository and move it to the configured device
model = torch.hub.load('facebookresearch/dinov2',
                       'dinov2_vitg14_reg').to(CONFIG.device)
# Switch to evaluation mode: the model is only used for inference
# (embedding extraction under torch.no_grad) in this notebook
model.eval()

In [None]:
# Image preprocessing: resize the shorter side to 256 px with bicubic
# interpolation, take the central 224x224 crop, convert to a tensor and
# normalize each channel with the given mean / std.
transform = transforms.Compose([
    # InterpolationMode.BICUBIC is the enum equivalent of the former integer
    # code 3; integer interpolation codes are deprecated in torchvision.
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Load the training and the test tables
train_data = pd.read_csv(CONFIG.train_csv_path)
test_data = pd.read_csv(CONFIG.test_csv_path)

# Tabular input features: columns 1..163 of the training table (the first
# column is skipped, column 164 onwards is excluded).
# NOTE(review): presumably column 0 is the image id and the columns from 164 on
# hold the targets - confirm against train.csv
train_metadata = train_data.iloc[:, 1:164].values
# Regression targets, one column per entry of CONFIG.target_columns
train_targets = train_data[CONFIG.target_columns].values

# Hold out 5% of the rows for validation. The row positions are split together
# with the arrays so that the image datasets can be built from the same rows in
# the same order.
# NOTE: the split uses its own fixed random_state=42, not CONFIG.seed.
X_train, X_val, y_train, y_val, train_indices, val_indices = train_test_split(
    train_metadata, train_targets, range(len(train_metadata)), test_size=0.05, random_state=42
)

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# Test features: every column of the test table after the first one
test_metadata = test_data.values[:, 1:]
test_metadata_scaled = scaler.transform(test_metadata)


poly = PolynomialFeatures(degree=2, include_bias=False)
# Degree-2 polynomial expansion (no bias column) of the scaled features; fitted
# on the training split and reused for the validation and test features
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
test_metadata_poly = poly.transform(test_metadata_scaled)

In [None]:
# Datasets and DataLoaders.
# The image folders come from CONFIG so that they are defined in one place only.
# The train / validation rows are selected with the positions produced by
# train_test_split, which keeps the images aligned with X_train / X_val.
train_dataset = PlantDataset(
    image_dir=CONFIG.train_images_path, df=train_data.iloc[train_indices], transform=transform)
val_dataset = PlantDataset(image_dir=CONFIG.train_images_path,
                           df=train_data.iloc[val_indices], transform=transform)
test_dataset = PlantDataset(image_dir=CONFIG.test_images_path,
                            df=test_data, transform=transform)

# shuffle=False everywhere: the extracted embeddings must stay in row order to
# be concatenated with the tabular features later on.
train_loader = DataLoader(
    train_dataset, batch_size=CONFIG.batch_size, shuffle=False)
val_loader = DataLoader(
    val_dataset, batch_size=CONFIG.batch_size, shuffle=False)
test_loader = DataLoader(
    test_dataset, batch_size=CONFIG.batch_size, shuffle=False)

In [None]:
def extract_embeddings(loader, model):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    Args:
        loader: Iterable yielding ``(images, ids)`` batches; only ``images`` is used.
        model: Callable (typically a ``torch.nn.Module``) invoked as ``model(images)``.

    Returns:
        ``np.ndarray`` with the outputs of all batches stacked along the first
        axis, in the order the loader produced them.

    Raises:
        ValueError: If ``loader`` yields no batch (raised by ``np.vstack``).
    """
    # The batches must live on the same device as the model weights, otherwise
    # the forward pass fails as soon as the model sits on the GPU.
    parameters = model.parameters() if hasattr(model, "parameters") else iter(())
    first_parameter = next(iter(parameters), None)
    device = first_parameter.device if first_parameter is not None else None

    embeddings = []
    # Inference only: no autograd bookkeeping is needed for any batch
    with torch.no_grad():
        for images, _ in tqdm(loader, desc="Extracting embeddings"):
            if device is not None:
                images = images.to(device)
            # obtain the embeddings, put them on the cpu and convert them to a numpy array
            embeddings.append(model(images).cpu().numpy())
    # stack all of the embeddings into one single array
    return np.vstack(embeddings)


suffix = 'img_embs'
# Create the output folder before the (long) extraction starts, so that a
# missing directory cannot make np.save fail after the work has been done.
os.makedirs('torch_giant_five', exist_ok=True)

# Extract and persist the embeddings of each split; np.save appends '.npy'.
# extract_embeddings already returns an ndarray, so no extra copy is needed.
train_embeddings = extract_embeddings(train_loader, model)
np.save(f'torch_giant_five/train_{suffix}', train_embeddings)
val_embeddings = extract_embeddings(val_loader, model)
np.save(f'torch_giant_five/val_{suffix}', val_embeddings)
test_embeddings = extract_embeddings(test_loader, model)
np.save(f'torch_giant_five/test_{suffix}', test_embeddings)

In [None]:
# Load the saved extracted embeddings of the three splits from the .npy files
# written by the extraction cell (np.save adds the '.npy' extension), so the
# embeddings can be reused without running the image model again
train_embeddings = np.load('torch_giant_five/train_img_embs.npy')
val_embeddings = np.load('torch_giant_five/val_img_embs.npy')
test_embeddings = np.load('torch_giant_five/test_img_embs.npy')

In [None]:
# Combine polynomial features with embeddings: for every split the image
# embeddings come first and the polynomial tabular features are appended as
# extra columns (axis=1), so both inputs must have the same rows in the same order
x_train_full_poly = np.concatenate([train_embeddings, X_train_poly], axis=1)
x_val_poly = np.concatenate([val_embeddings, X_val_poly], axis=1)
x_test_poly = np.concatenate([test_embeddings, test_metadata_poly], axis=1)

In [None]:
# OPTIONAL: Use optuna to find the best result
from optuna.pruners import MedianPruner


def objective(trial):
    """Optuna objective: fit one XGBoost regressor per target and score it on the validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to report
            intermediate values.

    Returns:
        The negative mean validation R2 over all target columns (the study
        minimizes, so a higher R2 gives a lower objective).

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial early.
    """
    xgb_params = {
        # Number of boosting rounds, searched in [1400, 3000]
        'n_estimators': trial.suggest_int('n_estimators', 1400, 3000),
        # Step size shrinkage
        'learning_rate': trial.suggest_float('learning_rate', 0.015, 0.06),
        # Maximum tree depth
        'max_depth': trial.suggest_int('max_depth', 6, 12),
        # Fraction of rows sampled per tree
        'subsample': trial.suggest_float('subsample', 0.65, 0.95),
        # Fraction of columns sampled per tree
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.65, 0.95),
        # L1 regularization strength
        'reg_alpha': trial.suggest_float('reg_alpha', 0.15, 0.85),
        # L2 regularization strength
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 6.5),
        # Minimum sum of instance weight needed in a child
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 8),
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'random_state': CONFIG.seed
    }

    # Train and evaluate one model per target column
    r2_scores = {}
    for i, trait in tqdm(enumerate(CONFIG.target_columns), total=len(CONFIG.target_columns)):
        # Named `regressor` so it cannot be confused with the global image `model`
        regressor = xgb.XGBRegressor(**xgb_params)

        # No eval_set is passed: without early stopping its per-round metrics
        # were never read, they only cost an evaluation of the full train and
        # validation sets at every boosting round. The fitted trees are the same.
        regressor.fit(x_train_full_poly, y_train[:, i])

        # Validation R2 of this trait
        val_predictions = regressor.predict(x_val_poly)
        r2_scores[trait] = r2_score(y_val[:, i], val_predictions)

        # Report the intermediate (negated) R2 score, using the trait index as step.
        # NOTE(review): the reported value is the R2 of a single trait, not a
        # running average - confirm this is what the pruner should compare.
        trial.report(-r2_scores[trait], i)

        # Stop early when the pruner judges the trial unpromising
        if trial.should_prune():
            print(f'Trial {trial.number} pruned at column {i}')
            raise optuna.TrialPruned()

    # Return the negative average R2 score
    average_r2_val = sum(r2_scores.values()) / len(r2_scores)
    return -average_r2_val


# Minimize the objective (negative mean R2). The TPE sampler is seeded so that
# the sequence of sampled hyperparameters is the same on every run; unseeded,
# the search could not be reproduced after a kernel restart.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=CONFIG.seed),
    pruner=MedianPruner(),
)

# Optimize the objective function
study.optimize(objective, n_trials=50)

# Objective value of every trial, in execution order
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

# Get the best trial; the value is negated back to report the R2 itself
best_trial = study.best_trial
print(f'Best trial: {best_trial.number}')
print(f'Best value: {-best_trial.value}')
print(f'Best params: {best_trial.params}')

In [None]:
# Print best hyperparameters of the Optuna study, for comparison only.
# The Optuna cell is optional, so `study` only exists when that cell was run in
# this kernel; the guard lets this cell run either way.
if 'study' in globals():
    print("Best hyperparameters:", study.best_params)

# Hyperparameters used to train the final models. These fixed values are always
# the ones used, whether or not the study was run.
# NOTE(review): presumably the best trial of an earlier Optuna run (all values
# lie inside the search ranges of `objective`) - confirm.
best_xgb_params = {
    'n_estimators': 2906,
    'learning_rate': 0.028283636339476847,
    'max_depth': 7,
    'subsample': 0.9209937477043662,
    'colsample_bytree': 0.8045338627186344,
    'reg_alpha': 0.3276660546210266,
    'reg_lambda': 5.834782742225015,
    'min_child_weight': 4,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': CONFIG.seed
}

# Per-trait containers: fitted regressor, validation predictions, validation R2
models, val_predictions, r2_scores = {}, {}, {}

# Fit one independent XGBoost regressor for every target column
trait_progress = tqdm(enumerate(CONFIG.target_columns),
                      total=len(CONFIG.target_columns), desc="Training")
for i, trait in trait_progress:
    # Register the regressor first, then fit it on the i-th target column
    regressor = xgb.XGBRegressor(**best_xgb_params)
    models[trait] = regressor
    regressor.fit(x_train_full_poly, y_train[:, i])

    # Score the freshly fitted regressor on the held-out validation rows
    trait_val_predictions = regressor.predict(x_val_poly)
    val_predictions[trait] = trait_val_predictions
    r2_scores[trait] = r2_score(y_val[:, i], trait_val_predictions)
    print(f'R2 score for {trait} on validation set: {r2_scores[trait]}')

print("----------FINISHING TRAINING----------")
# Unweighted mean of the per-trait validation R2 scores
average_r2_val = sum(r2_scores.values()) / len(r2_scores)
print(f'Average R2 score on validation set: {average_r2_val}')

In [None]:
# Predict every trait on the combined test features (embeddings + polynomial
# features) with the regressor that was trained for that trait
test_predictions = {
    trait: models[trait].predict(x_test_poly)
    for trait in tqdm(CONFIG.target_columns)
}

In [None]:
# Submission layout: (prediction key, output column) pairs in the order the
# columns appear in the file
submission_layout = [
    ('X4_mean', 'X4'),
    ('X11_mean', 'X11'),
    ('X18_mean', 'X18'),
    ('X26_mean', 'X26'),
    ('X50_mean', 'X50'),
    ('X3112_mean', 'X3112'),
]

# Start from the test ids and append one prediction column per trait
submission_df = pd.DataFrame({'id': test_data['id']}).assign(
    **{column: test_predictions[key] for key, column in submission_layout}
)

# Write the submission without the DataFrame index
submission_df.to_csv('v25.csv', index=False)