In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torchvision
from PIL import Image
from tqdm.notebook import tqdm
from torchvision import transforms
from catboost import Pool, CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing import image

# Load, Split, and Filter data

In [None]:
# Read the tabular data and attach the on-disk location of each sample's image.
train = pd.read_csv('/kaggle/input/cs-480-2024-spring/data/train.csv')
train['file_path'] = train['id'].map(lambda sample_id: f'/kaggle/input/cs-480-2024-spring/data/train_images/{sample_id}.jpeg')

test = pd.read_csv('/kaggle/input/cs-480-2024-spring/data/test.csv')
test['file_path'] = test['id'].map(lambda sample_id: f'/kaggle/input/cs-480-2024-spring/data/test_images/{sample_id}.jpeg')

# Regression targets
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Trim outliers one target at a time: keep only rows strictly between the
# 0.1% and 98% quantiles. The quantiles of each target are taken on the rows
# that survived the previous targets' filters, so the order of mean_columns matters.
for column in mean_columns:
    target_values = train[column]
    lower_quantile = target_values.quantile(0.001)
    upper_quantile = target_values.quantile(0.98)
    keep_rows = target_values.lt(upper_quantile) & target_values.gt(lower_quantile)
    train = train[keep_rows]

# Preprocessing

In [None]:
# Tabular feature columns = every column of `test` except the identifier and
# the columns this notebook adds itself. Selecting by name (rather than the
# positional slice [1:-1]) keeps the selection correct even if this cell is
# re-run after 'extracted_features' has been appended, when 'file_path' would
# no longer be the last column.
non_feature_columns = {'id', 'file_path', 'extracted_features'}
feature_columns = [name for name in test.columns if name not in non_feature_columns]

# Standardise the features: statistics are fitted on the (filtered) training
# rows only and then re-used unchanged for the test rows.
scaler = StandardScaler()
train[feature_columns] = scaler.fit_transform(train[feature_columns].values.astype(np.float32))
test[feature_columns] = scaler.transform(test[feature_columns].values.astype(np.float32))

# Helper for computing DINOv2 image embeddings

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

def get_image_embeddings(model, preprocess_img, batch_size, df):
    """Run `model` over the images listed in `df['file_path']`, batch by batch.

    Args:
        model: callable applied to a stacked image batch on `device`; its
            output must support `.cpu().numpy()`.
        preprocess_img: callable turning one PIL image into a tensor; all
            results in a batch must be stackable with `torch.stack`.
        batch_size: number of images per forward pass.
        df: DataFrame with a 'file_path' column of image paths.

    Returns:
        A list with one NumPy array per row of `df`, in row order.
    """
    embeddings = []
    for start in tqdm(range(0, len(df), batch_size)):
        # .iloc makes the positional slicing explicit; the frame's index is
        # not contiguous once outlier rows have been filtered out.
        img_paths = df['file_path'].iloc[start:start + batch_size]
        processed_imgs = []
        for path in img_paths:
            # The context manager closes the file handle as soon as the image
            # has been converted, instead of leaving it to the garbage collector.
            with Image.open(path) as img:
                # Force 3 channels so grayscale/CMYK files do not break the
                # 3-channel normalisation or the batch stacking.
                processed_imgs.append(preprocess_img(img.convert('RGB')))
        img_tensor = torch.stack(processed_imgs).to(device)
        with torch.no_grad():
            img_embeddings = model(img_tensor)
        embeddings.extend(img_embeddings.cpu().numpy())
    return embeddings

# Use DinoV2 to turn images to image embeddings

In [None]:
# Set to True to recompute the image embeddings from the raw images (~2h15).
# When False, the embedding cell loads precomputed .npy files instead.
RECOMPUTE = False

In [None]:
if RECOMPUTE:
    batch_size = 64
    # DINOv2 ViT-g/14 (register variant) from torch.hub, placed on the
    # notebook-wide `device` and switched to inference mode.
    model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg').to(device)
    model.eval()

    preprocess = transforms.Compose([
        # Bicubic resize (the enum is the named form of PIL's integer code 3).
        transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Compute the embeddings and cache them next to the notebook
    # (np.save appends the '.npy' extension).
    train_img_embeddings = np.array(get_image_embeddings(model, preprocess, batch_size, train))
    np.save('train_image_embeddings', train_img_embeddings)
    test_img_embeddings = np.array(get_image_embeddings(model, preprocess, batch_size, test))
    np.save('test_image_embeddings', test_img_embeddings)
else:
    # Re-use embeddings computed in an earlier run.
    train_img_embeddings = np.load('/kaggle/input/data-emebeddings/train_image_embs_dinov2_vitg14_reg.npy')
    test_img_embeddings = np.load('/kaggle/input/data-emebeddings/test_image_embs_dinov2_vitg14_reg.npy')

# Store one embedding vector (as a plain list) per row, in the same format for
# both branches. NOTE(review): the cached arrays must have exactly one row per
# row of the *filtered* `train` / `test` frames, otherwise pandas raises a
# length-mismatch error here - confirm how the cached files were produced.
train['extracted_features'] = [list(row) for row in train_img_embeddings]
test['extracted_features'] = [list(row) for row in test_img_embeddings]

print(type(train))

# Train CatBoost models

In [None]:
# Training configuration.
# NOTE(review): max_estimators and early_stopping_limit are defined here but
# the training cell in this notebook does not reference them.
max_estimators = 1000
early_stopping_limit = 30
val_size = 0.05

# Targets: the six *_mean columns.
Y_train = train[mean_columns]
# Model inputs: everything except the targets and the image path
# (the path is a string and cannot be fed to the model as a float feature).
X_train = train.drop(columns=[*mean_columns, 'file_path'])

In [None]:
models = {}

# The row split depends only on the number of rows and the seed, so it is the
# same for every target: do it once on the full target frame instead of
# repeating it inside the loop. The resulting rows are identical to splitting
# per column with the same random_state.
X_fit, X_val, Y_fit, Y_val = train_test_split(X_train, Y_train, test_size=val_size, random_state=42)

# One independent CatBoost regressor per target column.
for column in Y_train.columns:
    print("\nTraining for column:", column, "...")

    train_pool = Pool(X_fit, Y_fit[column], embedding_features=['extracted_features'])
    val_pool = Pool(X_val, Y_val[column], embedding_features=['extracted_features'])

    # NOTE(review): the iteration count is hard-coded and no eval set is
    # passed to fit(), so no early stopping takes place.
    model = CatBoostRegressor(iterations=1500, learning_rate=0.03, loss_function='RMSE', verbose=0, random_state=42)
    model.fit(train_pool)
    models[column] = model

    # Score on the held-out validation rows (not the Kaggle test set).
    y_pred = model.predict(val_pool)
    r2 = r2_score(Y_val[column], y_pred)
    print(f"R2 score for column {column}: {r2:.4f}")


# Fill in submit DF with mean of train values by default
### Provides near-0 R2 score values for any columns we don't predict for

In [None]:
# Baseline submission: every target is filled with its training-set mean.
mean_values = Y_train.mean()
submission = pd.DataFrame({'id': test['id']})

# Assign column by column, looking each mean up by its label. Assigning the
# whole Series to a list of columns relies on pandas falling back to
# positional lookups on a label-indexed Series, which is deprecated.
for target_column in Y_train.columns:
    submission[target_column] = mean_values[target_column]

# Rename from '<trait>_mean' to '<trait>' (plain substring replacement).
submission.columns = submission.columns.str.replace('_mean', '', regex=False)
submission.head()

# Predictions for test
## R2 scores look good for all targets - so we predict on everything...

In [None]:
# Build the model input under a new name instead of overwriting `test`, so
# this cell can be re-run without failing on the already-dropped column.
test_features = test.drop(columns='file_path')

# The pool is the same for every target, so build it once outside the loop.
test_pool = Pool(test_features, embedding_features=['extracted_features'])

# Overwrite each baseline column with the prediction of that target's model.
for column in mean_columns:
    submission[column.replace('_mean', '')] = models[column].predict(test_pool)

submission.head()

# Submit!

In [None]:
# Write the submission table to CSV, leaving out the DataFrame index.
submission.to_csv('20915348_pei.csv', index=False)

In [None]:
# Column order for the final file. reindex() arranges the columns in this
# order (a name missing from the file would become an all-NaN column).
submission_order = ['id', 'X4', 'X11', 'X18', 'X26', 'X50', 'X3112']

# Read the saved submission back, reorder its columns, and overwrite the file.
df = pd.read_csv('20915348_pei.csv').reindex(columns=submission_order)
df.to_csv('20915348_pei.csv', index=False)