# Set Up Environment

In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

import torch
from torchvision.transforms import v2
from torch import nn

from torchvision.io import read_image

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

import lightgbm as lgb
import xgboost as xgb

from catboost import Pool, CatBoostRegressor


# Use the first CUDA device when one is visible, otherwise fall back to the
# CPU and tell the user why everything is about to be slow.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
  print('Everything looks good; continue')
else:
  device = torch.device("cpu")
  print('GPU is not detected. Make sure you have chosen the right runtime type')


# Load data

In [None]:
def _image_paths(ids, image_dir):
  """Map every id in `ids` to the path '<image_dir>/<id>.jpeg'."""
  return ids.apply(lambda image_id: os.path.join(image_dir, f'{image_id}.jpeg'))


# Training table: read the CSV, then expose each sample's image path as the
# second column (right after the first CSV column).
df = pd.read_csv('./data/train.csv')
img_path_df = _image_paths(df['id'], './data/train_images')
df.insert(1, 'img_path', img_path_df)

# Test table: same treatment, pointing at the test image folder.
test_df = pd.read_csv('./data/test.csv')
img_path_test_df = _image_paths(test_df['id'], './data/test_images')
test_df.insert(1, 'img_path', img_path_test_df)

# Config

In [None]:
class CFG:
  """Notebook-wide constants: column groups and training hyperparameters."""

  # Columns of the training table that hold the regression targets.
  TARGET_COLUMNS = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']
  # Names given to the prediction columns of the submission file.
  PRED_COLUMNS = ['X4', 'X11', 'X18', 'X26', 'X50', 'X3112']
  # Every column of `df` that is neither an id, an image path, nor a target.
  FEATURE_COLUMNS = df.columns.drop(['id', 'img_path'] + TARGET_COLUMNS).to_list()

  BATCH_SIZE = 64
  NUM_EPOCHS = 6

  NUM_TARGETS = len(TARGET_COLUMNS)
  # Hard-coded; not derived from FEATURE_COLUMNS.
  NUM_FEATURES = 163

  LEARNING_RATE = 0.001
  WEIGHT_DECAY = 0.0001

  # Seed shared by the steps that accept a random_state.
  SEED = 42

# Show the resolved column groups, one list per line.
for column_group in (CFG.TARGET_COLUMNS, CFG.PRED_COLUMNS, CFG.FEATURE_COLUMNS):
  print(column_group)


# Preprocess

## remove outliers

In [None]:
# For each target in turn, keep only the rows lying strictly between its 0.1%
# and 98% quantiles. The quantiles are taken on the rows that survived the
# previous targets, so the filters compound.
print(df.shape)
for target in CFG.TARGET_COLUMNS:
  values = df[target]
  low = values.quantile(0.001)
  high = values.quantile(0.98)
  df = df[(values < high) & (values > low)]
print(df.shape)
df[CFG.TARGET_COLUMNS].hist(bins=50, figsize=(20, 10))


## split data

In [None]:
# Hold out 20% of the filtered rows for validation; the fixed seed makes the
# split repeatable.
train_df, val_df = train_test_split(
  df,
  test_size=0.2,
  random_state=CFG.SEED,
)
print(train_df.shape, val_df.shape)


# Get image embedding

In [None]:
def get_embeddings(model, df, transform):
  """Run `model` over every image listed in `df` and collect its outputs.

  Args:
    model: module called on a stacked batch of transformed images; it is put
      in eval mode first.
    df: table with an 'img_path' column; rows are processed in order.
    transform: callable applied to each image returned by `read_image`.

  Returns:
    The per-batch model outputs, moved to the CPU and concatenated with
    `torch.cat` (one entry per row of `df`, in row order).
  """
  model.eval()
  paths = df['img_path'].values
  collected = []
  # Inference only: no gradients are needed.
  with torch.no_grad():
    for start in tqdm(range(0, len(df), CFG.BATCH_SIZE)):
      batch_paths = paths[start:start + CFG.BATCH_SIZE]
      batch = torch.stack([transform(read_image(path)) for path in batch_paths])
      batch = batch.to(device)
      # Move each result off the device right away so only one batch of
      # outputs stays there at a time.
      collected.append(model(batch).cpu())
  return torch.cat(collected)


def _make_transform():
  """Build the image preprocessing pipeline used before embedding.

  Steps: resize to 140, convert to an image tensor, cast to float32 with
  value scaling, then normalize per channel with the given mean/std (the
  commonly used ImageNet statistics).
  """
  return v2.Compose([
    v2.Resize(140),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])


# The three splits share the same pipeline definition; each one still gets
# its own Compose instance.
train_transform = _make_transform()
val_transform = _make_transform()
test_transform = _make_transform()



# Fetch the DINOv2 backbone through torch.hub and place it on the selected device.
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14_reg')
model = model.to(device)

# Embed each split with its matching transform.
train_embeddings = get_embeddings(model, train_df, train_transform)
val_embeddings = get_embeddings(model, val_df,  val_transform)
test_embeddings = get_embeddings(model, test_df, test_transform)

# Report the shape of every embedding tensor.
for label, split_embeddings in (
  ("Train embeddings: ", train_embeddings),
  ("Val embeddings: ", val_embeddings),
  ("Test embeddings: ", test_embeddings),
):
  print(label, split_embeddings.shape)

# Get final features

In [None]:
def cat_embedding(df, embeddings):
  """Append one row of embedding values to each row of `df`, column-wise.

  Rows are paired by position, not by index label, so `df` may carry any
  index (for example the shuffled one left by a train/validation split).

  Args:
    df: DataFrame with one row per sample.
    embeddings: array-like (NumPy array or CPU tensor) with one row per row
      of `df`, in the same order.

  Returns:
    A new DataFrame with a fresh 0..n-1 index: the columns of `df` followed
    by the embedding columns, named '0', '1', ...

  Raises:
    ValueError: if `df` and `embeddings` do not have the same number of rows.
  """
  emb_df = pd.DataFrame(np.asarray(embeddings))
  if len(emb_df) != len(df):
    # pd.concat would otherwise pad the shorter side with NaN and hide the
    # mismatch.
    raise ValueError(
      f"Row count mismatch: df has {len(df)} rows but embeddings has {len(emb_df)}"
    )
  # String labels match what a CSV round trip of the result produces, and
  # avoid mixing int and str column names, which recent scikit-learn versions
  # reject when validating feature names.
  emb_df.columns = emb_df.columns.astype(str)
  return pd.concat([df.reset_index(drop=True), emb_df], axis=1)

# Give every table a clean 0..n-1 index so its rows line up positionally with
# the rows of the matching embedding tensor.
df = df.reset_index(drop=True)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# train_embeddings was computed from train_df, so it must be paired with
# train_df. Pairing it with the full `df` misaligns rows, pads the tail with
# NaN, and leaks the validation rows into the training table.
train_final_df = cat_embedding(train_df, train_embeddings)
val_final_df = cat_embedding(val_df, val_embeddings)
test_final_df = cat_embedding(test_df, test_embeddings)

print("Train final: ", train_final_df.shape)
print("Val final: ", val_final_df.shape)
print("Test final: ", test_final_df.shape)

In [None]:
# Optional: write the merged feature tables to disk (without the index) so
# they can be reloaded later instead of recomputed.
for frame, csv_path in (
  (train_final_df, './data/train_final.csv'),
  (val_final_df, './data/val_final.csv'),
  (test_final_df, './data/test_final.csv'),
):
  frame.to_csv(csv_path, index=False)

# LightGBM

In [None]:
# Optional shortcut: uncomment to reload the cached tables instead of
# recomputing the embeddings.
# train_final_df = pd.read_csv('./data/train_final.csv')
# val_final_df = pd.read_csv('./data/val_final.csv')
# test_final_df = pd.read_csv('./data/test_final.csv')

# Identifier columns and the targets themselves must not be fed to the model.
non_feature_columns = ['id', 'img_path'] + CFG.TARGET_COLUMNS
X_train = train_final_df.drop(columns=non_feature_columns)
y_train = train_final_df[CFG.TARGET_COLUMNS]
X_val = val_final_df.drop(columns=non_feature_columns)
y_val = val_final_df[CFG.TARGET_COLUMNS]

params = dict(
    learning_rate=0.005,
    num_iterations=10000,
    bagging_freq=7,
    bagging_fraction=0.75,
    feature_fraction=0.75,
    lambda_l1=0.01,  # small L1/L2 penalties
    lambda_l2=0.01,
)

print("X_train.values: ", X_train.shape)

# MultiOutputRegressor wraps the single regressor so it can be fitted against
# all target columns at once.
lgbm_model = lgb.LGBMRegressor(**params)
sklearn_model = MultiOutputRegressor(lgbm_model)
sklearn_model.fit(X_train, y_train)

score = sklearn_model.score(X_val, y_val)
print("Score: ", score)

# Predict on the test table.
X_test = test_final_df.drop(columns=['id', 'img_path'])
y_test = sklearn_model.predict(X_test)

# Put the first column of test_df (expected to be the sample id) in front of
# the predictions.
ids = test_df.iloc[:, 0].values
y_test = np.column_stack((ids, y_test))

preds_df = pd.DataFrame(y_test, columns=["id", *CFG.PRED_COLUMNS])
# Stacking ids next to float predictions turned them into floats; cast back.
preds_df = preds_df.astype({"id": int})

preds_df[CFG.PRED_COLUMNS].hist(bins=100, figsize=(20, 10))
preds_df.to_csv('submission.csv', index=False)

# XGBoost

In [None]:

# Identifier columns and the targets themselves must not be fed to the model.
non_feature_columns = ['id', 'img_path'] + CFG.TARGET_COLUMNS
X_train = train_final_df.drop(columns=non_feature_columns)
y_train = train_final_df[CFG.TARGET_COLUMNS]
X_val = val_final_df.drop(columns=non_feature_columns)
y_val = val_final_df[CFG.TARGET_COLUMNS]

xgb_params = dict(
    tree_method="hist",
    learning_rate=0.005,
    random_state=CFG.SEED,
    multi_strategy='multi_output_tree',
    eval_metric=sklearn.metrics.mean_squared_error,
    n_estimators=1000,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Note: this rebinds `sklearn_model`, replacing whatever an earlier cell stored there.
sklearn_model = MultiOutputRegressor(xgb_model)
sklearn_model.fit(X_train, y_train)

score = sklearn_model.score(X_val, y_val)
print("Score: ", score)

# Predict on the test table.
X_test = test_final_df.drop(columns=['id', 'img_path'])
y_test = sklearn_model.predict(X_test)

# Put the first column of test_df (expected to be the sample id) in front of
# the predictions.
ids = test_df.iloc[:, 0].values
y_test = np.column_stack((ids, y_test))

preds_df = pd.DataFrame(y_test, columns=["id", *CFG.PRED_COLUMNS])
# Stacking ids next to float predictions turned them into floats; cast back.
preds_df = preds_df.astype({"id": int})

preds_df[CFG.PRED_COLUMNS].hist(bins=100, figsize=(20, 10))
# Overwrites any submission.csv written by a previous cell.
preds_df.to_csv('submission.csv', index=False)

# CatBoost

In [None]:
# CatBoost: one model fitted on all target columns with the MultiRMSE loss.
non_feature_columns = ['id', 'img_path'] + CFG.TARGET_COLUMNS
X_train = train_final_df.drop(columns=non_feature_columns)
y_train = train_final_df[CFG.TARGET_COLUMNS]
X_val = val_final_df.drop(columns=non_feature_columns)
y_val = val_final_df[CFG.TARGET_COLUMNS]

params = dict(
    learning_rate=0.05,
    loss_function='MultiRMSE',
    eval_metric='MultiRMSE',
    task_type='GPU',
    iterations=10000,
    boosting_type='Plain',
)

cat_model = CatBoostRegressor(**params)

# The validation split is passed as the evaluation set during fitting.
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))

score = cat_model.score(X_val, y_val)
print("Score: ", score)

# Predict on the test table.
X_test = test_final_df.drop(columns=['id', 'img_path'])
y_test = cat_model.predict(X_test)

# Put the first column of test_df (expected to be the sample id) in front of
# the predictions.
ids = test_df.iloc[:, 0].values
y_test = np.column_stack((ids, y_test))

preds_df = pd.DataFrame(y_test, columns=["id", *CFG.PRED_COLUMNS])
# Stacking ids next to float predictions turned them into floats; cast back.
preds_df = preds_df.astype({"id": int})

preds_df[CFG.PRED_COLUMNS].hist(bins=100, figsize=(20, 10))
# Overwrites any submission.csv written by a previous cell.
preds_df.to_csv('submission.csv', index=False)

# Test loop

In [None]:
# Build a submission from an already-fitted model. Nothing is loaded from disk
# here: the cell uses whatever object is currently bound to `sklearn_model`
# (swap in the commented line below to use `cat_model` instead).
X_test = test_final_df.drop(columns=['id', 'img_path'])

y_test = sklearn_model.predict(X_test)
# y_test = cat_model.predict(X_test)

# Put the first column of test_df (expected to be the sample id) in front of
# the predictions.
ids = test_df.iloc[:, 0].values
y_test = np.column_stack((ids, y_test))

preds_df = pd.DataFrame(y_test, columns=["id", *CFG.PRED_COLUMNS])
# Stacking ids next to float predictions turned them into floats; cast back.
preds_df = preds_df.astype({"id": int})

# Disabled post-processing, kept for reference: undo target scaling or a
# log1p transform if either is applied to the targets before training.
# preds_df[CFG.TARGET_COLUMNS] = target_scaler.inverse_transform(preds_df[CFG.TARGET_COLUMNS])
# preds_df[CFG.PRED_COLUMNS] = np.expm1(preds_df[CFG.PRED_COLUMNS])

preds_df[CFG.PRED_COLUMNS].hist(bins=100, figsize=(20, 10))

preds_df.to_csv('submission.csv', index=False)