In [1]:
import os
import os
import torch
import timm
import pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import albumentations as A
from PIL import Image
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import numpy as np
from tqdm import tqdm
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, OneCycleLR, CosineAnnealingLR, ReduceLROnPlateau, StepLR, LambdaLR
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import Pool, CatBoostRegressor
from torchvision import transforms

from utils import seed_torch, current_date_time, init_logger

# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ["HTTP_PROXY"] = "http://127.0.0.1:7890"
# os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"{device = }")

class CFG:
    n_fold = 10
    seed = 44
    num_workers = 4
    target_cols = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']
    sub_cols = ['X4', 'X11', 'X18', 'X26', 'X50', 'X3112']
    fold_list = [0]
    full_data_train = True

    use_clip = True
    clip_min = 0.001 # 0.001
    clip_max = 0.990 # 0.990
    
    use_label_norm = False
    use_log10 = False
    label_norm_method = 'minmax' # minmax zscore
    
    use_feature_norm = True
    first_n_poly_feats = 1000

    lr = 0.06 # 0.06
    iterations = 1500 # 1500

CFG.num_outputs = len(CFG.target_cols)

seed_torch(CFG.seed)
cur_time = current_date_time()
cur_time_abbr = cur_time.replace("-", "").replace(":", "").replace(" ", "")[4:12]
output_dir = './output'
output_dir = f"{output_dir}/{cur_time_abbr}_catboost"
os.makedirs(output_dir, exist_ok=True)
LOGGER = init_logger(f'{output_dir}/train_catboost.log')

for key, value in CFG.__dict__.items():
    if not key.startswith("__"):
        LOGGER.info(f"{key} = {value}")

n_fold = 10
seed = 44
num_workers = 4
target_cols = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']
sub_cols = ['X4', 'X11', 'X18', 'X26', 'X50', 'X3112']
fold_list = [0]
full_data_train = True
use_clip = True
clip_min = 0.001
clip_max = 0.99
use_label_norm = False
use_log10 = False
label_norm_method = minmax
use_feature_norm = True
first_n_poly_feats = 1000
lr = 0.06
iterations = 1500
num_outputs = 6


device = device(type='cuda')


In [2]:
# 读取数据
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

train_df['file_path'] = train_df['id'].apply(lambda x: f'./data/train_images/{x}.jpeg')
test_df['file_path'] = test_df['id'].apply(lambda x: f'./data/test_images/test_images/{x}.jpeg')

print(f"{train_df.shape = }")
print(f"{test_df.shape = }")

# need run get_embedding.ipynb first
train_image_embeddings = np.load(f'train_dinov2_embeds.npy')
test_image_embeddings = np.load(f'test_dinov2_embeds.npy')
print(f"{train_image_embeddings.shape = }")
print(f"{test_image_embeddings.shape = }")


# 删除列
drop_cols = ['id', 'file_path'] + CFG.target_cols
# 特征列
feature_cols = [col for col in train_df.columns if col not in drop_cols]
# 5-fold交叉验证
kf = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

train_df.shape = (43363, 171)
test_df.shape = (6391, 165)
train_image_embeddings.shape = (43363, 1536)
test_image_embeddings.shape = (6391, 1536)


normalization

In [3]:
def label_norm(df, use_log10, norm_method, minmax_dict=None):
    if use_log10:
        # 对6标签做 log10 处理
        for col in CFG.target_cols:
            df[col] = np.log10(df[col] + 1)
            print(f"log10 {col}: {df[col].min() = }, {df[col].max() = }")
    
    if norm_method == 'minmax':
        # ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean'] 的最大值和最小值
        if minmax_dict is None:
            minmax_dict = {}
            for col in CFG.target_cols:
                minmax_dict[col] = {}
                minmax_dict[col]['min'] = df[col].min()
                minmax_dict[col]['max'] = df[col].max()
            print(f"{minmax_dict = }")

        # 对6列标签进行归一化
        for col in CFG.target_cols:
            df[col] = (df[col] - minmax_dict[col]['min']) / (minmax_dict[col]['max'] - minmax_dict[col]['min'])
            print(f"minmax {col}: {df[col].min() = }, {df[col].max() = }")


    return df, minmax_dict


def feature_norm(df, feature_cols, scaler):
    if scaler is None:
        scaler = StandardScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols]).astype(np.float32)
    else:
        df[feature_cols] = scaler.transform(df[feature_cols]).astype(np.float32)
    return df, scaler

In [4]:
for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
    if fold not in CFG.fold_list:
        continue

    LOGGER.info(f'Fold {fold + 1}/{kf.n_splits}')
    valid_scores = {target: -999 for target in CFG.target_cols}

    if CFG.full_data_train:
        train_subset = train_df
    else:
        train_subset = train_df.iloc[train_idx]
    val_subset = train_df.iloc[val_idx]
    LOGGER.info(f"train_subset.shape = {train_subset.shape}, val_subset.shape = {val_subset.shape}")

    if CFG.use_clip:
        for col in CFG.target_cols:
            lower_quantile = train_subset[col].quantile(CFG.clip_min)
            upper_quantile = train_subset[col].quantile(CFG.clip_max)
            train_subset = train_subset[(train_subset[col] >= lower_quantile) & (train_subset[col] <= upper_quantile)]
            after_clip_index = train_subset.index
        LOGGER.info(f"after clip, train_subset.shape = {train_subset.shape}, val_subset.shape = {val_subset.shape}")
    
    if CFG.use_label_norm:
        train_subset, minmax_dict = label_norm(train_subset, CFG.use_log10, CFG.label_norm_method, None)
        val_subset, _  = label_norm(val_subset, CFG.use_log10, CFG.label_norm_method, minmax_dict)

    if CFG.use_feature_norm:
        # 特征归一化
        train_subset, scaler = feature_norm(train_subset, feature_cols, None)
        val_subset, _ = feature_norm(val_subset, feature_cols, scaler)
    
    # embedding
    if CFG.full_data_train:
        train_image_embeddings_subset = train_image_embeddings[list(set(train_subset.index))]
    else:
        new_train_idx = list(set(train_idx) & set(train_subset.index))
        train_image_embeddings_subset = train_image_embeddings[new_train_idx]
    valid_image_embeddings_subset = train_image_embeddings[val_idx]

    # feature
    train_features = train_subset[feature_cols].values
    valid_features = val_subset[feature_cols].values
    
    # y
    train_y = train_subset[CFG.target_cols].values
    valid_y = val_subset[CFG.target_cols].values


    # 简单的特征工程
    first_n_poly_feats = CFG.first_n_poly_feats
    train_features_mask_all = np.concatenate(
        (PolynomialFeatures(2).fit_transform(train_features)[:, :first_n_poly_feats], train_image_embeddings_subset), axis=1
    )
    val_features_mask_all = np.concatenate(
        (PolynomialFeatures(2).fit_transform(valid_features)[:, :first_n_poly_feats], valid_image_embeddings_subset), axis=1
    )


    train_features_mask_df = pd.DataFrame(train_features_mask_all)
    train_features_mask_df['emb'] = list(train_image_embeddings_subset)

    val_features_mask_df = pd.DataFrame(val_features_mask_all)
    val_features_mask_df['emb'] = list(valid_image_embeddings_subset)

    LOGGER.info(f"train_features_mask_df.shape = {train_features_mask_df.shape}, val_features_mask_df.shape = {val_features_mask_df.shape}")
    LOGGER.info(f"train_y.shape = {train_y.shape}, valid_y.shape = {valid_y.shape}")

    LOGGER.info("Start training CatBoost")
    models = {}
    scores = {}
    for i, col in tqdm(enumerate(CFG.target_cols), total=len(CFG.target_cols)):
        y_curr = train_y[:, i]
        y_curr_val = valid_y[:, i]
        train_pool = Pool(train_features_mask_df, y_curr, embedding_features=['emb'])
        val_pool = Pool(val_features_mask_df, y_curr_val, embedding_features=['emb'])
        
        # tried to tune these parameters but without real success 
        model = CatBoostRegressor(
            iterations=CFG.iterations,
            learning_rate=CFG.lr,
            loss_function='RMSE', 
            verbose=0, 
            )
        model.fit(train_pool)
        models[col] = model
        
        y_curr_val_pred = model.predict(val_pool)
        
        r2_col = r2_score(y_curr_val, y_curr_val_pred)
        scores[col] = r2_col
        LOGGER.info(f'Target: {col}, R2: {r2_col:.3f}')
        
    # this val score somewhat correlates with submission score bit I didn't really bother
    mean_r2 = np.mean(list(scores.values()))
    LOGGER.info(f'Mean R2: {mean_r2:.3f}')


    # test prediction
    LOGGER.info("Start Testing")
    if CFG.use_label_norm:
        test_df, _ = label_norm(test_df, CFG.use_log10, 'minmax', minmax_dict)
    if CFG.use_feature_norm:
        test_df, _ = feature_norm(test_df, feature_cols, scaler)

    test_features = test_df[feature_cols].values

    test_features_all = np.concatenate(
        (PolynomialFeatures(2).fit_transform(test_features)[:, :first_n_poly_feats], test_image_embeddings), axis=1
    )

    test_features_mask_df = pd.DataFrame(test_features_all)
    test_features_mask_df['emb'] = list(test_image_embeddings)

    submission = pd.DataFrame({'id': test_df['id']})
    submission[CFG.target_cols] = 0
    submission.columns = submission.columns.str.replace('_mean', '')

    for i, col in enumerate(CFG.target_cols):
        test_pool = Pool(test_features_mask_df, embedding_features=['emb'])
        col_pred = models[col].predict(test_pool)
        submission[col.replace('_mean', '')] = col_pred

    submission.to_csv(f'{output_dir}/sub_{cur_time_abbr}_f{fold}_cv{mean_r2:.3f}.csv', index=False)
    submission.head()

Fold 1/10
train_subset.shape = (43363, 171), val_subset.shape = (4337, 171)
after clip, train_subset.shape = (40587, 171), val_subset.shape = (4337, 171)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature_cols] = scaler.transform(df[feature_cols]).astype(np.float32)
train_features_mask_df.shape = (40587, 2537), val_features_mask_df.shape = (4337, 2537)
train_y.shape = (40587, 6), valid_y.shape = (4337, 6)
Start training CatBoost
  0%|          | 0/6 [00:00<?, ?it/s]Target: X4_mean, R2: 0.765
 17%|█▋        | 1/6 [05:08<25:43, 308.73s/it]Target: X11_mean, R2: 0.718
 33%|███▎      | 2/6 [09:58<19:49, 297.42s/it]Target: X18_mean, R2: 0.809
 50%|█████     | 3/6 [14:46<14:40, 293.44s/it]Target: X26_mean, R2: 0.408
 67%|██████▋   | 4/6 [19:54<09:57, 298.97s/it]Target: X5

In [5]:
if len(CFG.fold_list) > 1:
    from glob import glob
    # Ensembling
    LOGGER.info("Start Ensembling")
    all_submissions = []
    for path in glob(f'{output_dir}/*.csv'):
        all_submissions.append(pd.read_csv(path))

    LOGGER.info(f"Total {len(all_submissions)} submissions")
    # mean
    ens_submission = all_submissions[0].copy()
    ens_submission[CFG.sub_cols] = 0
    for sub in all_submissions:
        ens_submission[CFG.sub_cols] += sub[CFG.sub_cols] / len(all_submissions)
    ens_submission.to_csv(f'{output_dir}/sub_{cur_time_abbr}_ens{len(all_submissions)}.csv', index=False)

In [6]:
# from glob import glob
# csv_files = [
#     './output/08121650_catboost/sub_08121513_f0_cv0.474.csv',
#     './output/08121650_catboost/sub_08121513_f2_cv0.466.csv',
#     './output/08121650_catboost/sub_08121650_f8_cv0.479.csv',
#     './output/08121650_catboost/sub_08121650_f9_cv0.467.csv',
#  ]
# all_submissions = []
# for path in csv_files:
#     all_submissions.append(pd.read_csv(path))
# print(f"Total {len(all_submissions)} submissions")

# # mean
# ens_submission = all_submissions[0].copy()
# ens_submission[CFG.sub_cols] = 0
# for sub in all_submissions:
#     ens_submission[CFG.sub_cols] += sub[CFG.sub_cols] / len(all_submissions)
# ens_submission.to_csv(f'{output_dir}/sub_{cur_time_abbr}_ens4best.csv', index=False)

In [7]:
# from glob import glob
# csv_files = [
#     './output/08122049_catboost/sub_08122049_f0_cv0.689.csv', # seed 42
#     './output/08122124_catboost/sub_08122124_f0_cv0.692.csv', # seed 43
#     './output/08122154_catboost/sub_08122154_f0_cv0.677.csv', # seed 44

#  ]
# all_submissions = []
# for path in csv_files:
#     all_submissions.append(pd.read_csv(path))
# print(f"Total {len(all_submissions)} submissions")

# # mean
# ens_submission = all_submissions[0].copy()
# ens_submission[CFG.sub_cols] = 0
# for sub in all_submissions:
#     ens_submission[CFG.sub_cols] += sub[CFG.sub_cols] / len(all_submissions)
# ens_submission.to_csv(f'{output_dir}/sub_{cur_time_abbr}_ens3seed.csv', index=False)

Total 3 submissions
