In [1]:
import yaml
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold

from utils import obj
from sdv.tabular.ctgan import CTGAN

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

pd.options.display.max_columns=100

import timm
import torch
import torch.nn as nn
import cv2
import albumentations as A
from tqdm import tqdm
from albumentations.pytorch import ToTensorV2

from sklearn.decomposition import PCA
from utils import seed_everything
seed_everything(42)

In [None]:
# Location of the competition data, relative to the notebook.
data_path = '../open/'

# Clinical metadata sheet (not referenced again in the visible cells).
info = pd.read_excel(data_path + 'clinical_info.xlsx')

# Train / test tables and the submission template, read in that order.
df_train, df_test, sub = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Image paths in the CSVs start with './'; point them at the data folder instead.
for frame in (df_train, df_test):
    frame['img_path'] = frame['img_path'].map(lambda path: path.replace('./', '../open/'))

In [None]:
# Per-image embedding vectors, filled in the same order as the rows of
# df_train / df_test.
train_img_vectors = []
test_img_vectors = []

# Pre-processing: resize to 224x224, then scale pixels with mean 0.5 / std 0.5
# per channel.
resize = A.Resize(224, 224)
normalize = A.transforms.Normalize(
    mean=(0.5, 0.5, 0.5),
    std=(0.5, 0.5, 0.5),
    max_pixel_value=255.0,
    p=1.0)
to_tensor = ToTensorV2()

# Pretrained backbone with its classifier replaced by Identity, so the forward
# pass returns the features that would otherwise feed the classifier.
model = timm.create_model(model_name='tf_efficientnetv2_l_in21k', pretrained=True, in_chans=3)
model.classifier = nn.Identity()
model.to('cuda')
model.eval()


def extract_img_vector(img_path):
    """Return the backbone embedding of one image as a NumPy array.

    Args:
        img_path: path of the image file to embed.

    Returns:
        The squeezed model output, moved to the CPU and converted to NumPy.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # fail here with the offending path instead of deep inside the resize.
        raise FileNotFoundError(f'cv2.imread could not read image: {img_path}')
    # NOTE(review): cv2.imread returns channels in BGR order; pretrained weights
    # presumably expect RGB. Left unchanged so previously cached vectors remain
    # comparable - confirm whether a BGR->RGB conversion is wanted.
    img = normalize(image=resize(image=img)['image'])['image']
    img = to_tensor(image=img)['image'].unsqueeze(0).to('cuda')

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        return model(img).squeeze().cpu().numpy()


for img_path in tqdm(df_train['img_path']):
    train_img_vectors.append(extract_img_vector(img_path))

for img_path in tqdm(df_test['img_path']):
    test_img_vectors.append(extract_img_vector(img_path))

In [None]:
# One row per image, one 'img_rep_<k>' column per embedding dimension
# (1-based). The width is taken from the first training vector and reused
# for the test frame.
img_rep_cols = [f'img_rep_{k}' for k in range(1, train_img_vectors[0].shape[0] + 1)]

train_img_df = pd.DataFrame(np.array(train_img_vectors), columns=img_rep_cols)
test_img_df = pd.DataFrame(np.array(test_img_vectors), columns=img_rep_cols)

In [None]:
# Cache the embeddings on disk so later cells can reload them without
# re-running the image model.
for frame, out_path in (
    (train_img_df, '../open/train_img_vector.csv'),
    (test_img_df, '../open/test_img_vector.csv'),
):
    frame.to_csv(out_path, index=False)

In [14]:
# Reload the raw tables and the cached image embeddings.
data_path = '../open/'
df_train = pd.read_csv(data_path + 'train.csv')
df_test = pd.read_csv(data_path + 'test.csv')
train_img_df = pd.read_csv(data_path + 'train_img_vector.csv')
test_img_df = pd.read_csv(data_path + 'test_img_vector.csv')

perm_res = []
FOLDS = 5
n_components = 10

# fe1: '암의 장경' divided by '암의 개수', treating a missing count as 1.
for frame in (df_train, df_test):
    frame['fe1'] = frame['암의 장경'] / frame['암의 개수'].fillna(1)

# Compress the image embeddings with PCA fitted on the training images only,
# then append the components as pca_1 .. pca_<n_components>.
pca = PCA(n_components=n_components, random_state=42)
train_pca = pca.fit_transform(train_img_df.values)
test_pca = pca.transform(test_img_df.values)

pca_cols = [f'pca_{k}' for k in range(1, n_components + 1)]
df_train = pd.concat([df_train, pd.DataFrame(train_pca, columns=pca_cols)], axis=1)
df_test = pd.concat([df_test, pd.DataFrame(test_pca, columns=pca_cols)], axis=1)

# Bucket the two size features into 10-wide bins up to 100, plus (100, 200].
size_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200]
for frame in (df_train, df_test):
    for col in ('암의 장경', 'fe1'):
        frame[col] = pd.cut(frame[col], size_bins)

def basic_set():
    """Return the model's feature columns and the binned size columns.

    Returns:
        A ``(use_features, cat_features)`` tuple. ``use_features`` is a fresh
        list of the clinical columns, ``fe1`` and one ``pca_<k>`` name per
        component (``k`` from 1 to the module-level ``n_components``).
        ``cat_features`` names the two binned size columns.
    """
    clinical_cols = [
        '나이',
        '진단명', '암의 위치', '암의 개수', '암의 장경', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
        'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category',
        'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score',
        'KI-67_LI_percent',
        'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation',
        'fe1',
    ]
    image_cols = [f'pca_{k}' for k in range(1, n_components + 1)]
    binned_cols = ['암의 장경', 'fe1']
    return clinical_cols + image_cols, binned_cols

# Derive two year-based columns from the '수술연월일' date column:
# the year itself and the number of years between it and 2022.
for frame in (df_train, df_test):
    surgery_year = pd.to_datetime(frame['수술연월일']).dt.year
    frame['due_date'] = 2022 - surgery_year
    frame['date_year'] = surgery_year

# Frequency encoding: replace each value by its relative frequency in the
# training set. The test column is mapped before the training column is
# overwritten, so both use the same training-set frequencies.
# ('fe1' is deliberately not frequency-encoded.)
freq_encoded_cols = [
    '나이', '진단명', '암의 위치', '암의 개수', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
    'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category',
    'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score',
    'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation', 'due_date', 'date_year',
]
for col in freq_encoded_cols:
    train_freq = df_train[col].value_counts(normalize=True)
    df_test[col] = df_test[col].map(train_freq)
    df_train[col] = df_train[col].map(train_freq)

# Stratified, shuffled folds on the target; materialised so folds can be indexed.
skf = StratifiedKFold(n_splits=FOLDS, random_state=42, shuffle=True)
splits = list(skf.split(df_train, df_train['N_category']))


# Cross-validated CatBoost training.
#   oof       - out-of-fold positive-class probability for every training row
#   preds     - test probability averaged over the folds
#   preds_arr - per-fold test probabilities, one column per fold
#   fi_df     - per-fold feature importances stacked into one frame
fi_frames = []
oof = np.zeros(len(df_train))
preds = np.zeros(len(df_test))
# Sized from FOLDS (was hard-coded to 5) so changing FOLDS cannot overflow it.
preds_arr = np.zeros([len(df_test), FOLDS])

for fold in range(FOLDS):
    print('start fold :', fold)
    train_idx, valid_idx = splits[fold]
    test = df_test.copy()

    use_features, cat_features = basic_set()
    use_features += ['due_date', 'date_year']

    tt = df_train.loc[train_idx, use_features].reset_index(drop=True)
    tt_target = df_train.loc[train_idx, 'N_category'].values
    vv = df_train.loc[valid_idx, use_features].reset_index(drop=True)
    vv_target = df_train.loc[valid_idx, 'N_category'].values

    # Turn the binned size columns into integer codes. The code book is built
    # from the string form of the full training column, so missing values
    # (rendered as 'nan') receive a code of their own.
    for col in ['암의 장경', 'fe1']:
        tmp_dict = {label: code for code, label in enumerate(np.unique(df_train[col].astype(str)))}
        tt[col] = tt[col].astype(str).map(tmp_dict).astype(int)
        vv[col] = vv[col].astype(str).map(tmp_dict).astype(int)
        test[col] = test[col].astype(str).map(tmp_dict).astype(int)

    test = test[use_features].reset_index(drop=True)

    cat = CatBoostClassifier(
        n_estimators=5000,
        learning_rate=0.03,
        allow_writing_files=False,
        logging_level='Silent',
    )
    # The binned columns are passed as plain integers; cat_features is not
    # handed to CatBoost.
    cat.fit(tt, tt_target, eval_set=[(vv, vv_target)], verbose=1000, early_stopping_rounds=500)

    oof[valid_idx] = cat.predict_proba(vv)[:, 1]

    # Score the test set once per fold and reuse the result for both outputs.
    fold_test_proba = cat.predict_proba(test)[:, 1]
    preds += fold_test_proba / FOLDS
    preds_arr[:, fold] = fold_test_proba

    fi_frames.append(
        pd.DataFrame(zip(use_features, cat.feature_importances_), columns=['feature', 'value'])
    )

# Concatenate once instead of growing the frame inside the loop.
fi_df = pd.concat(fi_frames)

start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4


In [15]:
# Out-of-fold macro-F1 noted for successive feature variants:
#   no vc : 0.8289984609861489
#   vc    : 0.8369867959304704
#   pca10 : 0.838999838999839
oof_labels = (oof > 0.5).astype(int)
f1_score(df_train['N_category'], oof_labels, average='macro')

0.838999838999839

In [5]:
# Count how many test labels differ from a previously saved submission.
b = pd.read_csv('../submit/catboost_ctgan60e_600s.csv')
c = pd.Series((preds > 0.5).astype(int))

np.not_equal(c, b['N_category']).sum()

10

In [6]:
# Average each feature's importance over the folds, most important first.
mean_importance = fi_df.groupby('feature').mean().reset_index()
mean_importance.sort_values('value', ascending=False)

Unnamed: 0,feature,value
18,date_year,10.816105
19,due_date,10.323797
14,NG,8.008914
11,HG_score_2,6.989409
22,pca_2,5.950414
2,DCIS_or_LCIS_여부,5.22544
21,pca_1,4.426856
13,KI-67_LI_percent,4.150771
26,암의 장경,3.96354
23,나이,3.751106


In [7]:
# Class shares of the thresholded out-of-fold labels next to the true target.
oof_label_share = pd.Series((oof > 0.5).astype(int)).value_counts(normalize=True)
target_share = df_train['N_category'].value_counts(normalize=True)
oof_label_share, target_share

(0    0.523
 1    0.477
 dtype: float64,
 1    0.514
 0    0.486
 Name: N_category, dtype: float64)

In [8]:
# Predicted class shares on the test set at thresholds 0.4 and 0.5.
tuple(
    pd.Series((preds > cutoff).astype(int)).value_counts(normalize=True)
    for cutoff in (0.4, 0.5)
)

(1    0.5
 0    0.5
 dtype: float64,
 0    0.54
 1    0.46
 dtype: float64)

In [10]:
# Fill the submission template with labels thresholded at 0.5 (inclusive)
# and write it out.
sub = pd.read_csv(data_path + 'sample_submission.csv')
sub['N_category'] = (preds >= 0.5).astype(int)
sub.to_csv('../submit/catboost_img_pca_vc__ctgan40e_200s.csv', index=False)

In [None]:
# Reload the raw tables. train_img_df / test_img_df are not reloaded here and
# must already exist from an earlier cell.
data_path = '../open/'
df_train = pd.read_csv(data_path + 'train.csv')
df_test = pd.read_csv(data_path + 'test.csv')

perm_res = []
FOLDS = 5
n_components = 2

# fe1: '암의 장경' divided by '암의 개수', treating a missing count as 1.
for frame in (df_train, df_test):
    frame['fe1'] = frame['암의 장경'] / frame['암의 개수'].fillna(1)

# PCA fitted on the training embeddings only; components appended as pca_<k>.
pca = PCA(n_components=n_components, random_state=42)
train_pca = pca.fit_transform(train_img_df.values)
test_pca = pca.transform(test_img_df.values)

pca_cols = [f'pca_{k}' for k in range(1, n_components + 1)]
df_train = pd.concat([df_train, pd.DataFrame(train_pca, columns=pca_cols)], axis=1)
df_test = pd.concat([df_test, pd.DataFrame(test_pca, columns=pca_cols)], axis=1)

# Bucket the two size features into 10-wide bins up to 100, plus (100, 200].
size_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200]
for frame in (df_train, df_test):
    for col in ('암의 장경', 'fe1'):
        frame[col] = pd.cut(frame[col], size_bins)

def basic_set():
    """Build the feature lists used by the CTGAN-augmented run.

    Returns:
        ``(use_features, cat_features)``: a new list holding the clinical
        columns, ``fe1`` and ``pca_1`` .. ``pca_<n_components>`` (read from the
        module-level ``n_components``), and the list of binned size columns.
    """
    base_cols = [
        '나이',
        '진단명', '암의 위치', '암의 개수', '암의 장경', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
        'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category',
        'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score',
        'KI-67_LI_percent',
        'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation',
        'fe1',
    ]
    component_cols = ['pca_' + str(k) for k in range(1, n_components + 1)]
    return base_cols + component_cols, ['암의 장경', 'fe1']

# Year of the '수술연월일' date, and the number of years between it and 2022.
for frame in (df_train, df_test):
    surgery_year = pd.to_datetime(frame['수술연월일']).dt.year
    frame['due_date'] = 2022 - surgery_year
    frame['date_year'] = surgery_year

# Stratified, shuffled folds on the target; materialised so folds can be indexed.
skf = StratifiedKFold(n_splits=FOLDS, random_state=42, shuffle=True)
splits = list(skf.split(df_train, df_train['N_category']))


# Cross-validated CatBoost training with CTGAN-generated rows added to every
# training fold.
#   oof       - out-of-fold positive-class probability for every training row
#   preds     - test probability averaged over the folds
#   preds_arr - per-fold test probabilities, one column per fold
#   fi_df     - per-fold feature importances stacked into one frame
fi_frames = []
oof = np.zeros(len(df_train))
preds = np.zeros(len(df_test))
# Sized from FOLDS (was hard-coded to 5) so changing FOLDS cannot overflow it.
preds_arr = np.zeros([len(df_test), FOLDS])

for fold in range(FOLDS):
    print('start fold :', fold)
    train_idx, valid_idx = splits[fold]
    test = df_test.copy()

    use_features, cat_features = basic_set()
    use_features += ['due_date', 'date_year']

    tt = df_train.loc[train_idx, use_features].reset_index(drop=True)
    tt_target = df_train.loc[train_idx, 'N_category'].values
    vv = df_train.loc[valid_idx, use_features].reset_index(drop=True)
    vv_target = df_train.loc[valid_idx, 'N_category'].values

    # Turn the binned size columns into integer codes. The code book is built
    # from the string form of the full training column, so missing values
    # (rendered as 'nan') receive a code of their own.
    for col in ['암의 장경', 'fe1']:
        tmp_dict = {label: code for code, label in enumerate(np.unique(df_train[col].astype(str)))}
        tt[col] = tt[col].astype(str).map(tmp_dict).astype(int)
        vv[col] = vv[col].astype(str).map(tmp_dict).astype(int)
        test[col] = test[col].astype(str).map(tmp_dict).astype(int)

    # Fit CTGAN on this fold's training rows (features + target) only, sample
    # 500 synthetic rows and append them; the validation fold stays untouched.
    df = pd.concat([tt, pd.DataFrame(tt_target, columns=['target'])], axis=1)

    ctgan = CTGAN(epochs=20)
    ctgan.fit(df)
    synthetic_data = ctgan.sample(500)
    df = pd.concat([df, synthetic_data], ignore_index=True)
    tt_target = df['target'].values
    tt = df.drop(columns='target')

    test = test[use_features].reset_index(drop=True)

    cat = CatBoostClassifier(
        n_estimators=5000,
        learning_rate=0.03,
        allow_writing_files=False,
        logging_level='Silent',
    )
    # The binned columns are passed as plain integers; cat_features is not
    # handed to CatBoost.
    cat.fit(tt, tt_target, eval_set=[(vv, vv_target)], verbose=1000, early_stopping_rounds=500)

    oof[valid_idx] = cat.predict_proba(vv)[:, 1]

    # Score the test set once per fold and reuse the result for both outputs.
    fold_test_proba = cat.predict_proba(test)[:, 1]
    preds += fold_test_proba / FOLDS
    preds_arr[:, fold] = fold_test_proba

    fi_frames.append(
        pd.DataFrame(zip(use_features, cat.feature_importances_), columns=['feature', 'value'])
    )

# Concatenate once instead of growing the frame inside the loop.
fi_df = pd.concat(fi_frames)

In [None]:
# NOTE(review): bare numeric literal with no accompanying code; presumably a
# macro-F1 score noted from an earlier run - confirm, or move it to markdown.
0.8449961249031226

In [None]:
# Macro-F1 of the out-of-fold predictions thresholded at 0.5.
oof_labels = (oof > 0.5).astype(int)
f1_score(df_train['N_category'], oof_labels, average='macro')

In [None]:
# Macro-F1 of the out-of-fold predictions thresholded at 0.5
# (same computation as the cell above).
oof_labels = (oof > 0.5).astype(int)
f1_score(df_train['N_category'], oof_labels, average='macro')

In [None]:
# Count how many test labels (threshold 0.45) differ from a saved submission.
b = pd.read_csv('../submit/catboost_ctgan60e_600s.csv')
c = pd.Series((preds > 0.45).astype(int))

np.not_equal(c, b['N_category']).sum()

In [None]:
# Macro-F1 of the out-of-fold predictions thresholded at 0.5.
oof_labels = (oof > 0.5).astype(int)
f1_score(df_train['N_category'], oof_labels, average='macro')

In [None]:
# Average each feature's importance over the folds, most important first.
mean_importance = fi_df.groupby('feature').mean().reset_index()
mean_importance.sort_values('value', ascending=False)

In [None]:
# Class shares of the out-of-fold labels at threshold 0.5.
oof_labels = pd.Series((oof > 0.5).astype(int))
oof_labels.value_counts(normalize=True)

In [None]:
# Class shares of the true training target, for comparison.
target = df_train['N_category']
target.value_counts(normalize=True)

In [None]:
# Predicted class shares on the test set at threshold 0.4.
test_labels_040 = pd.Series((preds > 0.4).astype(int))
test_labels_040.value_counts(normalize=True)

In [None]:
# Predicted class shares on the test set at threshold 0.5.
test_labels_050 = pd.Series((preds > 0.5).astype(int))
test_labels_050.value_counts(normalize=True)

In [None]:
# Label a test row positive when its averaged probability is at least 0.45.
sub['N_category'] = (preds >= 0.45).astype(int)

In [None]:
# Write the current contents of `sub` to a submission CSV, without the index.
sub.to_csv('../submit/catboost__ctgan60e_800s__round45.csv', index=False)

In [None]:
# Persist the out-of-fold probability array `oof` as a .npy file.
np.save('../submit/cat_oof.npy', oof)

In [None]:
# Persist the fold-averaged test probability array `preds` as a .npy file.
np.save('../submit/cat_preds.npy', preds)

In [None]:
# Replace the in-memory frames with CSVs read from the data folder.
# NOTE(review): 'df_train.csv' / 'df_test.csv' are not written by any cell
# visible in this notebook - confirm they exist before running this cell.
df_train = pd.read_csv(data_path+'df_train.csv')
df_test = pd.read_csv(data_path+'df_test.csv')

In [None]:
# due_date: 2022 minus the year of the '수술연월일' date column.
for frame in (df_train, df_test):
    frame['due_date'] = 2022 - pd.to_datetime(frame['수술연월일']).dt.year

In [None]:
# Preview a combined key built from the '암의 위치' and '암의 개수' columns.
# The original line had no operator between the two column expressions and
# was a SyntaxError; both sides are cast to str and joined with '_'.
# NOTE(review): the '<a>_<b>' form is a guess at the intent, modelled on the
# string-key pattern used elsewhere in this notebook - confirm.
df_train['암의 위치'].astype(str) + '_' + df_train['암의 개수'].astype(str)

In [None]:
# Seed-averaged CatBoost run: for every seed the raw tables are reloaded, the
# same preprocessing is applied, and a 5-fold CV is trained on folds shuffled
# with that seed. Pre-generated CTGAN rows are appended to each training fold.
FOLDS = 5


def basic_set():
    """Return ``(use_features, cat_features)`` for the seed-averaged run.

    ``use_features`` is a fresh list of the clinical columns (no ``fe1`` and no
    PCA components here); ``cat_features`` is ``['진단명']``.
    """
    clinical_cols = [
        '나이',
        '진단명', '암의 위치', '암의 개수', '암의 장경', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
        'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category',
        'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score',
        'KI-67_LI_percent',
        'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation',
    ]
    return clinical_cols, ['진단명']


# Columns replaced by their relative frequency in the training set.
freq_encoded_cols = [
    '나이', '진단명', '암의 위치', '암의 개수', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
    'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category',
    'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score',
    'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation', 'due_date', 'date_year',
]
size_col = '암의 장경'
size_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200]

# One out-of-fold array and one test-prediction array per seed.
seed_oof = []
seed_pred = []
for seed in [1028, 42, 204, 1510, 99]:

    df_train = pd.read_csv(data_path + 'train.csv')
    df_test = pd.read_csv(data_path + 'test.csv')

    # Year of the '수술연월일' date and its distance from 2022.
    for frame in (df_train, df_test):
        surgery_year = pd.to_datetime(frame['수술연월일']).dt.year
        frame['due_date'] = 2022 - surgery_year
        frame['date_year'] = surgery_year

    # Frequency encoding with training-set frequencies for both frames; the
    # frequencies are taken before the training column is overwritten.
    for col in freq_encoded_cols:
        train_freq = df_train[col].value_counts(normalize=True)
        df_test[col] = df_test[col].map(train_freq)
        df_train[col] = df_train[col].map(train_freq)

    # Only the fold assignment depends on the seed; CatBoost itself is not
    # given a seed here.
    skf = StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
    splits = list(skf.split(df_train, df_train['N_category']))

    fi_df = pd.DataFrame()
    oof = np.zeros(len(df_train))
    preds = np.zeros(len(df_test))

    for fold in range(FOLDS):
        print('start fold :', fold)
        train_idx, valid_idx = splits[fold]
        test = df_test.copy()

        use_features, cat_features = basic_set()
        use_features += ['due_date', 'date_year']

        tt = df_train.loc[train_idx, use_features].reset_index(drop=True)
        tt_target = df_train.loc[train_idx, 'N_category'].values
        vv = df_train.loc[valid_idx, use_features].reset_index(drop=True)
        vv_target = df_train.loc[valid_idx, 'N_category'].values

        # Bin the size column and keep the bin labels as strings.
        for part in (tt, vv, test):
            part[size_col] = pd.cut(part[size_col], size_bins).astype(str)

        # Integer codes come from the labels present in this training fold
        # only; a label that occurs only in vv / test maps to NaN.
        code_of = {label: code for code, label in enumerate(np.unique(tt[size_col]))}
        for part in (tt, vv, test):
            part[size_col] = part[size_col].map(code_of)

        # Append the pre-generated synthetic rows for this fold to the
        # training data (the original row indices are kept, not renumbered).
        ctgan = pd.read_csv(f'../open/ctgan_csv/100/500/{fold}.csv')
        tt = pd.concat([tt, ctgan[use_features]], ignore_index=False)
        tt_target = np.concatenate([tt_target, ctgan['target'].values])

        test = test[use_features].reset_index(drop=True)

        cat = CatBoostClassifier(
            n_estimators=5000,
            learning_rate=0.03,
            allow_writing_files=False,
            logging_level='Silent',
        )
        # Both the (augmented) training set and the validation fold are passed
        # as evaluation sets; cat_features is not handed to CatBoost.
        cat.fit(
            tt, tt_target,
            eval_set=[(tt, tt_target), (vv, vv_target)],
            verbose=1000,
            early_stopping_rounds=500,
        )

        oof[valid_idx] += cat.predict_proba(vv)[:, 1]
        preds += cat.predict_proba(test)[:, 1] / FOLDS
        fold_importance = pd.DataFrame(
            zip(use_features, cat.feature_importances_), columns=['feature', 'value']
        )
        fi_df = pd.concat([fi_df, fold_importance])

    seed_oof.append(oof)
    seed_pred.append(preds)

In [None]:
# Macro-F1 of the seed-averaged out-of-fold probabilities, thresholded at 0.5.
mean_oof = np.mean(seed_oof, axis=0)
f1_score(df_train['N_category'], (mean_oof > 0.5).astype(int), average='macro')

In [None]:
# Count how many seed-averaged test labels differ from a saved submission.
b = pd.read_csv('../submit/catboost_ctgan60e_600s.csv')
mean_pred = np.mean(seed_pred, axis=0)
c = pd.Series((mean_pred >= 0.5).astype(int))

np.not_equal(c, b['N_category']).sum()

In [None]:
# Label a test row positive when its seed-averaged probability is at least 0.5.
mean_pred = np.mean(seed_pred, axis=0)
sub['N_category'] = (mean_pred >= 0.5).astype(int)

In [None]:
# Write the current contents of `sub` to a submission CSV, without the index.
sub.to_csv('../submit/catboost_ctgan100e_500s__5seed__F1.csv', index=False)