In [1]:
import os
import yaml
import numpy as np
import pandas as pd

import cv2
import torch
import albumentations as A

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold

from utils import obj
from sdv.tabular.ctgan import CTGAN

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

pd.options.display.max_columns=100

In [2]:
data_path = '../open/'
info = pd.read_excel(data_path+'clinical_info.xlsx')

df_train = pd.read_csv(data_path+'train.csv')
df_test = pd.read_csv(data_path+'test.csv')
sub = pd.read_csv(data_path+'sample_submission.csv')

df_train['img_path'] = df_train['img_path'].apply(lambda x: x.replace('./', '../open/'))
df_test['img_path'] = df_test['img_path'].apply(lambda x: x.replace('./', '../open/'))

In [None]:
df_train = pd.read_csv(data_path+'train.csv')
df_test = pd.read_csv(data_path+'test.csv')

FOLDS = 5

def basic_set():
    use_features = [
                    '나이', 
                    '진단명', '암의 위치', '암의 개수', '암의 장경', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
                    'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category', 
                    'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score', 
                    'KI-67_LI_percent', 
                    'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation', 
                    # 'img_height', 'img_width',
                    # 'BRCA_mutation2',
                    ]

    cat_features = ['나이', '진단명', '암의 위치', '암의 개수', '암의 장경', 'NG', 'HG', 'HG_score_1',
                            'HG_score_2', 'HG_score_3', 'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type',
                            'T_category', 'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score',
                            'HER2', 'HER2_IHC', 'HER2_SISH','BRCA_mutation']
    cat_features = ['진단명']
    return use_features, cat_features

df_train['due_date'] = 2022 - pd.to_datetime(df_train['수술연월일']).dt.year
df_test['due_date'] = 2022 - pd.to_datetime(df_test['수술연월일']).dt.year
# df_train['due_date2'] = (2022-pd.to_datetime(df_train['수술연월일']).dt.year)*12 + pd.to_datetime(df_train['수술연월일']).dt.month
# df_test['due_date2'] = (2022-pd.to_datetime(df_test['수술연월일']).dt.year)*12 + pd.to_datetime(df_test['수술연월일']).dt.month

df_train['date_year'] = pd.to_datetime(df_train['수술연월일']).dt.year
df_test['date_year'] = pd.to_datetime(df_test['수술연월일']).dt.year

# df_train['수술당시나이'] = df_train['나이'] - df_train['due_date']
# df_test['수술당시나이'] = df_test['나이'] - df_test['due_date']

# df_train['암의 장경/개수'] = df_train['암의 장경'] / df_train['암의 개수']
# df_test['암의 장경/개수'] = df_test['암의 장경'] / df_test['암의 개수']

for col in [
                    '나이', '진단명', '암의 위치', '암의 개수', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
                    'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category', 
                    'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score', 
                    'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation', 'due_date', 'date_year']:
    df_test[col] = df_test[col].map(df_train[col].value_counts(True))
    df_train[col] = df_train[col].map(df_train[col].value_counts(True))

skf = StratifiedKFold(n_splits=FOLDS, random_state=42, shuffle=True)
splits = list(skf.split(df_train, df_train['N_category']))
# labels = df_train['N_category'].astype(str) + '_' + df_train['due_date'].astype(str)
# splits = list(skf.split(df_train, labels))


fi_df = pd.DataFrame()
oof = np.zeros(len(df_train))
preds = np.zeros(len(df_test))

for fold in range(FOLDS):
    print('start fold :', fold)
    test = df_test.copy()
    
    use_features, cat_features = basic_set()
    use_features += ['due_date', 'date_year']
    # use_features += ['due_date', 'date_year', '암의 장경/개수']

    tt = df_train.loc[splits[fold][0], use_features].reset_index(drop=True)
    tt_target = df_train.loc[splits[fold][0], 'N_category'].values
    vv = df_train.loc[splits[fold][1], use_features].reset_index(drop=True)
    vv_target = df_train.loc[splits[fold][1], 'N_category'].values

    for df in [tt, vv, test]:
        # df['나이'] = pd.cut(df['나이'], [20, 30, 40, 50, 60, 70, 80, 100])
        df['암의 장경'] = pd.cut(df['암의 장경'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200])
        # df['암의 장경'] = pd.cut(df['암의 장경'], [0, 20, 40, 60, 80, 100, 200])

        # df['due_date'] = pd.cut(df['due_date'], [0, 5, 10, 15])
        # df['date_year'] = pd.cut(df['date_year'], [0, 2005, 2010, 2015, 2020, 2030])
        
        # df['암의 장경/개수'] = pd.cut(df['암의 장경/개수'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200])
        # df['KI-67_LI_percent'] = pd.cut(df['KI-67_LI_percent'], [0, 20, 40, 60, 80, 100])
        
    # for col in ['나이', '암의 장경', '암의 장경/개수']:
    # for col in ['나이', '암의 장경', 'KI-67_LI_percent']:
    # for col in ['나이', '암의 장경']:
    for col in ['암의 장경']:
        tmp_dict = {j:i for i,j in enumerate(np.unique(tt[col].astype(str)))}
        tt[col] = tt[col].astype(str).map(tmp_dict)
        vv[col] = vv[col].astype(str).map(tmp_dict)
        test[col] = test[col].astype(str).map(tmp_dict)

    # generate dataset
    # tt = tt.fillna(0)
    # vv = vv.fillna(0)
    # test = test.fillna(0)

    # df = pd.concat([tt, pd.DataFrame(tt_target, columns=['target'])], axis=1)
    # discrete_columns = ['암의 장경', 'target']

    # ctgan = CTGAN(epochs=40)
    # ctgan.fit(df, discrete_columns)
    # # Create synthetic data
    # synthetic_data = ctgan.sample(200)
    # df = pd.concat([df, synthetic_data], ignore_index=True)
    # tt_target = df['target'].values
    # tt = df.drop(columns='target')
    
    test = test[use_features].reset_index(drop=True)

    # lgb = LGBMClassifier(
    #                     n_estimators=5000,
    #                     learning_rate=0.03,
    #                     max_depth=-1,
    #                     num_leaves=64,
    #                     )

    # lgb.fit(tt, tt_target, eval_set=[(tt, tt_target), (vv, vv_target)], verbose=1000, early_stopping_rounds=100)
    # oof[splits[fold][1]] += lgb.predict_proba(vv)[:, 1]
    # preds += lgb.predict_proba(test)[:, 1] / 5
    # fi_df = pd.concat([fi_df, pd.DataFrame(zip(use_features, lgb.feature_importances_), columns=['feature', 'value'])])
    # tt[cat_features] = tt[cat_features].fillna(-99).astype(int)
    # vv[cat_features] = vv[cat_features].fillna(-99).astype(int)
    cat = CatBoostClassifier(
                            n_estimators=5000,
                            learning_rate=0.03,
                            # objective='logloss',
                            # eval_metric='F1',
                            # reg_lambda=0.01,
                            # random_seed=1028,
                            allow_writing_files=False,
                            logging_level='Silent',
                            )
    cat.fit(tt, tt_target, eval_set=[(tt, tt_target), (vv, vv_target)], verbose=1000, early_stopping_rounds=500,
            # cat_features=cat_features
            )
    # cat.fit(tt[lambda x: x['due_date']<14], tt_target[tt['due_date']<14], eval_set=[(tt, tt_target), (vv, vv_target)], verbose=1000, early_stopping_rounds=500,
    #         # cat_features=cat_features
    #         )

            
    oof[splits[fold][1]] += cat.predict_proba(vv)[:, 1]
    preds += cat.predict_proba(test)[:, 1] / FOLDS
    fi_df = pd.concat([fi_df, pd.DataFrame(zip(use_features, cat.feature_importances_), columns=['feature', 'value'])])

In [None]:
# 0.8409985689871209 -> standard
# 0.8439943837978167 -> generate

f1_score(df_train['N_category'],
        np.where(oof>0.5, 1, 0),
        average='macro'
        )

In [None]:
from utils import seed_everything
seed_everything(42)

In [3]:
for epoch in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    for samples in [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]:
        

        df_train = pd.read_csv(data_path+'train.csv')
        df_test = pd.read_csv(data_path+'test.csv')
        
        df_train['fe1'] = df_train['암의 장경'] / df_train['암의 개수'].fillna(1)
        df_test['fe1'] = df_test['암의 장경'] / df_test['암의 개수'].fillna(1)
        
        for df in [df_train, df_test]:
            df['암의 장경'] = pd.cut(df['암의 장경'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200])
            df['fe1'] = pd.cut(df['fe1'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200])

        FOLDS = 5

        def basic_set():
            use_features = [
                            '나이', 
                            '진단명', '암의 위치', '암의 개수', '암의 장경', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
                            'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category', 
                            'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score', 
                            'KI-67_LI_percent', 
                            'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation', 
                            'fe1',
                            ]

            cat_features = ['나이', '진단명', '암의 위치', '암의 개수', '암의 장경', 'NG', 'HG', 'HG_score_1',
                                    'HG_score_2', 'HG_score_3', 'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type',
                                    'T_category', 'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score',
                                    'HER2', 'HER2_IHC', 'HER2_SISH','BRCA_mutation']
            cat_features = ['진단명']
            return use_features, cat_features

        df_train['due_date'] = 2022 - pd.to_datetime(df_train['수술연월일']).dt.year
        df_test['due_date'] = 2022 - pd.to_datetime(df_test['수술연월일']).dt.year

        df_train['date_year'] = pd.to_datetime(df_train['수술연월일']).dt.year
        df_test['date_year'] = pd.to_datetime(df_test['수술연월일']).dt.year

        for col in [
                            '나이', '진단명', '암의 위치', '암의 개수', 'NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
                            'DCIS_or_LCIS_여부', 'DCIS_or_LCIS_type', 'T_category', 
                            'ER', 'ER_Allred_score', 'PR', 'PR_Allred_score', 
                            'HER2', 'HER2_IHC', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation', 'due_date', 'date_year']:
            df_test[col] = df_test[col].map(df_train[col].value_counts(True))
            df_train[col] = df_train[col].map(df_train[col].value_counts(True))

        skf = StratifiedKFold(n_splits=FOLDS, random_state=42, shuffle=True)
        splits = list(skf.split(df_train, df_train['N_category']))

        fi_df = pd.DataFrame()
        oof = np.zeros(len(df_train))
        preds = np.zeros(len(df_test))

        for fold in range(FOLDS):
            print('start fold :', fold)
            test = df_test.copy()
            
            use_features, cat_features = basic_set()
            use_features += ['due_date', 'date_year']
            # use_features += ['due_date', 'date_year', '암의 장경/개수']

            tt = df_train.loc[splits[fold][0], use_features].reset_index(drop=True)
            tt_target = df_train.loc[splits[fold][0], 'N_category'].values
            vv = df_train.loc[splits[fold][1], use_features].reset_index(drop=True)
            vv_target = df_train.loc[splits[fold][1], 'N_category'].values

            for col in ['암의 장경', 'fe1']:
                tmp_dict = {j:i for i,j in enumerate(np.unique(df_train[col].astype(str)))}
                tt[col] = tt[col].astype(str).map(tmp_dict).astype(int)
                vv[col] = vv[col].astype(str).map(tmp_dict).astype(int)
                test[col] = test[col].astype(str).map(tmp_dict).astype(int)

            # generate dataset
            # tt = tt.fillna(0)
            # vv = vv.fillna(0)
            # test = test.fillna(0)

            # discrete_columns = ['암의 장경', 'target']
            df = pd.concat([tt, pd.DataFrame(tt_target, columns=['target'])], axis=1)

            ctgan = CTGAN(epochs=epoch)
            ctgan.fit(df)
            # Create synthetic data
            synthetic_data = ctgan.sample(samples)
            df = pd.concat([df, synthetic_data], ignore_index=True)
            tt_target = df['target'].values
            tt = df.drop(columns='target')
            
            test = test[use_features].reset_index(drop=True)

            cat = CatBoostClassifier(
                                    n_estimators=5000,
                                    learning_rate=0.03,
                                    # objective='logloss',
                                    # eval_metric='F1',
                                    # reg_lambda=0.01,
                                    # random_seed=1028,
                                    allow_writing_files=False,
                                    logging_level='Silent',
                                    )
            cat.fit(tt, tt_target, eval_set=[(tt, tt_target), (vv, vv_target)], verbose=1000, early_stopping_rounds=500,
                    # cat_features=cat_features
                    )
                    
            oof[splits[fold][1]] += cat.predict_proba(vv)[:, 1]
            preds += cat.predict_proba(test)[:, 1] / FOLDS
            fi_df = pd.concat([fi_df, pd.DataFrame(zip(use_features, cat.feature_importances_), columns=['feature', 'value'])])

            os.makedirs(f'../open/ctgan_csv/{epoch}/{samples}/', exist_ok=True)
            synthetic_data.to_csv(f'../open/ctgan_csv/{epoch}/{samples}/{fold}.csv', index=False)

        f1 = f1_score(df_train['N_category'],
                        np.where(oof>0.5, 1, 0),
                        average='macro'
                        )
        
        print(epoch, samples, f1)

start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 50 0.8349985149866348
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 100 0.8409998409998409
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 200 0.8409985689871209
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 300 0.843999375997504
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 400 0.835989503328213
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 500 0.8349958748968724
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 600 0.838998550986959
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 700 0.8439975039600633
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 800 0.8429998429998431
start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4
10 900 0.84499860498

In [None]:
df_train['N_category'].value_counts()

In [None]:
pd.Series(np.where(oof>0.55, 1, 0)).value_counts()

In [None]:
pd.Series(np.where(oof>0.5, 1, 0)).value_counts()

In [None]:
pd.Series(np.where(preds>0.5, 1, 0)).value_counts()

In [None]:
pd.Series(np.where(preds>0.65, 1, 0)).value_counts()