In [1]:
# Mount Google Drive so files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
pip install catboost #optuna #catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 96 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [23]:
#%% Main script
if __name__ == '__main__':

    #% Import packages
    import numpy as np
    import pandas as pd
    import os
    import pickle
    import itertools as it
    import optuna
    import joblib
    import re
    
    from matplotlib import pyplot as plt
    from copy import deepcopy
    from datetime import datetime
    from tqdm import tqdm
    from glob import glob
    from sklearn.naive_bayes import GaussianNB, MultinomialNB
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from scipy.stats import hmean
    from optuna import visualization
    from optuna.samplers import TPESampler    
    from multiprocessing import cpu_count
    from xgboost import XGBClassifier, XGBRFClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    from catboost import CatBoostClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import mean_squared_error
    from catboost import CatBoostRegressor
    

In [5]:
def zip(*iterables):
    """Pure-Python stand-in for the built-in ``zip`` (this name shadows it).

    Yields tuples holding one element from each iterable and stops as soon as
    any of them runs dry, e.g. zip('ABCD', 'xy') --> Ax By.
    Called with no iterables it yields nothing.
    """
    cursors = [iter(source) for source in iterables]
    if not cursors:
        return
    while True:
        row = []
        for cursor in cursors:
            try:
                row.append(next(cursor))
            except StopIteration:
                # The shortest input is exhausted: drop the partial row and stop.
                return
        yield tuple(row)

In [6]:
    #%% Overall settings
    # Flag echoed below so the run log records which mode was selected.
    run_new_submission = False
    print('\nMake new submission file <{}>'.format(
        run_new_submission))
    
    # Use three quarters of the available cores, but never fewer than one:
    # on a single-core host the bare expression evaluates to 0, which is not
    # a valid n_jobs value for the estimators that receive cpu_use.
    cpu_use = max(1, int(3*cpu_count()/4))


Make new submission file <False>


In [7]:
# Project folder on the mounted Drive. Note: this name shadows the built-in `dir`.
dir = "/content/drive/MyDrive/Colab Notebooks/"

In [8]:
# Make the project folder the working directory so relative './know/...' paths resolve.
os.chdir(dir)

In [9]:
# Disabled one-off setup step, kept as a bare string literal so it does not run:
# it would extract know.zip into a ./know folder.
'''
import zipfile
knowzip = zipfile.ZipFile("know.zip").extractall('know')
'''

'\nimport zipfile\nknowzip = zipfile.ZipFile("know.zip").extractall(\'know\')\n'

In [10]:
# Display the current working directory to confirm the chdir took effect.
os.getcwd()

'/content/drive/MyDrive/Colab Notebooks'

In [53]:
#%% Load data
years = [2017, 2018, 2019, 2020]

df_smp_subm = pd.read_csv('./know/sample_submission.csv')

path_tr = sorted(glob('./know/train/*'))
path_test = sorted(glob('./know/test/*'))

# Files are matched to years purely by their sorted position, so a missing or
# extra file would silently shift or drop a year. Fail loudly instead.
for _split_name, _split_paths in (('train', path_tr), ('test', path_test)):
    if len(_split_paths) != len(years):
        raise FileNotFoundError(
            f'Expected {len(years)} {_split_name} files (one per year in '
            f'{years}) but found {len(_split_paths)}: {_split_paths}')

# One DataFrame per year; low_memory=False makes pandas read each file in one
# pass instead of chunk-wise.
dict_tr = {y: pd.read_csv(p, low_memory=False) for y, p in zip(years, path_tr)}
dict_test = {y: pd.read_csv(p, low_memory=False) for y, p in zip(years, path_test)}

In [54]:
    #%% Check data index match
    # Per year: idx values that occur in only one of the train / test files
    for y in years:
        i_tr = set(dict_tr[y].idx)
        i_test = set(dict_test[y].idx)

        print('\nIndex mismatch between train and test')
        print(y)
        print('Train - Test : ', i_tr - i_test)
        print('Test - Train : ', i_test - i_tr)

    # All years together: test idx values versus the sample submission's idx values
    idx_test = pd.concat([frame.idx for frame in dict_test.values()], axis=0)
    ids_in_test = set(idx_test)
    ids_in_subm = set(df_smp_subm.idx)
    print('\nIndex mismatch between test and submission')
    print('Test - submission : ', ids_in_test - ids_in_subm)
    print('submission - Test : ', ids_in_subm - ids_in_test)


Index mismatch between train and test
2017
Train - Test :  set()
Test - Train :  set()

Index mismatch between train and test
2018
Train - Test :  {14405, 12966, 17381, 15976, 13866, 9521, 11347, 11380, 9973, 13142, 17811, 17720, 17977, 14591}
Test - Train :  {17856, 10273, 11522, 16872, 10384, 11537, 14928, 16018, 13492, 12245, 17971}

Index mismatch between train and test
2019
Train - Test :  {21265, 24914, 21348, 21503}
Test - Train :  {25312, 20113, 22855}

Index mismatch between train and test
2020
Train - Test :  set()
Test - Train :  set()

Index mismatch between test and submission
Test - submission :  set()
submission - Test :  set()


In [55]:
 #%% Check data column match
    # Check column mismatch between train and test
for year in years:
    # Column names that exist in exactly one of the two frames for this year
    c_tr = set(dict_tr[year].columns)
    c_test = set(dict_test[year].columns)

    print('\nColumn mismatch between train and test', year, sep='\n')
    print('Symmetric difference : ', c_tr ^ c_test)


Column mismatch between train and test
2017
Symmetric difference :  {'knowcode'}

Column mismatch between train and test
2018
Symmetric difference :  {'knowcode'}

Column mismatch between train and test
2019
Symmetric difference :  {'knowcode'}

Column mismatch between train and test
2020
Symmetric difference :  {'knowcode'}


In [56]:
    #%% Data preprocessing
    # Set index
    dict_tr = {k:v.set_index('idx') for k, v in dict_tr.items()}
    dict_test = {k:v.set_index('idx') for k, v in dict_test.items()}    
    
    # Fill empty elements: a cell holding exactly one blank becomes '-1'
    dict_tr = {k:v.replace(' ', '-1') for k, v in dict_tr.items()}
    dict_test = {k:v.replace(' ', '-1') for k, v in dict_test.items()}
    
    # Change elements
    list_chg = (
        ['없음', '없다'], # Integrate '없다' and '없음'
        )
    for pre, post in list_chg:
        dict_tr = {k:v.replace(pre, post) for k, v in dict_tr.items()}
        dict_test = {k:v.replace(pre, post) for k, v in dict_test.items()}

    # Remove space
    def _strip_inner_spaces(df):
        """Delete blanks inside the text columns of df (in place) and return df.

        A column counts as text when its values cannot all be passed through
        float(); columns that convert cleanly are left untouched.
        """
        for col in df.columns:
            try:
                df[col].map(float)
            except (ValueError, TypeError):
                # Only rewrite the column when at least one value has a blank.
                if sum(df[col].str.contains(' ')):
                    df[col] = df[col].str.replace(' ', '', regex=False)
        return df

    # Same clean-up for both splits (previously two copy-pasted loops)
    dict_tr = {k: _strip_inner_spaces(v) for k, v in dict_tr.items()}
    dict_test = {k: _strip_inner_spaces(v) for k, v in dict_test.items()}
        
    # Preprocess the columns bq31, bq30 for 2017, 2018 respectively
    def process_tool_col(y, df):
        """Normalise the free-text tool column of one yearly frame, in place.

        The column is bq31 for 2017 and bq30 for 2018; any other year is
        returned unchanged (there is no tool column in 2019, 2020).

        Steps: upper-case the text, turn '-' and '.' into ',', and drop '등'.

        Parameters
        ----------
        y : survey year of the frame.
        df : DataFrame for that year; the tool column is overwritten.

        Returns
        -------
        The same DataFrame object.
        """
        tool_col = {2017: 'bq31', 2018: 'bq30'}.get(y)
        if tool_col is None:
            return df

        s = df[tool_col].str.upper()
        # regex=False: every pattern is a literal. This matters for '.', which
        # as a regular expression would match any character.
        for old, new in (('-', ','), ('.', ','), ('등', '')):
            s = s.str.replace(old, new, regex=False)
        df[tool_col] = s

        return df
    
    
    # Run the tool-column clean-up on every year's train and test frame.
    dict_tr = {k: process_tool_col(k, v) for k, v in dict_tr.items()}
    dict_test = {k: process_tool_col(k, v) for k, v in dict_test.items()}

In [57]:
    #%% Manual elements replacing for important features
    from difflib import SequenceMatcher
    
    # Sample column for experimenting with word_changer (its call below is commented out).
    a = dict_tr[2018]['bq4_1a']
    # Collects the (original, replacement) pairs that word_changer records.
    chg_log = []
    def word_changer(s):
        """Return a copy of s where near-duplicate strings are unified.

        Each value is compared (difflib quick_ratio) with the values of the
        working copy up to and including its own label. An exact match stops
        the scan; a similarity in [0.8, 1) overwrites the value with the
        similar one and appends the pair to the module-level ``chg_log``.
        Roughly 1% of the changes are printed as a spot check.

        Uses ``Series.items()``; ``iteritems()`` no longer exists in pandas 2.
        """
        s_new = deepcopy(s)
        for i, w_i in s.items():
            for _, w_j in s_new.loc[:i].items():
                sim = SequenceMatcher(None, w_i, w_j).quick_ratio()
                if sim == 1:
                    break
                elif (sim >= 0.8) and (sim < 1):
                    if np.random.random() > 0.99: 
                        print(w_i, ' >> ', w_j, round(sim, 2))
                    chg_log.append((w_i, w_j))
                    s_new.at[i] = w_j
        return s_new
    
    
    # word_changer(a)
    # chg_log = list(set(chg_log))    
    # chg_log.sort(key=lambda x: x[1])
    
    # Manual (old value, new value) replacement pairs, keyed by survey year.
    # manual_change applies the pairs one after another with DataFrame.replace,
    # so order matters: a later pair can act on the result of an earlier one,
    # and a repeated source value only takes effect the first time.
    # Fix: the target of '1종대형면허자격증' was misspelled '1종대형먼허증', which
    # created a stray category; it now maps to '1종대형면허증' like its siblings.
    # NOTE(review): several pairs share one source with different targets
    # (e.g. the '컴퓨터그래픽운용기능사' entries); only the first can apply and
    # the later ones look reversed - confirm the intended direction.
    dict_mnl_chg = {
        2017:[
         ('1종대형면허', '1종대형면허증'),
         ('1종대형운전면허자격증', '1종대형면허증'),
         ('1종대형면허자격증', '1종대형면허증'),
         ('1종대형운전면허증', '1종대형면허증'),
         ('1종대형운전면허', '1종대형면허증'),
         ('자동차대형1종면허', '1종대형면허증'),
         ('1종대형자동차면허증', '1종대형면허증'),
         ('자동차대형1종면허', '1종대형면허증'),
         ('1종대형자동차운전면허', '1종대형면허증'),
         ('자동차운전면허1종', '1종운전면허증'),
         ('1종보통운전면허증', '1종운전면허증'),
         ('1종보통운전면허', '1종운전면허증'),
         ('1종운전면허', '1종운전면허증'),
         ('운전면허1종', '1종운전면허증'),
         ('3D에니메이터자격증', '3D에니메이션자격증'),
         ('간호사면허', '간호사면허증'),
         ('건설기계면허', '건설기계면허증'),
         ('건설기계기관정비기능사', '건설기계정비기능사'),
         ('건설기계정비기사', '건설기계정비기능사'),
         ('건설기계차체정비기능사', '건설기계정비기능사'),
         ('건설기계정비', '건설기계정비사'),
         ('건설기계정비기사', '건설기계정비사'),
         ('건설재료시험기능사', '건설재료시험기사'),
         ('건설중기정비사', '건설중기정비'),
         ('건축기사자격증', '건축사자격증'),
         ('공조냉동기계기능사', '공조냉동기계기사'),
         ('냉동공조기계기사', '공조냉동기계기사'),
         ('관광통역안내', '관광통역안내사'),
         ('관광통역안내원', '관광통역안내사'),
         ('광산보안기능사', '광산보안기사'),
         ('정교사자격증', '교사자격증'),
         ('교정적5급공무원자격증', '교정직공무원5급자격증'),
         ('한의사국가자격증', '한의사면허증'), 
         ('한의사자격증', '한의사면허증'),
         ('국가자격증(정신보건사회복지사자격증)', '국가자격증(정신보건사회복지사)'),
         ('국내여행안내자격증', '국내여행안내사자격증'),
         ('국제의료코디네이터', '의료관광코디네이터'),
         ('국제의료관광코디네이터', '의료관광코디네이터'),
         ('귀금속가공기능사', '귀금속가공기사'),
         ('그래픽운용기능사', '그래픽운용기사'),
         ('그래픽운용사', '그래픽운용기사'),
         # ('귀금속가공기능사', '금속가공기능사'),
         ('기능사자격증', '기사자격증'),
         ('기록물관리', '기록물관리사'),
         ('기중기운전면허', '기중기운전면허증'),
         ('냉동기계산업기사증', '냉동기계산업기사'),
         ('농기계수리정비사자격증', '농기계정비기사'),
         ('농기계수리정비자격증', '농기계정비기사'),
         # ('기계정비기능사', '농기계정비기능사'),
         ('대기환경기술사', '대기환경기사'),
         ('도배기능사자격증', '도배기사'),
         ('도배기사자격증', '도배기사'),
         ('로더운전사', '로더운전기사'),
         ('무대예술전문', '무대예술전문인'),
         ('무선설비기능사', '무선설비기사'),
         ('무술유단자자격', '무술유단자자격증'),
         ('물리치료면허', '물리치료면허증'),
         ('미용사자격', '미용사자격증'),
         ('이미용사자격증', '미용사자격증'),
         ('방사선사면허', '방사선사면허증'),
         ('방사선전문의', '방사선과전문의면허증'),
         ('방사선과전문의', '방사선과전문의면허증'),
         ('방송통신기능사', '방송통신기사'),
         ('변호사자격', '변호사자격증'),
         ('보일러가스시설시공자격증', '보일러가스시설자격증'),
         ('1종보통운전면허증', '보통1종운전면허'),
         ('사회복지자격증', '사회복지사자격증'),
         ('산업위생관리기술사', '산업위생관리기사'),
         ('상담자격증', '상담사자격증'),
         ('소방설비기술사', '소방설비기사'),
         ('소음진동기사', '소음진동기술사'),
         ('기사자격증', '속기사자격증'),
         ('속기자격증', '속기사자격증'),
         ('의사', '의사면허증'),
         ('의사면허', '의사면허증'),
         ('의사국가면허', '의사면허증'),
         ('수의사국가면허', '수의사면허증'),
         ('수의사면허', '수의사면허증'),
         ('의사자격증', '의사면허증'), 
         ('수의사자격증', '수의사면허증'),
         ('수질환경기술사', '수질환경기사'),
         ('승강기기사', '승강기기능사'),
         ('승강기기능자격증', '승강기기능자격'),
         ('시각디자인기능사', '시각디자인기사'),
         ('애견미용사자격증', '애견미용자격증'),
         ('에너지관리산업기사', '에너지산업관리사'),
         ('영양사면허증', '영양사면허'),
         ('운전면허증1,2종', '1종운전면허증'),
         ('운전면허1,2종', '1종운전면허증'),
         ('1종운전면허', '1종운전면허증'),
         ('운전면허증1종', '1종운전면허증'),
         ('운전면허1종', '1종운전면허증'),
         ('운전면허증1종', '1종운전면허증'),
         ('원동기면허', '원동기면허증'),
         ('유리시공기능사', '유리시공기사'),
         ('유치원2급정교사자격증', '유치원교사자격증'),
         ('유치원정교사자격증', '유치원교사자격증'),
         ('유치원교사', '유치원교사자격증'),
         ('유치원정교사', '유치원교사자격증',),
         ('육군3사관학교', '육군사관학교'),
         ('국제의료관광코디네이터', '의료관광코디네이터'),
         ('의료보조기사', '의료보조기기사'),
         ('의지보조기기기사자격증', '의료보조기기사'),
         ('의지보조기기사자격증', '의료보조기기사'),
         ('의지보조기기자격증', '의료보조기기사'), 
         ('의지보조기기사자격증', '의료보조기기사'),
         ('의지보조기기사', '의지보조기사'),
         ('이미용사자격증', '미용사자격증'),
         ('이미용자격증', '미용사자격증'),
         ('이용사자격증','미용사자격증'),
         ('일식조리자격증', '일식조리사자격증'),
         ('임상병리사면허', '임상병리사면허증'),
         ('자동차운전면허', '1종운전면허증'),
         ('자동차운전면허증', '1종운전면허증'),
         ('자동차정비기능사', '자동차정비기사'),
         ('자동차정비', '자동차정비기사'),
         ('자동차정비사', '자동차정비기사'),
         ('자동차정비강사', '자동차정비기사'),
         # ('자동차정비기사', '자동차정비사'),
         ('자동차정비원자격증', '자동차정비기사'),
         ('자동차정비자격증', '자동차정비기사'),
         ('작업치료사면허', '작업치료사면허증'),
         ('전기기능사자격증', '전기기사'), 
         ('전기기사자격증', '전기기사'),
         ('전문의면허', '전문의면허증'),
         ('전자기기기사', '전자기기기능사'),
         ('전자상거래관리사2급', '전자상거래관리사1급'),
         ('정보처리기능사', '정보처리기사'),
         ('정보처리기술사', '정보처리기사'),
         ('정보처리사', '정보처리기사'),
         ('정보통신기사', '정보통신사'),
         ('정수시설운영관리사', '정수시설운영관리'),
         ('학예사자격증', '정학예사자격증'),
         ('조경기사자격', '조경기사자격증'),
         ('중고등2급정교사', '중등교사자격증'), 
         ('중등2급정교사', '중등교사자격증'),
         ('중등교사2급', '중등교사자격증'), 
         ('중등2급정교사', '중등교사자격증'),
         ('중등교원자격', '중등교사자격증'),
         ('중등교원자격증', '중등교사자격증'),
         ('중등교사2급', '중등교사자격증'),
         ('중등정교사2급', '중등교사자격증'),
         ('중등학교2급정교사', '중등교사자격증'),
         ('중등학교정교사(2급)', '중등교사자격증'),
         ('중등학교정교사2급', '중등교사자격증'),
         ('중등학교정교사(2급)', '중등교사자격증'),
         ('철도교통관제', '철도교통관제사'),
         ('철도신호기능사', '철도신호기사'),
         ('철도차량운전면허증', '철도차량운전면허'),
         ('초등정교사자격증', '초등교사자격증'),
         ('초등학교1,2급정교사', '초등교사자격증'),
         ('초등학교정교사1급', '초등교사자격증'),
         ('초등학교정교사(1급)', '초등교사자격증'),
         ('초등학교정교사1급', '초등교사자격증'),
         ('초등학교1,2급정교사', '초등교사자격증'),
         ('초등학교정교사2급', '초등교사자격증'),
         ('치과의사면허', '치과의사면허증'),
         ('컴퓨터그래픽운용기능사', '컴퓨터,그래픽운용기사'),
         ('컴퓨터그래픽운용기능사', '컴퓨터그래픽스운용기능사'),
         ('컴퓨터그래픽기능사', '컴퓨터그래픽운영기능사'),
         ('컴퓨터그래픽운용기능사', '컴퓨터그래픽운영기능사'),
         ('컴퓨터그래픽운용기능사', '컴퓨터그랙픽운용기능사'),
         ('컴퓨터활용법', '컴퓨터활용'),
         ('특수용접기능사', '특수용접기사'),
         ('폐기물처리기술사', '폐기물처리기사'),
         ('품질관리기술사', '품질관리기사'),
         ('한글속기자격증', '한글속기사자격증'),
         ('의사자격증', '의사면허증'),
         ('의사전문의', '전문의면허증'),
         ('항공기관정비기능', '항공기정비사'),
         ('항공기관정비기능사', '항공기정비사'),
         ('항공정비사면장', '항공기정비사'),
         ('항공정비면장', '항공기정비사'),
         ('항공정비사', '항공기정비사'),
         ('해기사면허증', '해기사면허'),
         ('헬리콥터조종면허증', '헬리콥터조종사면허증'),
         ('호스피스간호사', '간호사면허증'),
         ('호스피스간호', '간호사면허증'),
         ('호텔서비스사', '호텔서비스'),
         ('화물운송자자격증', '화물운송자격증'),
         ('회계자격증', '회계사자격증'),
         ]
                    }
    
    def manual_change(y, df):
        """Apply the manual value replacements to one yearly frame.

        Parameters
        ----------
        y : survey year; selects the replacement table in dict_mnl_chg.
            Years without their own table fall back to the 2017 one, which is
            what every year used before, so results are unchanged while only
            a 2017 table exists.
        df : DataFrame to clean.

        Returns
        -------
        A new DataFrame with the pairs applied in list order (whole-cell
        matches, each replacement seeing the result of the previous ones).
        """
        chg_list = dict_mnl_chg.get(y, dict_mnl_chg[2017])
        for pre, post in chg_list:
            df = df.replace(pre, post)
        
        return df
       
    # Apply the manual replacements to every year's train and test frame.
    dict_tr = {k:manual_change(k, v) for k, v in dict_tr.items()}
    dict_test = {k:manual_change(k, v) for k, v in dict_test.items()}

In [58]:
    #%% Remove contents in parenthesis
    def rmv_parenthesis(df):
        """Strip every '(...)' group from the non-numeric columns of df.

        A column is treated as numeric, and skipped, when all of its values
        can be passed through float(). The frame is modified in place and
        returned.
        """
        for col in df.columns:
            try:
                df[col].map(float)
            except (ValueError, TypeError):
                pass  # not purely numeric: fall through to the clean-up
            else:
                continue

            try:
                df[col] = df[col].replace(r'\([^)]*\)', '', regex=True)
            except Exception:
                # Best effort, as before: a column that cannot be processed is
                # left as it is (but Ctrl-C / SystemExit are no longer swallowed).
                pass
                    
        return df
    
    # Remove parenthesised text from every year's train and test frame.
    dict_tr = {k: rmv_parenthesis(v) for k, v in dict_tr.items()}
    dict_test = {k: rmv_parenthesis(v) for k, v in dict_test.items()}    

In [59]:
    #%% (Test) Make One-hot encoding DataFrame by 'bq31'
    def one_hot_tool_col(df):
        """Build a one-hot frame from the comma-separated values in df.bq31.

        Returns an int32 DataFrame indexed like df with one column per
        distinct token; a cell is 1 when that row's bq31 contains the token.
        Columns are sorted so their order is the same on every run (a plain
        set gives an arbitrary order for strings).
        Assumes every bq31 value is a string - TODO confirm for other inputs.
        """
        s = df.bq31

        # Collect the distinct tokens over all rows
        vals = set()
        for _, v in s.items():
            vals.update(v.split(','))
        vals = sorted(vals)
        
        df_tool = pd.DataFrame(np.zeros(shape=(len(df.index), len(vals))),
                               index=df.index,
                               columns=vals,
                               dtype='int32')
        
        # Series.items() replaces iteritems(), which pandas 2 no longer has
        for i, v in tqdm(s.items(), total=len(df.index)):
            df_tool.loc[i, list(set(v.split(',')))] = 1
            
        return df_tool
    
    # Trial run on the 2017 training frame only.
    df_tool = one_hot_tool_col(dict_tr[2017])

100%|██████████| 9486/9486 [00:03<00:00, 2496.09it/s]


In [60]:
    #%% Make label encoder for each years of datasets
    # Exceptions that mean "this column is not purely numeric":
    # float('abc') -> ValueError, float(None) -> TypeError,
    # int(nan) -> ValueError, int(inf) -> OverflowError.
    _NOT_NUMERIC = (ValueError, TypeError, OverflowError)

    # Train set label encoding
    dict_encoder = {}
    
    for y, df in dict_tr.items():
        encoder_pack = {}
        
        for col in df.columns:
            try:
                df[col] = df[col].map(float)
                df[col] = df[col].map(int)
            except _NOT_NUMERIC:
                # Text column: encode its string form and keep the encoder so
                # the test set can reuse the same codes.
                encoder = LabelEncoder()
                df[col] = df[col].map(str)
                df[col] = encoder.fit_transform(df[col])
                encoder_pack[col] = encoder
                
        dict_encoder[y] = encoder_pack
                
        
    # Test set label encoding
    for y, df in dict_test.items():
        encoder_pack = dict_encoder[y]        
        
        for col in df.columns:
            try:
                df[col] = df[col].map(float)
                df[col] = df[col].map(int)
            except _NOT_NUMERIC:
                if col in encoder_pack:
                    encoder = encoder_pack[col]
                    df[col] = df[col].map(str)
                    category_map = {category: idx for idx, category in
                                    enumerate(encoder.classes_)}
                    # -2 indicates unseen in train set
                    df[col] = df[col].apply(
                        lambda x: category_map.get(x, -2))
                else:
                    # The column was numeric in train, so no encoder exists.
                    print('\nThere is no encoder for test set', y, col)
                    df[col] = df[col].apply(
                        lambda x: -3 if len(x)>=2 else x)

In [18]:
    #%% Decrease data size by changing dtype
    # Cast every column of every yearly frame to 32-bit integers.
    dict_tr = {yr: frame.astype('int32') for yr, frame in dict_tr.items()}
    dict_test = {yr: frame.astype('int32') for yr, frame in dict_test.items()}

In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [65]:
# Per year: standardised feature matrix (X_sc) and the target as a one-column
# frame on a fresh 0..n-1 index (y_sc).
X_sc ={}
y_sc ={}
for y, df in dict_tr.items():
  X_sc[y] = StandardScaler().fit_transform(df.drop(columns ='knowcode'))
  y_sc[y] = df.knowcode.reset_index().drop(columns ='idx')

In [67]:
# Reduce each year's scaled features to 80 principal components.
# A separate, seeded PCA is fitted per year and kept in dict_pca so every
# year's projection stays available; a single shared object only remembers the
# last year it was fitted on. `pca` still ends up bound to the final year's
# model, as before.
dict_pca = {}
X_pca={}
for y, df in X_sc.items():
  pca = PCA(n_components = 80, random_state = 42)
  X_pca[y] = pd.DataFrame(data = pca.fit_transform(df))
  dict_pca[y] = pca

In [68]:
# Put each year's PCA features and its target column side by side.
dict_tr_pca ={}
for y, df in dict_tr.items():
  features, target = X_pca[y], y_sc[y]
  dict_tr_pca[y] = pd.concat((features, target), axis = 'columns')

In [69]:
# Display the per-year frames (PCA components plus the knowcode column).
dict_tr_pca

{2017:              0         1         2  ...        78        79  knowcode
 0     0.237587  5.513830 -2.925984  ...  0.135306  0.387821    825101
 1    -2.944069  1.742888 -2.022714  ... -0.611342 -0.633237    140204
 2    -1.391696  3.245896 -3.214496  ... -0.211908 -0.265042    140204
 3     5.815792  4.673629 -1.529435  ...  0.219218  0.804628    140601
 4    -5.159574  3.394498 -3.012952  ... -0.792365 -1.162545    140204
 ...        ...       ...       ...  ...       ...       ...       ...
 9481  0.805744 -2.474234 -0.751863  ... -0.506415 -0.220074    411301
 9482  7.394329  2.133122 -2.884083  ... -1.096473  0.155024    151105
 9483 -3.998823  7.011724  1.728775  ... -0.473834  0.588745    701101
 9484  5.325355  2.415174 -0.876391  ...  0.218110  0.407821     25402
 9485  3.299652  1.417264 -2.086509  ... -0.446512  0.129153     15201
 
 [9486 rows x 81 columns],
 2018:               0         1         2  ...        78        79  knowcode
 0     -2.269318  1.690403  2.73714

In [70]:
    #%% Train validation split (by stratify)
    X_tr, y_tr, X_val, y_val = {}, {}, {}, {}

    # 80/20 split per year, stratified on the target with a fixed seed
    for y, df in dict_tr_pca.items():
        tr, val = train_test_split(df, stratify=df['knowcode'],
                                   test_size=0.2, shuffle=True,
                                   random_state=42)
        X_tr[y], y_tr[y] = tr.drop(columns='knowcode'), tr.knowcode
        X_val[y], y_val[y] = val.drop(columns='knowcode'), val.knowcode

In [71]:
#%% Import packages
from optuna import Trial

#%% Custom functions
def objectiveXGBRF(trial: Trial, X, y, X_val, y_val):
    """Optuna objective for XGBRFClassifier.

    Samples one parameter set from `trial`, fits on (X, y) and returns the
    macro-averaged F1 score on (X_val, y_val).
    """
    # Single-choice categoricals pin a value while still recording it in the
    # trial, so study.best_params carries the complete constructor arguments.
    params = {}
    params['n_estimators'] = trial.suggest_categorical('n_estimators', [200])
    params['objective'] = trial.suggest_categorical('objective',
                                                    ['multi:softmax'])
    params['max_depth'] = trial.suggest_int('max_depth', 3, 30)
    params['colsample_bynode'] = trial.suggest_float('colsample_bynode', 0.2, 1)
    params['n_jobs'] = trial.suggest_categorical('n_jobs', [cpu_use])
    params['random_state'] = trial.suggest_categorical('random_state', [42])

    fitted = XGBRFClassifier(**params).fit(X, y)
    return f1_score(y_val, fitted.predict(X_val), average='macro')


def get_xgbrf_optuna(X_tr, y_tr, X_val, y_val, n_trial):
    """Tune XGBRFClassifier with Optuna and refit the best setting.

    Runs `n_trial` trials of objectiveXGBRF (maximising validation macro-F1),
    then fits a new model with the best parameters on train and validation
    data combined.

    Returns
    -------
    (refit model, best validation score, study)
    """
    study = optuna.create_study(
        # was 'nb_param_opt', a copy-paste leftover shared with other models
        study_name='xgbrf_param_opt',
        direction='maximize', 
        sampler=TPESampler(seed=42)
        )
    
    study.optimize(lambda trial: objectiveXGBRF(
        trial, X_tr, y_tr, X_val, y_val),
        n_trials=n_trial)
    
    best_xgbrf = XGBRFClassifier(**study.best_params).fit(
        pd.concat([X_tr, X_val], axis=0),
        pd.concat([y_tr, y_val], axis=0),
        )
    
    return best_xgbrf, study.best_value, study


def objectiveLR(trial: Trial, X, y, X_val, y_val):
    """Optuna objective for LogisticRegression (saga solver).

    Samples penalty type and inverse regularisation strength C (log scale),
    fits on (X, y) and returns the macro-averaged F1 score on (X_val, y_val).
    """
    params = {
        'penalty': trial.suggest_categorical('penalty',
                                             ['l1', 'l2', 'elasticnet']),
        'solver': trial.suggest_categorical('solver',
                                             ['saga']),
        # suggest_float(log=True) is the current spelling of suggest_loguniform
        'C': trial.suggest_float('C', 1e-3, 1e+3, log=True),
        'n_jobs': trial.suggest_categorical('n_jobs', [cpu_use]),
        'random_state': trial.suggest_categorical('random_state', [42]),
        }
    # The elastic-net penalty needs a mixing ratio; without l1_ratio the fit
    # raises, so every trial that drew 'elasticnet' used to fail.
    if params['penalty'] == 'elasticnet':
        params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.0, 1.0)
    
    model = LogisticRegression(**params)
    lr_model = model.fit(X, y)
    
    score = f1_score(y_val, lr_model.predict(X_val), average='macro')

    return score


def get_lr_optuna(X_tr, y_tr, X_val, y_val, n_trial):
    """Tune LogisticRegression with Optuna and refit the best setting.

    Runs `n_trial` trials of objectiveLR (maximising validation macro-F1),
    then fits a new model with the best parameters on train and validation
    data combined.

    Returns
    -------
    (refit model, best validation score, study)
    """
    study = optuna.create_study(
        # was 'nb_param_opt', a copy-paste leftover shared with other models
        study_name='lr_param_opt',
        direction='maximize', 
        sampler=TPESampler(seed=42)
        )
    
    study.optimize(lambda trial: objectiveLR(
        trial, X_tr, y_tr, X_val, y_val),
        n_trials=n_trial)
    
    best_lr = LogisticRegression(**study.best_params).fit(
        pd.concat([X_tr, X_val], axis=0),
        pd.concat([y_tr, y_val], axis=0),
        )
    
    return best_lr, study.best_value, study


def objectiveSVC(trial: Trial, X, y, X_val, y_val):
    """Optuna objective for SVC.

    Samples kernel, C (log scale) and gamma mode, fits on (X, y) and returns
    the macro-averaged F1 score on (X_val, y_val).
    """
    params = {
        'kernel': trial.suggest_categorical('kernel', ['rbf', 'sigmoid']),
        # suggest_float(log=True) is the current spelling of suggest_loguniform
        'C': trial.suggest_float('C', 1e-3, 1e+3, log=True),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto'])
        }
    
    model = SVC(**params)
    svc_model = model.fit(X, y)
    
    score = f1_score(y_val, svc_model.predict(X_val), average='macro')

    return score


def get_svc_optuna(X_tr, y_tr, X_val, y_val, n_trial):
    """Tune SVC with Optuna and refit the best setting.

    Runs `n_trial` trials of objectiveSVC (maximising validation macro-F1),
    then fits a new model with the best parameters on train and validation
    data combined.

    Returns
    -------
    (refit model, best validation score, study)
    """
    study = optuna.create_study(
        # was 'nb_param_opt', a copy-paste leftover shared with other models
        study_name='svc_param_opt',
        direction='maximize', 
        sampler=TPESampler(seed=42)
        )
    
    study.optimize(lambda trial: objectiveSVC(
        trial, X_tr, y_tr, X_val, y_val),
        n_trials=n_trial)
    
    best_svc = SVC(**study.best_params).fit(
        pd.concat([X_tr, X_val], axis=0),
        pd.concat([y_tr, y_val], axis=0),
        )
    
    return best_svc, study.best_value, study


def objectiveXGB(trial: Trial, X, y, X_val, y_val):
    """Optuna objective for XGBClassifier.

    Samples tree depth, learning rate and the two column-sampling ratios,
    fits on (X, y) and returns the macro-averaged F1 score on (X_val, y_val).
    Other knobs (n_estimators, gamma, min_child_weight, ...) are not searched.
    """
    params = {}
    params['objective'] = trial.suggest_categorical('objective',
                                                    ['multi:softmax'])
    params['max_depth'] = trial.suggest_int('max_depth', 10, 16)
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.2)
    params['colsample_bytree'] = trial.suggest_float('colsample_bytree',
                                                     0.5, 0.8)
    params['colsample_bylevel'] = trial.suggest_float('colsample_bylevel',
                                                      0.6, 0.8)
    params['random_state'] = trial.suggest_categorical('random_state', [42])

    fitted = XGBClassifier(**params).fit(X, y)
    return f1_score(y_val, fitted.predict(X_val), average='macro')


def get_xgb_optuna(X_tr, y_tr, X_val, y_val, n_trial):
    """Tune XGBClassifier with Optuna, then refit the best setting on
    train + validation data.

    Returns (refit model, best validation macro-F1, study).
    """
    def _objective(trial):
        return objectiveXGB(trial, X_tr, y_tr, X_val, y_val)

    study = optuna.create_study(study_name='xgb_param_opt',
                                direction='maximize',
                                sampler=TPESampler(seed=42))
    study.optimize(_objective, n_trials=n_trial)

    X_all = pd.concat([X_tr, X_val], axis=0)
    y_all = pd.concat([y_tr, y_val], axis=0)
    best_xgb = XGBClassifier(**study.best_params).fit(X_all, y_all)

    return best_xgb, study.best_value, study


def objectiveRF(trial: Trial, X, y, X_val, y_val):
    """Optuna objective for RandomForestClassifier.

    Samples max_depth and max_features (the rest is pinned), fits on (X, y)
    and returns the macro-averaged F1 score on (X_val, y_val).
    """
    # (name, suggest method, positional arguments), sampled in this order
    search_space = (
        ('n_estimators', trial.suggest_categorical, ([200],)),
        ('criterion', trial.suggest_categorical, (['entropy'],)),
        ('max_depth', trial.suggest_int, (3, 30)),
        ('max_features', trial.suggest_float, (0.1, 1)),
        ('n_jobs', trial.suggest_categorical, ([cpu_use],)),
        ('random_state', trial.suggest_categorical, ([42],)),
        )
    params = {name: suggest(name, *args)
              for name, suggest, args in search_space}

    fitted = RandomForestClassifier(**params).fit(X, y)
    return f1_score(y_val, fitted.predict(X_val), average='macro')


def get_rf_optuna(X_tr, y_tr, X_val, y_val, n_trial):
    """Tune RandomForestClassifier with Optuna, then refit the best setting
    on train + validation data.

    Returns (refit model, best validation macro-F1, study).
    """
    def _objective(trial):
        return objectiveRF(trial, X_tr, y_tr, X_val, y_val)

    study = optuna.create_study(study_name='rf_param_opt',
                                direction='maximize',
                                sampler=TPESampler(seed=42))
    study.optimize(_objective, n_trials=n_trial)

    X_all = pd.concat([X_tr, X_val], axis=0)
    y_all = pd.concat([y_tr, y_val], axis=0)
    best_rf = RandomForestClassifier(**study.best_params).fit(X_all, y_all)

    return best_rf, study.best_value, study


def fix_random_seed(seed=42):
    """Seed Python's `random`, NumPy's global RNG and, if present, TensorFlow.

    Also writes the seed to the PYTHONHASHSEED environment variable. That
    variable is read when an interpreter starts, so it affects processes
    launched afterwards, not hashing in the current one.

    Parameters
    ----------
    seed : value used for every generator (default 42).
    """
    import random
    import numpy as np
    import os
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    try:
        tf.random.set_seed(seed)
    except (NameError, AttributeError):
        # `tf` is only seeded when a TensorFlow module with random.set_seed is
        # bound to that name; otherwise this step is skipped on purpose.
        # Other errors are no longer hidden.
        pass

def objectiveCat(trial: Trial, X, y, X_val, y_val):
    """Optuna objective for CatBoostClassifier.

    Only the learning rate is searched (log scale); random_state, od_type and
    task_type are pinned through single-choice categoricals so that they end
    up in study.best_params. The model is fitted on (X, y) with
    (X_val, y_val) passed as eval_set, and the macro-averaged F1 score on the
    validation data is returned.
    """
    param = {
      'random_state': trial.suggest_categorical('random_state', [42]),
      # suggest_float(log=True) is the current spelling of suggest_loguniform
      'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.3,
                                            log=True),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec']),
      # NOTE(review): requires a GPU runtime
      'task_type': trial.suggest_categorical('task_type', ['GPU'])
  }

    model = CatBoostClassifier(**param)
    cat_model = model.fit(X, y, eval_set=[(X_val, y_val)], verbose=False)
     
    score = f1_score(y_val, cat_model.predict(X_val), average='macro')
    
    return score

def get_cat_optuna(X_tr, y_tr, X_val, y_val, n_trial):
    """Tune CatBoostClassifier with Optuna, then refit the best setting on
    train + validation data.

    Returns (refit model, best validation macro-F1, study).
    """
    def _objective(trial):
        return objectiveCat(trial, X_tr, y_tr, X_val, y_val)

    study = optuna.create_study(study_name='cat_param_opt',
                                direction='maximize',
                                sampler=TPESampler(seed=42))
    study.optimize(_objective, n_trials=n_trial)

    X_all = pd.concat([X_tr, X_val], axis=0)
    y_all = pd.concat([y_tr, y_val], axis=0)
    best_cat = CatBoostClassifier(**study.best_params).fit(X_all, y_all)

    return best_cat, study.best_value, study


def objectiveCatreg(trial: Trial, X, y, X_val, y_val):
    """Optuna objective for CatBoostRegressor.

    Only the learning rate is searched (log scale); random_state, od_type and
    task_type are pinned through single-choice categoricals so that they end
    up in study.best_params. The model is fitted on (X, y) with
    (X_val, y_val) passed as eval_set, and the mean squared error on the
    validation data is returned (lower is better).
    """
    param = {
      'random_state': trial.suggest_categorical('random_state', [42]),
      # suggest_float(log=True) is the current spelling of suggest_loguniform
      'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.3,
                                            log=True),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec']),
      # NOTE(review): requires a GPU runtime
      'task_type': trial.suggest_categorical('task_type', ['GPU'])
  }

    model = CatBoostRegressor(**param)
    cat_model = model.fit(X, y, eval_set=[(X_val, y_val)], verbose=False)
     
    score = mean_squared_error(y_val, cat_model.predict(X_val))
    
    return score

def get_catreg_optuna(X_tr, y_tr, X_val, y_val, n_trial):
    """Tune CatBoostRegressor with Optuna and refit the best setting.

    Runs `n_trial` trials of objectiveCatreg (minimising validation MSE),
    then fits a new model with the best parameters on train and validation
    data combined.

    Returns
    -------
    (refit model, best validation MSE, study)
    """
    study = optuna.create_study(
        # was 'cat_param_opt', the same name as the classifier study
        study_name='catreg_param_opt',
        direction='minimize', 
        sampler=TPESampler(seed=42)
        )
    
    study.optimize(lambda trial: objectiveCatreg(
        trial, X_tr, y_tr, X_val, y_val),
        n_trials=n_trial)
    
    best_cat = CatBoostRegressor(**study.best_params).fit(
        pd.concat([X_tr, X_val], axis=0),
        pd.concat([y_tr, y_val], axis=0),
        
        )
    
    return best_cat, study.best_value, study


In [None]:
    #%% Hyperparameter search by optuna for the selected model
    mdl_selc = 'cat'
    num_trial = 5
    num_trial2 = 5   # trial budget for the CatBoost years of the 'catrf' mix
    print('='*15, f'Model selected [{mdl_selc}]', '='*15)
    
    # One tuner per model key. Each tuner returns
    # (refit model, best validation score, study).
    tuner_by_model = {
        'rf': get_rf_optuna,
        'xgb': get_xgb_optuna,
        'xgbrf': get_xgbrf_optuna,
        'svc': get_svc_optuna,
        'lr': get_lr_optuna,      # defined above but previously unreachable
        'cat': get_cat_optuna,
        'catreg': get_catreg_optuna,
        }

    if mdl_selc == 'catrf':
        # Mixed setup: random forest for 2017, CatBoost for every other year
        rslt_param_opt = {}
        for y in years:
            if y == 2017:
                rslt_param_opt[y] = get_rf_optuna(
                    X_tr[y], y_tr[y], X_val[y], y_val[y], num_trial)
            else:
                rslt_param_opt[y] = get_cat_optuna(
                    X_tr[y], y_tr[y], X_val[y], y_val[y], num_trial2)
    elif mdl_selc in tuner_by_model:
        tuner = tuner_by_model[mdl_selc]
        rslt_param_opt = {
            y: tuner(X_tr[y], y_tr[y], X_val[y], y_val[y], num_trial)
            for y in years}
    else:
        # Previously an unknown key fell through silently and the next cell
        # failed on an undefined rslt_param_opt.
        raise ValueError(
            f'Unknown model key {mdl_selc!r}; expected one of '
            f'{sorted(tuner_by_model) + ["catrf"]}')

[32m[I 2022-01-26 13:21:50,220][0m A new study created in memory with name: cat_param_opt[0m




In [27]:
    #%% Divide parameter optimization results
    # Each entry of rslt_param_opt is (model, best validation score, study)
    mdl_best, dict_val_f1, rslt_opt = {}, {}, {}
    for yr, packed in rslt_param_opt.items():
        mdl_best[yr] = packed[0]
        dict_val_f1[yr] = packed[1]
        rslt_opt[yr] = packed[2]

    for k, v in dict_val_f1.items():
        print(f'Val score of {k}: {round(v, 3)}')

    overall = hmean(list(dict_val_f1.values()))
    print(f'Val harmonic mean score : {round(overall, 3)}')

Val score of 2017: 2856090290758.322
Val score of 2018: 20085227287.697
Val score of 2019: 23367241987.025
Val score of 2020: 20952513212.825
Val harmonic mean score : 28437389269.364


In [None]:
# Project each year's test rows into the feature space the models were
# trained in. Fixes three problems of the previous version:
#   * it read X_sc_test[y] from the still-empty dict (KeyError) instead of df;
#   * it re-fitted scaler and PCA on the test rows, giving components that do
#     not correspond to the ones seen in training;
#   * it dropped the idx index, so the submission could not carry real ids.
X_sc_test ={}
X_pca_test={}
for y, df in dict_test.items():
  # Fit on this year's TRAIN features (same settings as the training cells)
  # and only transform the test rows.
  X_train_y = dict_tr[y].drop(columns ='knowcode')
  scaler_y = StandardScaler().fit(X_train_y)
  pca_y = PCA(n_components = 80, random_state = 42).fit(
      scaler_y.transform(X_train_y))

  # Use the training column order for the test features
  X_sc_test[y] = scaler_y.transform(df[X_train_y.columns])
  X_pca_test[y] = pd.DataFrame(data = pca_y.transform(X_sc_test[y]),
                               index = df.index)
  

In [28]:
    def _importance_table(feature_names, fitted_model):
        """Feature names paired with the model's importances, highest first."""
        table = pd.DataFrame(data={'f_nm': feature_names,
                                   'score': fitted_model.feature_importances_})
        return table.sort_values(by='score', ascending=False)

    try:
        f_imp = {yr: _importance_table(X_pca_test[yr].columns, mdl_best[yr])
                 for yr in years}
    except AttributeError:
        # Fallback: for 2017 and 2018 the importances are read from element 1
        # of the stored object instead of from the object itself.
        f_imp = {yr: _importance_table(
                     X_pca_test[yr].columns,
                     mdl_best[yr][1] if yr in (2017, 2018) else mdl_best[yr])
                 for yr in years}

In [35]:
#abs(pre_2017.round())

array([329755., 957246., 186637., ..., 446248., 311082., 801756.])

In [36]:
# Raw predictions per year
pre_2017 = mdl_best[2017].predict(X_pca_test[2017])
pre_2018 = mdl_best[2018].predict(X_pca_test[2018])
pre_2019 = mdl_best[2019].predict(X_pca_test[2019])
pre_2020 = mdl_best[2020].predict(X_pca_test[2020])


def _pred_frame(pred, index):
    """Turn raw predictions into a one-column frame of integer codes.

    The values are flattened, rounded, made non-negative (as before) and cast
    to int64 so the CSV holds 825101 rather than 825101.0.
    """
    codes = np.abs(np.round(np.asarray(pred, dtype='float64'))).ravel()
    return pd.DataFrame(codes.astype('int64'), index = index)


# Label the rows with the real test ids (the idx index of dict_test) rather
# than with the index of the PCA frame, which may be a plain 0..n-1 range.
pred1 = _pred_frame(pre_2017, dict_test[2017].index)
pred2 = _pred_frame(pre_2018, dict_test[2018].index)
pred3 = _pred_frame(pre_2019, dict_test[2019].index)
pred4 = _pred_frame(pre_2020, dict_test[2020].index)

y_pred = pd.concat([pred1,pred2,pred3, pred4], axis=0)
    
df_subm = y_pred.reset_index()
df_subm.columns = ['idx', 'knowcode']

In [37]:
# Read the clock once so all five fields describe the same instant
# (separate now() calls can straddle a minute or hour boundary).
now = datetime.now()
cur_t = '{}_{}_{}_{}_{}'.format(now.year, now.month, now.day,
                                now.hour, now.minute)

# to_csv does not create missing folders
os.makedirs('./submission', exist_ok=True)
df_subm.to_csv('./submission/submission_time_'+cur_t+'.csv', index=False)