## 필요 패키지 불러오기

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.datasets import load_digits
from matplotlib import font_manager
from matplotlib import gridspec
from math import factorial
import sklearn
import pprint
import re
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


import warnings
warnings.filterwarnings('ignore')

font_fname = 'C:/Windows/Fonts/malgun.ttf'
font_family = font_manager.FontProperties(fname=font_fname).get_name()

plt.rcParams["font.family"] = font_family
plt.rcParams["axes.unicode_minus"] = False


%matplotlib inline

In [2]:
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, mean_absolute_error
from lightgbm import LGBMClassifier

## 데이터 불러오기

In [46]:
train_df = pd.read_csv('./train_transform_weight.csv')

In [47]:
train_no_weight = pd.read_csv('./train_transform.csv')

In [48]:
label_encoder = LabelEncoder()
train_df['target'] = label_encoder.fit_transform(train_df['target'])

In [49]:
X = train_df.drop(['gcd', 'target', 'sample_weight'], axis=1)
y = train_df['target']
sample_weight = train_df['sample_weight']

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=0)
train_weight = sample_weight.iloc[X_train.index]
test_weight = sample_weight.iloc[X_test.index]

## 초기 설정값 확인

ExtraTreesClassifier 과 LGBMClassifier을 사용

In [52]:
et_clf = ExtraTreesClassifier(n_estimators=300, n_jobs=-1)
et_clf.fit(X_train, y_train, train_weight)
pred = et_clf.predict(X_test)
score = accuracy_score(y_test, pred, sample_weight=test_weight)
print(score)

0.9452475470002247

In [16]:
def best_search(X, y, sample_weight, model, params=None, scoring='accuracy', cv=5):
    
    grid_clf = GridSearchCV(model, param_grid=params, scoring=scoring, cv=cv)
    grid_clf.fit(X, y, sample_weight=sample_weight)
    
    print(grid_clf.best_params_, grid_clf.best_score_)
    
    scores_df = pd.DataFrame(grid_clf.cv_results_)
    return scores_df[['params', 'mean_test_score', 'rank_test_score',
                      'split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score']]

In [8]:
models = {
    ExtraTreesClassifier(warm_start=True, n_estimators=300, n_jobs=-1, random_state=0): 
    {'max_depth': [45, 50, 55],
    'min_samples_split':[4, 8, 12],
    'min_samples_leaf': [4, 8, 12]
},
    LGBMClassifier(n_estimators=300, n_jobs=-1, random_state=0): 
    {'max_depth': [45, 50, 55],
    'min_child_samples':[4, 8, 12],
    'reg_alpha': [1, 3, 5]
}
}
scores_df_list = []
for model, params in models.items():
    print(type(model).__name__)
    scores_df = best_search(X_train, y_train, train_weight, model, params=params)
    scores_df_list.append(scores_df)

ExtraTreesClassifier
{'max_depth': 50, 'min_samples_leaf': 4, 'min_samples_split': 4} 0.9626388708974165
LGBMClassifier
{'max_depth': 45, 'min_child_samples': 12, 'reg_alpha': 1} 0.9653809567868199


In [18]:
pd.set_option('display.max_colwidth', None)

In [19]:
scores_df_list[0].sort_values('rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
9,"{'max_depth': 50, 'min_samples_leaf': 4, 'min_samples_split': 4}",0.962639,1,0.960684,0.963305,0.963657,0.96275,0.962799
10,"{'max_depth': 50, 'min_samples_leaf': 4, 'min_samples_split': 8}",0.962639,1,0.960684,0.963305,0.963657,0.96275,0.962799
0,"{'max_depth': 45, 'min_samples_leaf': 4, 'min_samples_split': 4}",0.962568,3,0.960885,0.962296,0.963758,0.962649,0.963252
1,"{'max_depth': 45, 'min_samples_leaf': 4, 'min_samples_split': 8}",0.962568,3,0.960885,0.962296,0.963758,0.962649,0.963252
19,"{'max_depth': 55, 'min_samples_leaf': 4, 'min_samples_split': 8}",0.962467,5,0.960633,0.961742,0.964162,0.962851,0.96295


In [20]:
scores_df_list[1].sort_values('rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
24,"{'max_depth': 55, 'min_child_samples': 12, 'reg_alpha': 1}",0.965381,1,0.963305,0.965926,0.966833,0.966833,0.964008
6,"{'max_depth': 45, 'min_child_samples': 12, 'reg_alpha': 1}",0.965381,1,0.963305,0.965926,0.966833,0.966833,0.964008
15,"{'max_depth': 50, 'min_child_samples': 12, 'reg_alpha': 1}",0.965381,1,0.963305,0.965926,0.966833,0.966833,0.964008
3,"{'max_depth': 45, 'min_child_samples': 8, 'reg_alpha': 1}",0.96521,4,0.963204,0.965825,0.967186,0.966279,0.963555
21,"{'max_depth': 55, 'min_child_samples': 8, 'reg_alpha': 1}",0.96521,4,0.963204,0.965825,0.967186,0.966279,0.963555


ExtraTreesClassifier
{'max_depth': 52, 'min_samples_leaf': 2, 'min_samples_split': 2} 0.9628808222425708

In [30]:
models = {
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=0): 
    {'max_depth': [50, 52, 54],
    'min_samples_split':[1, 2, 3],
    'min_samples_leaf': [1, 2, 3]
},
    LGBMClassifier(n_estimators=300, n_jobs=-1, random_state=0): 
    {'max_depth': [25, 35, 45],
    'min_child_samples':[12, 16, 20],
    'reg_alpha': [0.3, 0.6, 1]
}}

scores_df_list = []
for model, params in models.items():
    print(type(model).__name__)
    scores_df = best_search(X_train, y_train, train_weight, model, params=params)
    scores_df_list.append(scores_df)

ExtraTreesClassifier
{'max_depth': 52, 'min_samples_leaf': 1, 'min_samples_split': 2} 0.9678206273883123
LGBMClassifier
{'max_depth': 25, 'min_child_samples': 20, 'reg_alpha': 0.3} 0.9655422603211015


In [31]:
scores_df_list[0].sort_values('rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
10,"{'max_depth': 52, 'min_samples_leaf': 1, 'min_samples_split': 2}",0.967821,1,0.965522,0.969152,0.968698,0.968597,0.967134
2,"{'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 3}",0.967458,2,0.965472,0.967186,0.968043,0.968446,0.968142
1,"{'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2}",0.967448,3,0.965371,0.968043,0.96774,0.9689,0.967184
20,"{'max_depth': 54, 'min_samples_leaf': 1, 'min_samples_split': 3}",0.967337,4,0.965674,0.967337,0.968698,0.968648,0.966327
11,"{'max_depth': 52, 'min_samples_leaf': 1, 'min_samples_split': 3}",0.967155,5,0.965623,0.968244,0.967387,0.968093,0.966428


In [32]:
scores_df_list[1].sort_values('rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
15,"{'max_depth': 35, 'min_child_samples': 20, 'reg_alpha': 0.3}",0.965542,1,0.964363,0.965018,0.966379,0.967438,0.964513
24,"{'max_depth': 45, 'min_child_samples': 20, 'reg_alpha': 0.3}",0.965542,1,0.964363,0.965018,0.966379,0.967438,0.964513
6,"{'max_depth': 25, 'min_child_samples': 20, 'reg_alpha': 0.3}",0.965542,1,0.964363,0.965018,0.966379,0.967438,0.964513
3,"{'max_depth': 25, 'min_child_samples': 16, 'reg_alpha': 0.3}",0.965542,4,0.964162,0.965774,0.967186,0.966379,0.96421
21,"{'max_depth': 45, 'min_child_samples': 16, 'reg_alpha': 0.3}",0.965542,4,0.964162,0.965774,0.967186,0.966379,0.96421


In [39]:
models = {
    ExtraTreesClassifier(warm_start=True, n_jobs=-1,max_depth=52, random_state=0): 
    {'max_depth': [52],
    'n_estimators': [300, 400, 500, 600]
},
    LGBMClassifier(n_estimators=300, n_jobs=-1, random_state=0): 
    {'max_depth': [15, 25],
    'min_child_samples':[20, 30, 40],
    'reg_alpha': [0.2]
}}

scores_df_list = []
for model, params in models.items():
    print(type(model).__name__)
    scores_df = best_search(X_train, y_train, train_weight, model, params=params)
    scores_df_list.append(scores_df)

ExtraTreesClassifier
{'max_depth': 52, 'n_estimators': 600} 0.9689094051554982
LGBMClassifier
{'max_depth': 25, 'min_child_samples': 20, 'reg_alpha': 0.2} 0.9658951072922995


In [40]:
scores_df_list[0].sort_values('rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
3,"{'max_depth': 52, 'n_estimators': 600}",0.968909,1,0.966228,0.970311,0.969152,0.970412,0.968444
2,"{'max_depth': 52, 'n_estimators': 500}",0.968647,2,0.965976,0.970109,0.969706,0.969505,0.96794
1,"{'max_depth': 52, 'n_estimators': 400}",0.968446,3,0.965623,0.97016,0.969152,0.969404,0.96789
0,"{'max_depth': 52, 'n_estimators': 300}",0.967821,4,0.965522,0.969152,0.968698,0.968597,0.967134


In [41]:
scores_df_list[1].sort_values('rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
7,"{'max_depth': 25, 'min_child_samples': 20, 'reg_alpha': 0.2}",0.965895,1,0.963557,0.966127,0.967539,0.967085,0.965168
2,"{'max_depth': 15, 'min_child_samples': 30, 'reg_alpha': 0}",0.965895,2,0.964212,0.965774,0.967287,0.967942,0.964261
4,"{'max_depth': 15, 'min_child_samples': 40, 'reg_alpha': 0}",0.965835,3,0.964514,0.96527,0.967035,0.967287,0.965067
9,"{'max_depth': 25, 'min_child_samples': 30, 'reg_alpha': 0.2}",0.965835,4,0.963657,0.96648,0.967589,0.966682,0.964765
5,"{'max_depth': 15, 'min_child_samples': 40, 'reg_alpha': 0.2}",0.965754,5,0.963456,0.966027,0.967337,0.967085,0.964865


In [44]:
models = {
    LGBMClassifier(n_estimators=1000,n_jobs=-1, max_depth=25, min_child_samples=20, reg_alpha=0.2,  random_state=0): 
    {
    'learning_rate': [0.05, 0.07, 0.1]
}}

scores_df_list = []
for model, params in models.items():
    print(type(model).__name__)
    scores_df = best_search(X_train, y_train, train_weight, model, params=params)
    scores_df_list.append(scores_df)

LGBMClassifier
{'learning_rate': 0.1} 0.9674173776997378


In [46]:
models = {
    ExtraTreesClassifier(warm_start=True, n_jobs=-1, max_depth=52, random_state=0): 
    {
    'n_estimators': [500, 1000, 1500, 2000, 2500]
}}

scores_df_list = []
for model, params in models.items():
    print(type(model).__name__)
    scores_df = best_search(X_train, y_train, train_weight, model, params=params)
    scores_df_list.append(scores_df)

ExtraTreesClassifier
{'n_estimators': 2500} 0.970663545645684


In [24]:
def model_pred_proba(df, model, time=True, splits=10, weight=True):
    #%%time
    
    X = df.drop(['gcd', 'target', 'sample_weight'], axis=1)
    y = df['target']
    sample_weight = df['sample_weight']

    print(type(model).__name__)
    
    N_SPLITS = splits
    folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
    scores, y_pred_list, y_proba_list = [], [], []
    
    if time == True:
        folds_split = tqdm(folds.split(X, y), total=N_SPLITS)
    else:
        folds_split = folds.split(X, y)
    
    for fold, (train_id, valid_id) in enumerate(folds_split):
        if time == True:
            print('####### Fold: ', fold)
        
        # Splitting
        X_train, y_train, sample_weight_train = X.iloc[train_id], y.iloc[train_id], sample_weight.iloc[train_id]
        X_valid, y_valid, sample_weight_valid = X.iloc[valid_id], y.iloc[valid_id], sample_weight.iloc[valid_id]

        # Training
        if weight == True:
            model.fit(X_train, y_train, sample_weight_train)
        else:
            model.fit(X_train, y_train)
        
        
        # Validation
        valid_pred = model.predict(X_valid)
        valid_pred_proba = model.predict_proba(X_valid)
        valid_score = accuracy_score(y_valid, valid_pred, sample_weight=sample_weight_valid)
        
        print(f'Accuracy score: {valid_score:5f}\n')
        scores.append(valid_score)
        
        y_pred_list.append(valid_pred)
        y_proba_list.append(valid_pred_proba)


    score = np.array(scores).mean()
    print(f'Mean accuracy score: {score:6f}')
    return y_pred_list, y_proba_list      

In [28]:
model = ExtraTreesClassifier(n_estimators=600, n_jobs=-1,max_depth=52, random_state=0)
y_pred_list, y_proba_list = model_pred_proba(train_df, model, splits=10)

ExtraTreesClassifier


  0%|          | 0/10 [00:00<?, ?it/s]

####### Fold:  0


 10%|█         | 1/10 [00:12<01:55, 12.81s/it]

Accuracy score: 0.951551

####### Fold:  1


 20%|██        | 2/10 [00:24<01:37, 12.13s/it]

Accuracy score: 0.947480

####### Fold:  2


 30%|███       | 3/10 [00:36<01:23, 11.99s/it]

Accuracy score: 0.952381

####### Fold:  3


 40%|████      | 4/10 [00:48<01:11, 12.00s/it]

Accuracy score: 0.948925

####### Fold:  4


 50%|█████     | 5/10 [01:00<00:59, 11.98s/it]

Accuracy score: 0.945633

####### Fold:  5


 60%|██████    | 6/10 [01:12<00:47, 11.98s/it]

Accuracy score: 0.951164

####### Fold:  6


 70%|███████   | 7/10 [01:24<00:36, 12.06s/it]

Accuracy score: 0.945069

####### Fold:  7


 80%|████████  | 8/10 [01:36<00:24, 12.09s/it]

Accuracy score: 0.945765

####### Fold:  8


 90%|█████████ | 9/10 [01:48<00:12, 12.11s/it]

Accuracy score: 0.946958

####### Fold:  9


100%|██████████| 10/10 [02:00<00:00, 12.05s/it]

Accuracy score: 0.953855

Mean accuracy score: 0.948878





stacking

In [43]:
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds, weight=pd.Series(), n_random_state=None):
    
    if n_random_state != None:
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=n_random_state)
    else:
        kf = KFold(n_splits=n_folds, shuffle=False)
        
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    
    for folder_counter, (train_index, valide_index) in enumerate(kf.split(X_train_n)):
        X_tr = X_train_n.iloc[train_index]
        y_tr = y_train_n.iloc[train_index]
        X_te = X_train_n.iloc[valide_index]
        
        if weight.shape[0] != 0:
            X_tr_weight = weight.iloc[train_index]
            model.fit(X_tr, y_tr, sample_weight=X_tr_weight)
        else:
            model.fit(X_tr, y_tr)
            
        train_fold_pred[valide_index, :] = model.predict(X_te).reshape(-1, 1)
        
        test_pred[:, folder_counter] = model.predict(X_test_n)
    
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    
    return train_fold_pred, test_pred_mean

In [44]:
lgbm_clf = LGBMClassifier(n_estimators=2500,n_jobs=-1, max_depth=25, min_child_samples=20, reg_alpha=0.2,  random_state=0)
et_clf = ExtraTreesClassifier(n_estimators=2500, n_jobs=-1, max_depth=52, random_state=0)
knn_clf = KNeighborsClassifier(metric='manhattan', n_jobs=-1, weights='distance', n_neighbors=2)
lr_final = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [85]:
# lgbm_train, lgbm_test 저장
np.save('lgbm_train.npy', lgbm_train)
np.save('lgbm_test.npy', lgbm_test)

# et_train, et_test 저장
np.save('et_train.npy', et_train)
np.save('et_test.npy', et_test)

# knn_train, knn_test 저장
np.save('knn_train.npy', knn_train)
np.save('knn_test.npy', knn_test)

In [7]:
# lgbm_train, lgbm_test 불러오기
lgbm_train = np.load('lgbm_train.npy')
lgbm_test = np.load('lgbm_test.npy')

# et_train, et_test 불러오기
et_train = np.load('et_train.npy')
et_test = np.load('et_test.npy')

# knn_train, knn_test 불러오기
knn_train = np.load('knn_train.npy')
knn_test = np.load('knn_test.npy')

In [14]:
lr_final = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [36]:
stack_final_X_train = np.concatenate((lgbm_train, et_train, knn_train), axis=1)
stack_final_X_test= np.concatenate((lgbm_test, et_test, knn_test), axis=1)

lr_final.fit(stack_final_X_train, y_train)
stack_final = lr_final.predict(stack_final_X_test)

accuracy = accuracy_score(y_test, stack_final)
print('정확도:', accuracy)

정확도: 0.9558853179563692


스테킹 하이퍼파라미터 설정

In [19]:
def best_no_weight_search(X, y, model, params=None, scoring='accuracy', cv=5):
    
    grid_clf = GridSearchCV(model, param_grid=params, n_jobs=-1, scoring=scoring, cv=cv)
    grid_clf.fit(X, y)
    
    print(grid_clf.best_params_, grid_clf.best_score_)
    
    scores_df = pd.DataFrame(grid_clf.cv_results_)
    return scores_df[['params', 'mean_test_score', 'rank_test_score',
                      'split0_test_score','split1_test_score','split2_test_score','split3_test_score','split4_test_score']]

In [18]:
params = {'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
          'penalty': ['l2','l1'],
          'C': [0.01, 0.1, 1, 5, 10]
          }

lr_final = LogisticRegression(multi_class='multinomial', n_jobs=-1)

scores_df = best_no_weight_search(stack_final_X_train, y_train, lr_final, params=params)

{'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'} 0.9596749721943774


In [22]:
scores_df.sort_values(by='rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
24,"{'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'}",0.959675,1,0.959121,0.957911,0.963002,0.959474,0.958867
16,"{'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}",0.95905,2,0.957357,0.957609,0.960179,0.961188,0.958917
7,"{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}",0.958616,3,0.957054,0.959272,0.959776,0.958718,0.958262
15,"{'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}",0.958526,4,0.957004,0.959272,0.959373,0.958718,0.958262
39,"{'C': 10, 'penalty': 'l1', 'solver': 'saga'}",0.958445,5,0.956601,0.959272,0.959373,0.958718,0.958262


In [28]:
lr_final = LogisticRegression(C=5, penalty='l2',multi_class='multinomial', solver='lbfgs', n_jobs=-1)

accuracy_list3 = {}
for lgbm_weight in [0.5, 1, 1.5]:
    for et_weight in [0.5, 1, 1.5]:
        for knn_weight in [0.3, 0.6, 0.9]:
            stack_final_X_train = np.concatenate((lgbm_train*lgbm_weight, et_train*et_weight, knn_train*knn_weight), axis=1)
            stack_final_X_test= np.concatenate((lgbm_test*lgbm_weight, et_test*et_weight, knn_test*knn_weight), axis=1)

            lr_final.fit(stack_final_X_train, y_train)
            stack_final = lr_final.predict(stack_final_X_test)

            accuracy = accuracy_score(y_test, stack_final)
            accuracy_list3[f'lgbm_weight: {lgbm_weight}, et_weight: {et_weight}, knn_weight: {knn_weight}'] = accuracy

In [29]:
sorted_data = sorted(accuracy_list3.items(), key=lambda x: x[1], reverse=True)
sorted_dict = {k: v for k, v in sorted_data}
 
sorted_dict

{'lgbm_weight: 1, et_weight: 1, knn_weight: 0.3': 0.9584660671801283,
 'lgbm_weight: 1.5, et_weight: 1.5, knn_weight: 0.3': 0.9583047703536433,
 'lgbm_weight: 1, et_weight: 1.5, knn_weight: 0.3': 0.9582241219404009,
 'lgbm_weight: 0.5, et_weight: 1.5, knn_weight: 0.3': 0.9581434735271583,
 'lgbm_weight: 0.5, et_weight: 1, knn_weight: 0.3': 0.9570950441550062,
 'lgbm_weight: 0.5, et_weight: 1, knn_weight: 0.6': 0.9569740715351426,
 'lgbm_weight: 1, et_weight: 1, knn_weight: 0.9': 0.9566111536755514,
 'lgbm_weight: 1.5, et_weight: 1.5, knn_weight: 0.9': 0.9566111536755514,
 'lgbm_weight: 0.5, et_weight: 1.5, knn_weight: 0.9': 0.9564901810556877,
 'lgbm_weight: 1.5, et_weight: 1, knn_weight: 0.3': 0.9563288842292028,
 'lgbm_weight: 1.5, et_weight: 1.5, knn_weight: 0.6': 0.9560869389894754,
 'lgbm_weight: 1, et_weight: 1.5, knn_weight: 0.9': 0.9557643453365054,
 'lgbm_weight: 0.5, et_weight: 1.5, knn_weight: 0.6': 0.9557240211298843,
 'lgbm_weight: 1.5, et_weight: 1, knn_weight: 0.6': 0.95

In [30]:
lr_final = LogisticRegression(C=5, penalty='l2',multi_class='multinomial', solver='lbfgs', n_jobs=-1)

accuracy_list3 = {}
for lgbm_weight in [0.8, 1, 1.2]:
    for et_weight in [0.8, 1, 1.2]:
        for knn_weight in [0.05, 0.1, 0.15]:
            stack_final_X_train = np.concatenate((lgbm_train*lgbm_weight, et_train*et_weight, knn_train*knn_weight), axis=1)
            stack_final_X_test= np.concatenate((lgbm_test*lgbm_weight, et_test*et_weight, knn_test*knn_weight), axis=1)

            lr_final.fit(stack_final_X_train, y_train)
            stack_final = lr_final.predict(stack_final_X_test)

            accuracy = accuracy_score(y_test, stack_final)
            accuracy_list3[f'lgbm_weight: {lgbm_weight}, et_weight: {et_weight}, knn_weight: {knn_weight}'] = accuracy

In [31]:
sorted_data = sorted(accuracy_list3.items(), key=lambda x: x[1], reverse=True)
sorted_dict = {k: v for k, v in sorted_data}
 
sorted_dict

{'lgbm_weight: 1.2, et_weight: 1.2, knn_weight: 0.15': 0.9600387112383564,
 'lgbm_weight: 1.2, et_weight: 1.2, knn_weight: 0.05': 0.9597161175853866,
 'lgbm_weight: 1.2, et_weight: 1.2, knn_weight: 0.1': 0.9597161175853866,
 'lgbm_weight: 1, et_weight: 1.2, knn_weight: 0.1': 0.959635469172144,
 'lgbm_weight: 1, et_weight: 0.8, knn_weight: 0.05': 0.9595144965522804,
 'lgbm_weight: 0.8, et_weight: 1.2, knn_weight: 0.1': 0.9593935239324166,
 'lgbm_weight: 0.8, et_weight: 1.2, knn_weight: 0.15': 0.9592322271059317,
 'lgbm_weight: 1.2, et_weight: 1, knn_weight: 0.15': 0.9590709302794468,
 'lgbm_weight: 1, et_weight: 0.8, knn_weight: 0.15': 0.9589902818662043,
 'lgbm_weight: 1, et_weight: 1, knn_weight: 0.05': 0.958949957659583,
 'lgbm_weight: 1, et_weight: 1.2, knn_weight: 0.15': 0.958949957659583,
 'lgbm_weight: 0.8, et_weight: 1, knn_weight: 0.05': 0.9588289850397194,
 'lgbm_weight: 0.8, et_weight: 0.8, knn_weight: 0.05': 0.9587886608330981,
 'lgbm_weight: 1, et_weight: 0.8, knn_weight: 0

In [38]:
lr_final = LogisticRegression(C=5, penalty='l2',multi_class='multinomial', solver='lbfgs', n_jobs=-1)

lgbm_weight, et_weight, knn_weight = 1.2, 1.2, 0.15

stack_final_X_train = np.concatenate((lgbm_train*lgbm_weight, et_train*et_weight, knn_train*knn_weight), axis=1)
stack_final_X_test= np.concatenate((lgbm_test*lgbm_weight, et_test*et_weight, knn_test*knn_weight), axis=1)

lr_final.fit(stack_final_X_train, y_train)
stack_final = lr_final.predict(stack_final_X_test)

accuracy = accuracy_score(y_test, stack_final)
print('정확도:', accuracy)

정확도: 0.9600387112383564


In [51]:
lgbm_clf = LGBMClassifier(n_estimators=300,n_jobs=-1, max_depth=25, min_child_samples=20, reg_alpha=0.2,  random_state=0)
et_clf = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, max_depth=52, random_state=0)

In [52]:
lgbm_train, lgbm_test = get_stacking_base_datasets(lgbm_clf, X_train, y_train, X_test, 5, n_random_state=1)
et_train, et_test = get_stacking_base_datasets(et_clf, X_train, y_train, X_test, 5, n_random_state=1)

In [120]:
df2 = pd.read_csv('./train_dupcol.csv')

In [92]:
label_encoder = LabelEncoder()
df2['target'] = label_encoder.fit_transform(df2['target'])

In [121]:
X_train = df2.drop(['target', 'gcd'], axis=1)
y_train = df2['target']

In [122]:
test_df = pd.read_csv('./test_transform.csv')

In [123]:
X_test = test_df.drop('gcd', axis=1)

In [4]:
lgbm_clf = LGBMClassifier(n_estimators=2500,n_jobs=-1, max_depth=25, min_child_samples=20, reg_alpha=0.2,  random_state=0)
et_clf = ExtraTreesClassifier(n_estimators=2500, n_jobs=-1, max_depth=52, random_state=0)
knn_clf = KNeighborsClassifier(metric='manhattan', n_jobs=-1, weights='distance', n_neighbors=2)
lr_final = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [9]:
lgbm_clf.fit(X_train, y_train)
lgbm_train_pred = lgbm_clf.predict(X_train)
lgbm_test_pred = lgbm_clf.predict(X_test)

In [None]:
np.save('lgbm_train_pred.npy', lgbm_train_pred)
np.save('lgbm_test_pred.npy', lgbm_test_pred)

In [36]:
lgbm_train_pred = np.load('lgbm_train_pred.npy')
lgbm_test_pred = np.load('lgbm_test_pred.npy')

In [10]:
et_clf.fit(X_train, y_train)
et_train_pred = et_clf.predict(X_train)
et_test_pred = et_clf.predict(X_test)

In [None]:
np.save('et_train_pred.npy', et_train_pred)
np.save('et_test_pred.npy', et_test_pred)

In [35]:
et_train_pred = np.load('et_train_pred.npy')
et_test_pred = np.load('et_test_pred.npy')

In [11]:
knn_clf.fit(X_train, y_train)
knn_train_pred = knn_clf.predict(X_train)
knn_test_pred = knn_clf.predict(X_test)

In [12]:
np.save('knn_train_pred.npy', knn_train_pred)
np.save('knn_test_pred.npy', knn_test_pred)

In [34]:
knn_train_pred = np.load('knn_train_pred.npy')
knn_test_pred = np.load('knn_test_pred.npy')

In [79]:
lgbm_train_pred_reshaped = lgbm_train_pred.reshape(-1, 1)
lgbm_test_pred_reshaped = lgbm_test_pred.reshape(-1, 1)
lgbm_train_pred_float = lgbm_train_pred_reshaped.astype(float)
lgbm_test_pred_float = lgbm_test_pred_reshaped.astype(float)


knn_train_pred_reshaped = knn_train_pred.reshape(-1, 1)
knn_test_pred_reshaped = knn_test_pred.reshape(-1, 1)
knn_train_pred_float = knn_train_pred_reshaped.astype(float)
knn_test_pred_float = knn_test_pred_reshaped.astype(float)


et_train_pred_reshaped = et_train_pred.reshape(-1, 1)
et_test_pred_reshaped = et_test_pred.reshape(-1, 1)
et_train_pred_float = et_train_pred_reshaped.astype(float)
et_test_pred_float = et_test_pred_reshaped.astype(float)

In [96]:
lgbm_weight, et_weight, knn_weight = 1.2, 1.2, 0.15

stack_final_X_train = np.concatenate((lgbm_train_pred_float*lgbm_weight, et_train_pred_float*et_weight, knn_train_pred_float*knn_weight), axis=1)
stack_final_X_test = np.concatenate((lgbm_test_pred_float*lgbm_weight, et_test_pred_float*et_weight, knn_test_pred_float*knn_weight), axis=1)

lr_final.fit(stack_final_X_train, y_train)

stack_final = lr_final.predict(stack_final_X_test)

In [99]:
stack_final.shape

(100000,)

In [101]:
stack_final

array([4, 6, 2, ..., 0, 0, 9])

In [102]:
decoded_stack_final = label_encoder.inverse_transform(stack_final)

In [116]:
row_id = list(range(200000, 300000))

# 데이터프레임 생성
df_result = pd.DataFrame({'row_id': row_id, 'target': decoded_stack_final.tolist()})

In [119]:
df_result.to_csv('result.csv',index=False)

In [125]:
et_clf = ExtraTreesClassifier(n_estimators=600, n_jobs=-1,max_depth=52)
et_clf.fit(X_train, y_train)
et_pred = et_clf.predict(X_test)

In [126]:
et_pred

array(['Escherichia_fergusonii', 'Salmonella_enterica',
       'Enterococcus_hirae', ..., 'Bacteroides_fragilis',
       'Bacteroides_fragilis', 'Streptococcus_pyogenes'], dtype=object)

In [127]:
row_id = list(range(200000, 300000))

# 데이터프레임 생성
df_result = pd.DataFrame({'row_id': row_id, 'target': et_pred.tolist()})
df_result.to_csv('result.csv',index=False)

## 최종모델 선택
- ExtraTreesClassifier
- LGBMClassifier
- KNeighborsClassifier
위의 모델들을 하위모델로 가지고


메타모델로 LogisticRegression을 가지는 stacking모델을 구성

최종 정확도는 96%

## 향후계획 및 맺음말

- 적용하지 못한 특성공학들을 적용하여 더 정확한 결과를 얻기
    * EMD, 사용된 플라스미드의 크기
- 더 다양한 알고리즘 태스트해보기
- 딥러닝에서 적용가능한 모델이 있는지 확인 후 사용해보기

- 데이터 분석을 통해서 많은 내용을 찾아냈지만 적용하지 못해서 너무 아쉬웠습니다.

# 감사합니다.