# CONCEPT 

1. Regressor : A분류/B분류/C분류/BC분류
2. Classifier : A&B&C분류

- 총 다섯 개(Classifier&Regressor)의 모델 성능을 복합적으로 고려

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

from imblearn.over_sampling import SMOTE

In [2]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=1)

In [3]:
train = pd.read_csv('./data/df_train00.csv')
test = pd.read_csv('./data/df_test00.csv')              

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC

In [6]:
random_seed = 6327
strategy = {0:10000, 1:10000}
# strategy = 'auto'

In [7]:
train['class'].value_counts()

1    114
2     79
0     69
Name: class, dtype: int64

In [8]:
ae = pd.read_csv("./data/ae_values.csv")
# target_idx = train[train['class'] != 0].index.tolist()
train2 = train.copy().reset_index(drop=True)
# train2['class'] = train2['class'] -1

train2 = pd.concat([train2, ae[:len(train2)]], axis=1)
test2 = pd.concat([test, ae[len(train2):].reset_index(drop=True)], axis=1)
train2['class'].value_counts()

1    114
2     79
0     69
Name: class, dtype: int64

# STEP 01
- 각 레이블 독립적으로 구분하는 모델

In [9]:
y1 = (train2['class'].values == 0).astype(int)
y2 = (train2['class'].values == 1).astype(int)
y3 = (train2['class'].values == 2).astype(int)

X = train2.drop(['id', 'class'], axis=1)
X.iloc[:,:16] = X.iloc[:,:16].astype('category')


X_test = test2.drop(['id', 'class'], axis=1)
X_test.iloc[:,:16] = X_test.iloc[:,:16].astype('category')

smote = SMOTENC(categorical_features=[x for x in range(16)], random_state=random_seed, sampling_strategy=strategy)
X_train1, y_train1 = smote.fit_resample(X, y1)
X_train2, y_train2 = smote.fit_resample(X, y2)
X_train3, y_train3 = smote.fit_resample(X, y3)

X_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 40 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   trait         20000 non-null  category
 1   SNP_01        20000 non-null  category
 2   SNP_02        20000 non-null  category
 3   SNP_03        20000 non-null  category
 4   SNP_04        20000 non-null  category
 5   SNP_05        20000 non-null  category
 6   SNP_06        20000 non-null  category
 7   SNP_07        20000 non-null  category
 8   SNP_08        20000 non-null  category
 9   SNP_09        20000 non-null  category
 10  SNP_10        20000 non-null  category
 11  SNP_11        20000 non-null  category
 12  SNP_12        20000 non-null  category
 13  SNP_13        20000 non-null  category
 14  SNP_14        20000 non-null  category
 15  SNP_15        20000 non-null  category
 16  SNP_01_ratio  20000 non-null  float64 
 17  SNP_02_ratio  20000 non-null  float64 
 18  SNP_03

In [10]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 40 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   trait         175 non-null    category
 1   SNP_01        175 non-null    category
 2   SNP_02        175 non-null    category
 3   SNP_03        175 non-null    category
 4   SNP_04        175 non-null    category
 5   SNP_05        175 non-null    category
 6   SNP_06        175 non-null    category
 7   SNP_07        175 non-null    category
 8   SNP_08        175 non-null    category
 9   SNP_09        175 non-null    category
 10  SNP_10        175 non-null    category
 11  SNP_11        175 non-null    category
 12  SNP_12        175 non-null    category
 13  SNP_13        175 non-null    category
 14  SNP_14        175 non-null    category
 15  SNP_15        175 non-null    category
 16  SNP_01_ratio  175 non-null    float64 
 17  SNP_02_ratio  175 non-null    float64 
 18  SNP_03_rat

In [18]:
def lgbmc(inputX, inputY) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostClassifier(
        cat_features=var_categ,
        iterations=1000,
        learning_rate=0.03,
        task_type='GPU',
        devices='0',
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=10
        );     

    pred = model.predict(inputX)
    score = f1_score(inputY, pred, average='macro')
    print(score)
    
    return model

In [19]:
def lgbmr(inputX, inputY) :  
    var_categ = inputX.columns.tolist()[:16]
    model = CatBoostRegressor(
        cat_features=var_categ,
        iterations=1000,
        learning_rate=0.03,
        task_type='GPU',
        devices='5',
        # random_state=random_seed
        )
  
    model.fit(
        inputX, inputY,
        eval_set=(inputX, inputY),
        verbose=10
        );     

    pred = model.predict(inputX)
    score = f1_score(inputY, np.round(pred), average='macro')
    print(score)
    
    return model

In [20]:
model1_c = lgbmc(X_train1, y_train1)
model1_r = lgbmr(X_train1, y_train1)

0:	learn: 0.5860070	test: 0.5860069	best: 0.5860069 (0)	total: 44.4ms	remaining: 44.3s
10:	learn: 0.0821981	test: 0.0821981	best: 0.0821981 (10)	total: 319ms	remaining: 28.6s
20:	learn: 0.0153152	test: 0.0153111	best: 0.0153111 (20)	total: 656ms	remaining: 30.6s
30:	learn: 0.0037563	test: 0.0037552	best: 0.0037552 (30)	total: 901ms	remaining: 28.2s
40:	learn: 0.0012792	test: 0.0012789	best: 0.0012789 (40)	total: 1.15s	remaining: 26.9s
50:	learn: 0.0005422	test: 0.0005421	best: 0.0005421 (50)	total: 1.37s	remaining: 25.6s
60:	learn: 0.0002842	test: 0.0002841	best: 0.0002841 (60)	total: 1.61s	remaining: 24.8s
70:	learn: 0.0001709	test: 0.0001709	best: 0.0001709 (70)	total: 1.87s	remaining: 24.5s
80:	learn: 0.0001096	test: 0.0001095	best: 0.0001095 (80)	total: 2.08s	remaining: 23.7s
90:	learn: 0.0000818	test: 0.0000817	best: 0.0000817 (90)	total: 2.35s	remaining: 23.5s
100:	learn: 0.0000608	test: 0.0000608	best: 0.0000608 (100)	total: 2.59s	remaining: 23s
110:	learn: 0.0000493	test: 0.000

In [21]:
model2_c = lgbmc(X_train2, y_train2)
model2_r = lgbmr(X_train2, y_train2)

0:	learn: 0.6083303	test: 0.6083304	best: 0.6083304 (0)	total: 43.8ms	remaining: 43.8s
10:	learn: 0.1917412	test: 0.1917398	best: 0.1917398 (10)	total: 470ms	remaining: 42.2s
20:	learn: 0.0783548	test: 0.0783475	best: 0.0783475 (20)	total: 900ms	remaining: 41.9s
30:	learn: 0.0398843	test: 0.0398770	best: 0.0398770 (30)	total: 1.33s	remaining: 41.5s
40:	learn: 0.0240679	test: 0.0240122	best: 0.0240122 (40)	total: 1.77s	remaining: 41.3s
50:	learn: 0.0155614	test: 0.0155125	best: 0.0155125 (50)	total: 2.21s	remaining: 41s
60:	learn: 0.0111368	test: 0.0110970	best: 0.0110970 (60)	total: 2.65s	remaining: 40.7s
70:	learn: 0.0080193	test: 0.0079854	best: 0.0079854 (70)	total: 3.09s	remaining: 40.4s
80:	learn: 0.0061377	test: 0.0061119	best: 0.0061119 (80)	total: 3.53s	remaining: 40s
90:	learn: 0.0048385	test: 0.0048177	best: 0.0048177 (90)	total: 3.97s	remaining: 39.6s
100:	learn: 0.0039244	test: 0.0039088	best: 0.0039088 (100)	total: 4.4s	remaining: 39.2s
110:	learn: 0.0031565	test: 0.003141

In [22]:
model3_c = lgbmc(X_train3, y_train3)
model3_r = lgbmr(X_train3, y_train3)

0:	learn: 0.6121434	test: 0.6121434	best: 0.6121434 (0)	total: 43.2ms	remaining: 43.1s
10:	learn: 0.2131056	test: 0.2130733	best: 0.2130733 (10)	total: 467ms	remaining: 42s
20:	learn: 0.0935784	test: 0.0935544	best: 0.0935544 (20)	total: 900ms	remaining: 42s
30:	learn: 0.0457352	test: 0.0457173	best: 0.0457173 (30)	total: 1.33s	remaining: 41.7s
40:	learn: 0.0247360	test: 0.0247145	best: 0.0247145 (40)	total: 1.77s	remaining: 41.4s
50:	learn: 0.0151644	test: 0.0151418	best: 0.0151418 (50)	total: 2.21s	remaining: 41s
60:	learn: 0.0104531	test: 0.0104352	best: 0.0104352 (60)	total: 2.64s	remaining: 40.7s
70:	learn: 0.0074182	test: 0.0074047	best: 0.0074047 (70)	total: 3.08s	remaining: 40.3s
80:	learn: 0.0055994	test: 0.0055889	best: 0.0055889 (80)	total: 3.52s	remaining: 39.9s
90:	learn: 0.0045125	test: 0.0045038	best: 0.0045038 (90)	total: 3.96s	remaining: 39.5s
100:	learn: 0.0037167	test: 0.0037094	best: 0.0037094 (100)	total: 4.4s	remaining: 39.1s
110:	learn: 0.0031039	test: 0.0030976	

In [24]:
pred1 = model1_c.predict_proba(X_test)[:,1]
pred2 = model1_r.predict(X_test)
pred3 = model2_c.predict_proba(X_test)[:,1]
pred4 = model2_r.predict(X_test)
pred5 = model3_c.predict_proba(X_test)[:,1]
pred6 = model3_r.predict(X_test)

A_prob = (pred1+pred2) / 2
B_prob = (pred3+pred4) / 2
C_prob = (pred5+pred6) / 2

high1 = pd.read_csv("./data/submit_0.99078.csv")
high2 = pd.read_csv("./data/submit_0.99078_2.csv")
high3 = pd.read_csv("./data/submit_0.99078_3.csv")
high4 = pd.read_csv("./data/submit_0.99078_4.csv")

total = pd.DataFrame()
total['high1'] = high1['class']
total['high2'] = high2['class']
total['high3'] = high3['class']
total['high4'] = high4['class']

total['pred1'] = pred1
total['pred2'] = pred2
total['pred3'] = pred3
total['pred4'] = pred4
total['pred5'] = pred5
total['pred6'] = pred6

total['a_prob'] = A_prob
total['b_prob'] = B_prob
total['c_prob'] = C_prob

total

Unnamed: 0,high1,high2,high3,high4,pred1,pred2,pred3,pred4,pred5,pred6,a_prob,b_prob,c_prob
0,A,A,A,A,0.999998,0.999842,0.000043,-0.000210,0.000056,0.019018,0.999920,-0.000084,0.009537
1,B,B,B,B,0.000003,0.000093,0.999956,0.967438,0.000002,-0.004520,0.000048,0.983697,-0.002259
2,C,C,C,C,0.000002,0.000695,0.000018,-0.014920,0.999978,1.019449,0.000349,-0.007451,1.009713
3,C,C,C,C,0.000002,0.001068,0.974353,0.901688,0.044700,0.336554,0.000535,0.938021,0.190627
4,A,A,A,A,0.999998,0.987315,0.000020,-0.005206,0.000031,0.003629,0.993657,-0.002593,0.001830
...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,B,B,B,B,0.000013,0.040736,0.996805,0.911411,0.000980,-0.052490,0.020375,0.954108,-0.025755
171,C,C,C,C,0.000007,0.046851,0.000043,-0.025170,0.999982,0.993343,0.023429,-0.012564,0.996662
172,C,C,C,C,0.000008,0.051530,0.000990,0.192435,0.999384,0.846758,0.025769,0.096713,0.923071
173,B,B,B,B,0.000015,0.040273,0.999985,0.984672,0.000015,-0.033059,0.020144,0.992329,-0.016522


In [25]:
# exception test - Regressor와 Classifier가 서로 다른 판단을 하는 경우 
a_except = total[(total.pred1 >= 0.5) & (total.pred2 < 0.5)].index.tolist() + total[(total.pred2 >= 0.5) & (total.pred1 < 0.5)].index.tolist()
b_except = total[(total.pred3 >= 0.5) & (total.pred4 < 0.5)].index.tolist() + total[(total.pred4 >= 0.5) & (total.pred3 < 0.5)].index.tolist()
c_except = total[(total.pred5 >= 0.5) & (total.pred6 < 0.5)].index.tolist() + total[(total.pred6 >= 0.5) & (total.pred5 < 0.5)].index.tolist()

print(len(a_except), len(b_except), len(c_except))

0 3 4


In [26]:
total.iloc[b_except]

Unnamed: 0,high1,high2,high3,high4,pred1,pred2,pred3,pred4,pred5,pred6,a_prob,b_prob,c_prob
126,B,C,B,B,9e-06,0.058636,0.664106,0.461569,0.154662,0.546794,0.029323,0.562837,0.350728
162,C,C,C,C,7e-06,0.048797,0.670133,0.407301,0.060518,0.402536,0.024402,0.538717,0.231527
119,C,C,C,C,7e-06,0.037209,0.112779,0.649402,0.845461,0.377288,0.018608,0.38109,0.611374


In [27]:
total.iloc[c_except]

Unnamed: 0,high1,high2,high3,high4,pred1,pred2,pred3,pred4,pred5,pred6,a_prob,b_prob,c_prob
119,C,C,C,C,7e-06,0.037209,0.112779,0.649402,0.845461,0.377288,0.018608,0.38109,0.611374
5,C,C,C,C,3e-06,-0.000146,0.884922,0.73692,0.266671,0.711998,-7.2e-05,0.810921,0.489335
19,C,C,C,C,4e-06,0.000761,0.071127,0.11717,0.351018,0.555056,0.000383,0.094148,0.453037
126,B,C,B,B,9e-06,0.058636,0.664106,0.461569,0.154662,0.546794,0.029323,0.562837,0.350728


# STEP 02. 복합적으로 고려하는 모델
- B와 C만 있을 때 구분
- 모두 있을 때 각각 구분

In [30]:
train3 = train2[train2['class'] != 0].copy()

y4 = (train3['class'].values).astype(int) - 1

X2 = train3.drop(['id', 'class'], axis=1)
X2.iloc[:,:15] = X2.iloc[:,:15].astype('category')

strategy1 = {0:12000, 1:10000}
strategy2 = {0:10000, 1:10000}
strategy3 = {0:10000, 1:12000}

smote1 = SMOTENC(categorical_features=[x for x in range(16)], random_state=random_seed, sampling_strategy=strategy1)
smote2 = SMOTENC(categorical_features=[x for x in range(16)], random_state=random_seed, sampling_strategy=strategy2)
smote3 = SMOTENC(categorical_features=[x for x in range(16)], random_state=random_seed, sampling_strategy=strategy3)
X_train4, y_train4 = smote1.fit_resample(X2, y4)
X_train5, y_train5 = smote1.fit_resample(X2, y4)
X_train5, y_train6 = smote1.fit_resample(X2, y4)
train3['class'].value_counts(), train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 261
Data columns (total 42 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            193 non-null    object 
 1   trait         193 non-null    int64  
 2   SNP_01        193 non-null    int64  
 3   SNP_02        193 non-null    int64  
 4   SNP_03        193 non-null    int64  
 5   SNP_04        193 non-null    int64  
 6   SNP_05        193 non-null    int64  
 7   SNP_06        193 non-null    int64  
 8   SNP_07        193 non-null    int64  
 9   SNP_08        193 non-null    int64  
 10  SNP_09        193 non-null    int64  
 11  SNP_10        193 non-null    int64  
 12  SNP_11        193 non-null    int64  
 13  SNP_12        193 non-null    int64  
 14  SNP_13        193 non-null    int64  
 15  SNP_14        193 non-null    int64  
 16  SNP_15        193 non-null    int64  
 17  class         193 non-null    int64  
 18  SNP_01_ratio  193 non-null    

(1    114
 2     79
 Name: class, dtype: int64,
 None)

In [31]:
model4_c = lgbmc(X_train4, y_train4)
model4_r = lgbmr(X_train4, y_train4)

0:	learn: 0.6156562	test: 0.6156562	best: 0.6156562 (0)	total: 42.3ms	remaining: 42.2s
10:	learn: 0.2043908	test: 0.2043180	best: 0.2043180 (10)	total: 457ms	remaining: 41.1s
20:	learn: 0.0879308	test: 0.0878722	best: 0.0878722 (20)	total: 876ms	remaining: 40.9s
30:	learn: 0.0419540	test: 0.0419155	best: 0.0419155 (30)	total: 1.3s	remaining: 40.6s
40:	learn: 0.0240930	test: 0.0240653	best: 0.0240653 (40)	total: 1.73s	remaining: 40.4s
50:	learn: 0.0139526	test: 0.0139306	best: 0.0139306 (50)	total: 2.15s	remaining: 40s
60:	learn: 0.0098886	test: 0.0098757	best: 0.0098757 (60)	total: 2.58s	remaining: 39.7s
70:	learn: 0.0074185	test: 0.0074121	best: 0.0074121 (70)	total: 3s	remaining: 39.3s
80:	learn: 0.0056897	test: 0.0056852	best: 0.0056852 (80)	total: 3.42s	remaining: 38.9s
90:	learn: 0.0043517	test: 0.0043458	best: 0.0043458 (90)	total: 3.85s	remaining: 38.4s
100:	learn: 0.0034548	test: 0.0034495	best: 0.0034495 (100)	total: 4.27s	remaining: 38s
110:	learn: 0.0028992	test: 0.0028944	b

In [32]:
model5_c = lgbmc(X_train5, y_train5)
model5_r = lgbmr(X_train5, y_train5)

0:	learn: 0.6156561	test: 0.6156562	best: 0.6156562 (0)	total: 42.4ms	remaining: 42.4s
10:	learn: 0.2043908	test: 0.2043180	best: 0.2043180 (10)	total: 464ms	remaining: 41.7s
20:	learn: 0.0879308	test: 0.0878722	best: 0.0878722 (20)	total: 891ms	remaining: 41.5s
30:	learn: 0.0419657	test: 0.0419266	best: 0.0419266 (30)	total: 1.32s	remaining: 41.4s
40:	learn: 0.0240979	test: 0.0240701	best: 0.0240701 (40)	total: 1.76s	remaining: 41.1s
50:	learn: 0.0142879	test: 0.0142730	best: 0.0142730 (50)	total: 2.19s	remaining: 40.8s
60:	learn: 0.0100396	test: 0.0100318	best: 0.0100318 (60)	total: 2.62s	remaining: 40.4s
70:	learn: 0.0072432	test: 0.0072345	best: 0.0072345 (70)	total: 3.05s	remaining: 40s
80:	learn: 0.0054223	test: 0.0054168	best: 0.0054168 (80)	total: 3.48s	remaining: 39.5s
90:	learn: 0.0041662	test: 0.0041572	best: 0.0041572 (90)	total: 3.91s	remaining: 39.1s
100:	learn: 0.0034151	test: 0.0034074	best: 0.0034074 (100)	total: 4.34s	remaining: 38.7s
110:	learn: 0.0028962	test: 0.002

In [33]:
model6_c = lgbmc(X_train5, y_train5)
model6_r = lgbmr(X_train5, y_train5)

0:	learn: 0.6156561	test: 0.6156562	best: 0.6156562 (0)	total: 43.1ms	remaining: 43s
10:	learn: 0.2043908	test: 0.2043180	best: 0.2043180 (10)	total: 462ms	remaining: 41.6s
20:	learn: 0.0879313	test: 0.0878726	best: 0.0878726 (20)	total: 880ms	remaining: 41s
30:	learn: 0.0419420	test: 0.0419054	best: 0.0419054 (30)	total: 1.3s	remaining: 40.7s
40:	learn: 0.0240885	test: 0.0240619	best: 0.0240619 (40)	total: 1.73s	remaining: 40.4s
50:	learn: 0.0139504	test: 0.0139292	best: 0.0139292 (50)	total: 2.16s	remaining: 40.1s
60:	learn: 0.0098872	test: 0.0098751	best: 0.0098751 (60)	total: 2.59s	remaining: 39.9s
70:	learn: 0.0074178	test: 0.0074118	best: 0.0074118 (70)	total: 3.02s	remaining: 39.5s
80:	learn: 0.0056892	test: 0.0056851	best: 0.0056851 (80)	total: 3.45s	remaining: 39.1s
90:	learn: 0.0043512	test: 0.0043457	best: 0.0043457 (90)	total: 3.88s	remaining: 38.7s
100:	learn: 0.0034546	test: 0.0034495	best: 0.0034495 (100)	total: 4.31s	remaining: 38.4s
110:	learn: 0.0028991	test: 0.002894

In [34]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 40 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   trait         175 non-null    category
 1   SNP_01        175 non-null    category
 2   SNP_02        175 non-null    category
 3   SNP_03        175 non-null    category
 4   SNP_04        175 non-null    category
 5   SNP_05        175 non-null    category
 6   SNP_06        175 non-null    category
 7   SNP_07        175 non-null    category
 8   SNP_08        175 non-null    category
 9   SNP_09        175 non-null    category
 10  SNP_10        175 non-null    category
 11  SNP_11        175 non-null    category
 12  SNP_12        175 non-null    category
 13  SNP_13        175 non-null    category
 14  SNP_14        175 non-null    category
 15  SNP_15        175 non-null    category
 16  SNP_01_ratio  175 non-null    float64 
 17  SNP_02_ratio  175 non-null    float64 
 18  SNP_03_rat

In [35]:
pred7 = model4_c.predict_proba(X_test)[:,1]
pred8 = model4_r.predict(X_test)
pred9 = model5_c.predict_proba(X_test)[:,1]
pred10 = model5_r.predict(X_test)
pred11 = model6_c.predict_proba(X_test)[:,1]
pred12 = model6_r.predict(X_test)

BC_prob_auto = (pred7+pred8) / 2
BC_prob_1000 = (pred9+pred10) / 2
BC_prob_1200 = (pred11+pred12) / 2

In [36]:
total['pred7'] = pred7
total['pred8'] = pred8
total['pred9'] = pred9
total['pred10'] = pred10
total['pred11'] = pred11
total['pred12'] = pred12

total['bc_prob_auto'] = BC_prob_auto
total['bc_prob_1000'] = BC_prob_1000
total['bc_prob_1200'] = BC_prob_1200

total

Unnamed: 0,high1,high2,high3,high4,pred1,pred2,pred3,pred4,pred5,pred6,...,c_prob,pred7,pred8,pred9,pred10,pred11,pred12,bc_prob_auto,bc_prob_1000,bc_prob_1200
0,A,A,A,A,0.999998,0.999842,0.000043,-0.000210,0.000056,0.019018,...,0.009537,0.984896,0.652845,0.985068,0.636599,0.984893,0.638248,0.818871,0.810834,0.811571
1,B,B,B,B,0.000003,0.000093,0.999956,0.967438,0.000002,-0.004520,...,-0.002259,0.000011,0.004551,0.000011,0.023287,0.000011,-0.007182,0.002281,0.011649,-0.003586
2,C,C,C,C,0.000002,0.000695,0.000018,-0.014920,0.999978,1.019449,...,1.009713,0.999993,0.984062,0.999991,0.980808,0.999993,0.976806,0.992028,0.990399,0.988400
3,C,C,C,C,0.000002,0.001068,0.974353,0.901688,0.044700,0.336554,...,0.190627,0.049586,0.139851,0.047156,0.141569,0.049593,0.132007,0.094718,0.094363,0.090800
4,A,A,A,A,0.999998,0.987315,0.000020,-0.005206,0.000031,0.003629,...,0.001830,0.756182,0.537124,0.672003,0.556073,0.756125,0.574288,0.646653,0.614038,0.665207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,B,B,B,B,0.000013,0.040736,0.996805,0.911411,0.000980,-0.052490,...,-0.025755,0.000855,0.011362,0.000492,-0.003736,0.000856,0.018739,0.006109,-0.001622,0.009797
171,C,C,C,C,0.000007,0.046851,0.000043,-0.025170,0.999982,0.993343,...,0.996662,0.999988,1.037154,0.999988,1.036262,0.999988,1.035602,1.018571,1.018125,1.017795
172,C,C,C,C,0.000008,0.051530,0.000990,0.192435,0.999384,0.846758,...,0.923071,0.999650,0.901294,0.999697,0.893223,0.999650,0.893547,0.950472,0.946460,0.946599
173,B,B,B,B,0.000015,0.040273,0.999985,0.984672,0.000015,-0.033059,...,-0.016522,0.000005,0.002835,0.000008,-0.000979,0.000005,0.015713,0.001420,-0.000486,0.007859


In [37]:
total.iloc[b_except]

Unnamed: 0,high1,high2,high3,high4,pred1,pred2,pred3,pred4,pred5,pred6,...,c_prob,pred7,pred8,pred9,pred10,pred11,pred12,bc_prob_auto,bc_prob_1000,bc_prob_1200
126,B,C,B,B,9e-06,0.058636,0.664106,0.461569,0.154662,0.546794,...,0.350728,0.699084,0.62744,0.748797,0.620627,0.699184,0.618404,0.663262,0.684712,0.658794
162,C,C,C,C,7e-06,0.048797,0.670133,0.407301,0.060518,0.402536,...,0.231527,0.649556,0.523219,0.648165,0.504038,0.649673,0.512481,0.586387,0.576102,0.581077
119,C,C,C,C,7e-06,0.037209,0.112779,0.649402,0.845461,0.377288,...,0.611374,0.895793,0.285113,0.884814,0.279076,0.895766,0.277875,0.590453,0.581945,0.58682


In [38]:
total.iloc[c_except]

Unnamed: 0,high1,high2,high3,high4,pred1,pred2,pred3,pred4,pred5,pred6,...,c_prob,pred7,pred8,pred9,pred10,pred11,pred12,bc_prob_auto,bc_prob_1000,bc_prob_1200
119,C,C,C,C,7e-06,0.037209,0.112779,0.649402,0.845461,0.377288,...,0.611374,0.895793,0.285113,0.884814,0.279076,0.895766,0.277875,0.590453,0.581945,0.58682
5,C,C,C,C,3e-06,-0.000146,0.884922,0.73692,0.266671,0.711998,...,0.489335,0.177246,0.475579,0.116034,0.476764,0.177308,0.486337,0.326412,0.296399,0.331822
19,C,C,C,C,4e-06,0.000761,0.071127,0.11717,0.351018,0.555056,...,0.453037,0.915079,0.679139,0.934524,0.704908,0.915041,0.719168,0.797109,0.819716,0.817105
126,B,C,B,B,9e-06,0.058636,0.664106,0.461569,0.154662,0.546794,...,0.350728,0.699084,0.62744,0.748797,0.620627,0.699184,0.618404,0.663262,0.684712,0.658794


In [41]:
a_index = total[total.a_prob >= 0.5].index.tolist()
b_index = total[total.b_prob >= 0.5].index.tolist()
c_index = total[total.c_prob >= 0.5].index.tolist()

except_index = sorted(list(set(b_except + c_except)))
for e in except_index :
    score = np.mean(total.iloc[e, -3:])
    if score <= 0.5 :
        if e in b_index :
            b_index.remove(e)
        if e in c_index :
            c_index.remove(e)
        
        b_index.append(e)
    else :
        if e in b_index :
            b_index.remove(e)
        if e in c_index :
            c_index.remove(e)
            
        c_index.append(e)

print(len(a_index), len(b_index), len(c_index), len(a_index)+len(b_index)+len(c_index))

51 85 39 175


In [42]:
answer = np.zeros(len(X_test))-1
answer[a_index] = 0
answer[b_index] = 1
answer[c_index] = 2
pd.DataFrame(answer).value_counts()

1.0    85
0.0    51
2.0    39
dtype: int64

In [44]:
total['answer'] = answer
total['target'] = total.answer.map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
total[['high1', 'high2', 'high3', 'high4', 'target']]

Unnamed: 0,high1,high2,high3,high4,target
0,A,A,A,A,A
1,B,B,B,B,B
2,C,C,C,C,C
3,C,C,C,C,B
4,A,A,A,A,A
...,...,...,...,...,...
170,B,B,B,B,B
171,C,C,C,C,C
172,C,C,C,C,C
173,B,B,B,B,B


In [47]:
total[total.target != total.high1][['target', 'high1', 'high2', 'high3', 'high4']]

Unnamed: 0,target,high1,high2,high3,high4
3,B,C,C,C,C
5,B,C,C,C,C
126,C,B,C,B,B


In [48]:
total[total.target != total.high2][['target', 'high1', 'high2', 'high3', 'high4']]

Unnamed: 0,target,high1,high2,high3,high4
3,B,C,C,C,C
5,B,C,C,C,C
12,B,B,C,C,C


In [49]:
total[total.target != total.high3][['target', 'high1', 'high2', 'high3', 'high4']]

Unnamed: 0,target,high1,high2,high3,high4
3,B,C,C,C,C
5,B,C,C,C,C
12,B,B,C,C,C
126,C,B,C,B,B


In [50]:
total[total.target != total.high4][['target', 'high1', 'high2', 'high3', 'high4']]

Unnamed: 0,target,high1,high2,high3,high4
3,B,C,C,C,C
5,B,C,C,C,C
12,B,B,C,C,C
126,C,B,C,B,B
168,B,B,B,B,C


In [51]:
submit = high1.copy()
submit['class'] = total.target
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,B
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [53]:
submit.iloc[[3,5,12,119,126,162,168]]

Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
12,TEST_012,B
119,TEST_119,C
126,TEST_126,C
162,TEST_162,C
168,TEST_168,B


In [52]:
submit.to_csv("submit.csv", index=False)

In [54]:
df1 = pd.read_csv("./data/submit_0.99078.csv")
df2 = pd.read_csv("./data/submit_0.99078_2.csv")
df3 = pd.read_csv("./data/submit_0.99078_3.csv")
df4 = pd.read_csv("./data/submit_0.99078_4.csv")

In [55]:
# submit 기준
display(submit[submit['class'] != df1['class']],submit[submit['class'] != df2['class']],submit[submit['class'] != df3['class']],submit[submit['class'] != df4['class']]) 

Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
126,TEST_126,C


Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
12,TEST_012,B


Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
12,TEST_012,B
126,TEST_126,C


Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
12,TEST_012,B
126,TEST_126,C
168,TEST_168,B


In [84]:
total['answer'] = total_index
total.iloc[[3, 5, 12, 119, 168], [0,1,2,3,-4, -3, -2, -1]]

Unnamed: 0,high1,high2,high3,high4,a_prob,b_prob,c_prob,answer
3,C,C,C,C,0.059954,1.81694,0.610793,1
5,C,C,C,C,-0.00088,1.602311,1.031356,1
12,B,C,C,C,0.006151,0.867814,0.338562,1
119,C,C,C,C,0.000587,1.078331,0.992099,1
168,B,B,B,C,-0.001244,1.410336,0.296742,1


In [85]:
total[(total.b_prob > 1.0) & (total.c_prob > 0.5)]

Unnamed: 0,high1,high2,high3,high4,pred1,pred2,pred3,pred4,pred5,pred6,a_prob,b_prob,c_prob,answer
3,C,C,C,C,0.000703,0.059251,0.989858,0.827082,0.257562,0.35323,0.059954,1.81694,0.610793,1
5,C,C,C,C,0.000287,-0.001167,0.975314,0.626997,0.490775,0.540581,-0.00088,1.602311,1.031356,1
35,B,B,B,B,0.000482,0.001193,0.8119,0.645094,0.223033,0.364383,0.001676,1.456993,0.587417,1
119,C,C,C,C,0.000229,0.000358,0.413266,0.665065,0.578031,0.414068,0.000587,1.078331,0.992099,1
126,B,C,B,B,0.000968,0.049571,0.669163,0.379141,0.249582,0.501944,0.050539,1.048304,0.751527,1


In [None]:
total_index2 = np.where(a_prob > 1.0, 0, np.where(c_prob > 0.6, 2, 

In [None]:
# submit 기준
display(submit[submit['class'] != df1['class']],submit[submit['class'] != df2['class']],submit[submit['class'] != df3['class']],submit[submit['class'] != df4['class']]) 

In [79]:
sum(high1['class'] == submit['class'])

172

In [63]:
submit['class'].value_counts()

B    85
A    51
C    39
Name: class, dtype: int64

In [65]:
high1['class'].value_counts()

B    84
A    51
C    40
Name: class, dtype: int64

In [52]:
submit.to_csv("submit.csv", index=False)

In [68]:
high1 = pd.read_csv("./data/submit_0.99078.csv")
high2 = pd.read_csv("./data/submit_0.99078_2.csv")
high3 = pd.read_csv("./data/submit_0.99078_3.csv")
high4 = pd.read_csv("./data/submit_0.99078_4.csv")

middle1 = pd.read_csv("./data/submit_0.98142.csv")

low1 = pd.read_csv("./submit_0.9622.csv")
low2 = pd.read_csv("./submit_0.9622_2.csv")
low3 = pd.read_csv("./submit_0.9719.csv")

In [70]:
targets = [high1, high2, high3, high4, middle1, low1, low2, low3]

idx_list = []
for i in range(len(targets)-1) :
    target = targets[i]
    for j in range(len(targets[i+1:])) :
        diff_idx = target[target['class'] != targets[j]['class']].index.tolist()
        idx_list += diff_idx
        
idx_list = sorted(list(set(idx_list)))
idx_list

[3, 5, 12, 119, 126, 168]

In [72]:
idx_list = sorted(idx_list+[162])

In [73]:
display(high1.iloc[idx_list],high2.iloc[idx_list],high3.iloc[idx_list],high4.iloc[idx_list],middle1.iloc[idx_list],low1.iloc[idx_list],low2.iloc[idx_list],low3.iloc[idx_list])

Unnamed: 0,id,class
3,TEST_003,C
5,TEST_005,C
12,TEST_012,B
119,TEST_119,C
126,TEST_126,B
162,TEST_162,C
168,TEST_168,B


Unnamed: 0,id,class
3,TEST_003,C
5,TEST_005,C
12,TEST_012,C
119,TEST_119,C
126,TEST_126,C
162,TEST_162,C
168,TEST_168,B


Unnamed: 0,id,class
3,TEST_003,C
5,TEST_005,C
12,TEST_012,C
119,TEST_119,C
126,TEST_126,B
162,TEST_162,C
168,TEST_168,B


Unnamed: 0,id,class
3,TEST_003,C
5,TEST_005,C
12,TEST_012,C
119,TEST_119,C
126,TEST_126,B
162,TEST_162,C
168,TEST_168,C


Unnamed: 0,id,class
3,TEST_003,C
5,TEST_005,B
12,TEST_012,B
119,TEST_119,C
126,TEST_126,C
162,TEST_162,C
168,TEST_168,B


Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
12,TEST_012,A
119,TEST_119,B
126,TEST_126,B
162,TEST_162,C
168,TEST_168,B


Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
12,TEST_012,B
119,TEST_119,B
126,TEST_126,B
162,TEST_162,C
168,TEST_168,B


Unnamed: 0,id,class
3,TEST_003,B
5,TEST_005,B
12,TEST_012,B
119,TEST_119,C
126,TEST_126,C
162,TEST_162,C
168,TEST_168,B


In [67]:
display(low3[low3['class'] != low1['class']], low1[low3['class'] != low1['class']])

Unnamed: 0,id,class
12,TEST_012,B
119,TEST_119,C
126,TEST_126,C


Unnamed: 0,id,class
12,TEST_012,A
119,TEST_119,B
126,TEST_126,B


In [62]:
display(high3[high3['class'] != high1['class']], high1[high3['class'] != high1['class']])

Unnamed: 0,id,class
12,TEST_012,C


Unnamed: 0,id,class
12,TEST_012,B


In [63]:
display(high3[high3['class'] != high2['class']], high2[high3['class'] != high2['class']])

Unnamed: 0,id,class
126,TEST_126,B


Unnamed: 0,id,class
126,TEST_126,C
