In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score,recall_score,f1_score,roc_auc_score
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.model_selection import cross_val_score
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
def get_clf_eval(y_test,pred=None,pred_proba=None):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    f1 = f1_score(y_test,pred)
    roc_auc = roc_auc_score(y_test,pred_proba)
    print(confusion)
    print('정확도:{0:4f}, 정밀도:{1:4f}, 재현율:{2:4f},F1 : {3:4f}, ROC AUC : {4:4f}'.format(accuracy,precision,recall,f1,roc_auc))

In [3]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test,pred_proba_c1,thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임계값',custom_threshold)
        get_clf_eval(y_test,custom_predict,pred_proba_c1)
        print('\n')

### Train 데이터 불러오기

In [4]:
df = pd.read_csv("데이콘/train.csv")

In [5]:
df['target'].value_counts()

0    251106
1    250845
Name: target, dtype: int64

In [6]:
drop_columns = ['id','person_rn','contents_rn','contents_open_dt']
df = df.drop(drop_columns,axis=1)

In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
list1 = ['d_l_match_yn','d_m_match_yn','d_s_match_yn','h_l_match_yn','h_m_match_yn','h_s_match_yn']
for i in list1:
    df[i] = encoder.fit_transform(df[i])

In [8]:
df.shape

(501951, 31)

In [9]:
pd.set_option('display.max_columns', None)
df.head(5)

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,target
0,1,1,1,0,0,0,1,4,3,5,275,370,369,8,1,1,4,95,59,3,3,10,2,1,2,1608,275,1,4,139,1
1,0,0,0,1,1,0,1,3,4,1,114,181,175,4,1,1,131,101,96,1,3,5,1,1,2,1608,275,1,4,133,0
2,0,0,0,1,0,0,2,0,3,5,464,175,452,3,1,1,54,263,56,3,1,10,2,1,1,1600,94,1,4,53,0
3,0,0,0,1,0,0,2,0,2,5,703,705,704,3,1,1,72,227,2,1,3,5,1,1,2,1608,275,5,3,74,0
4,1,1,1,0,0,0,1,3,4,5,275,370,369,4,1,1,214,210,209,1,1,10,2,1,2,1608,275,1,4,74,0


### Isolation Forest 이상탐지

In [10]:
from sklearn.ensemble import IsolationForest

In [11]:
clf=IsolationForest(contamination=0.78,max_samples=100)
clf.fit(df)

IsolationForest(contamination=0.78, max_samples=100)

In [12]:
pred = clf.predict(df)
df['anomaly']=pred
outliers=df.loc[df['anomaly']==-1]
outlier_index=list(outliers.index)
print(df['anomaly'].value_counts()) # -1 --> 이상치, 1 --> 정상값

-1    391521
 1    110430
Name: anomaly, dtype: int64


In [13]:
train = df[df['anomaly']==1]
train.shape # 정상값 11,000대 또는 14,000개 이상 출력해야 F1 0.71이상 나옴. 최대 0.72 찍힘 

(110430, 32)

In [14]:
X = train.drop(['target','anomaly'],axis=1,inplace=False)
y = train['target']

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2021)

### 학습 & 결과 

In [16]:
lgbm = LGBMClassifier(random_state=2021)
lgbm.fit(X_train,y_train)      
pred = lgbm.predict(X_test)
pred_proba = lgbm.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
[[ 1431 14427]
 [  330 16941]]
정확도:0.554559, 정밀도:0.540073, 재현율:0.980893,F1 : 0.696601, ROC AUC : 0.653728


임계값 0.35
[[ 2325 13533]
 [  720 16551]]
정확도:0.569773, 정밀도:0.550160, 재현율:0.958312,F1 : 0.699018, ROC AUC : 0.653728


임계값 0.38
[[ 3133 12725]
 [ 1139 16132]]
정확도:0.581515, 정밀도:0.559032, 재현율:0.934051,F1 : 0.699445, ROC AUC : 0.653728


임계값 0.4
[[ 3754 12104]
 [ 1511 15760]]
정확도:0.589031, 정밀도:0.565604, 재현율:0.912512,F1 : 0.698349, ROC AUC : 0.653728


임계값 0.45
[[ 5789 10069]
 [ 2940 14331]]
정확도:0.607323, 정밀도:0.587336, 재현율:0.829772,F1 : 0.687816, ROC AUC : 0.653728


임계값 0.5
[[ 8174  7684]
 [ 5151 12120]]
정확도:0.612575, 정밀도:0.611998, 재현율:0.701754,F1 : 0.653810, ROC AUC : 0.653728




In [17]:
lgbm2 = LGBMClassifier(random_state=2021,
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=-1,
    scale_pos_weight=1,
    seed=2019
)
lgbm2.fit(X_train,y_train)      
pred = lgbm2.predict(X_test)
pred_proba = lgbm2.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
[[ 2790 13068]
 [  934 16337]]
정확도:0.577349, 정밀도:0.555586, 재현율:0.945921,F1 : 0.700017, ROC AUC : 0.661749


임계값 0.35
[[ 3898 11960]
 [ 1523 15748]]
정확도:0.593015, 정밀도:0.568356, 재현율:0.911817,F1 : 0.700238, ROC AUC : 0.661749


임계값 0.38
[[ 4634 11224]
 [ 2030 15241]]
정확도:0.599928, 정밀도:0.575893, 재현율:0.882462,F1 : 0.696954, ROC AUC : 0.661749


임계값 0.4
[[ 5256 10602]
 [ 2421 14850]]
정확도:0.606900, 정밀도:0.583451, 재현율:0.859823,F1 : 0.695176, ROC AUC : 0.661749


임계값 0.45
[[ 6832  9026]
 [ 3666 13605]]
정확도:0.616892, 정밀도:0.601167, 재현율:0.787737,F1 : 0.681921, ROC AUC : 0.661749


임계값 0.5
[[ 8579  7279]
 [ 5391 11880]]
정확도:0.617556, 정밀도:0.620074, 재현율:0.687858,F1 : 0.652210, ROC AUC : 0.661749




In [18]:
xgb = XGBClassifier(random_state=2021)
xgb.fit(X_train,y_train)
pred = xgb.predict(X_test)
pred_proba = xgb.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
[[ 2407 13451]
 [  736 16535]]
정확도:0.571765, 정밀도:0.551424, 재현율:0.957385,F1 : 0.699791, ROC AUC : 0.653816


임계값 0.35
[[ 3469 12389]
 [ 1367 15904]]
정확도:0.584775, 정밀도:0.562118, 재현율:0.920850,F1 : 0.698095, ROC AUC : 0.653816


임계값 0.38
[[ 4274 11584]
 [ 1846 15425]]
정확도:0.594615, 정밀도:0.571106, 재현율:0.893116,F1 : 0.696703, ROC AUC : 0.653816


임계값 0.4
[[ 4877 10981]
 [ 2252 15019]]
정확도:0.600561, 정밀도:0.577654, 재현율:0.869608,F1 : 0.694183, ROC AUC : 0.653816


임계값 0.45
[[ 6586  9272]
 [ 3600 13671]]
정확도:0.611458, 정밀도:0.595868, 재현율:0.791558,F1 : 0.679912, ROC AUC : 0.653816


임계값 0.5
[[ 8493  7365]
 [ 5409 11862]]
정확도:0.614416, 정밀도:0.616945, 재현율:0.686816,F1 : 0.650008, ROC AUC : 0.653816




In [19]:
xgb2 = XGBClassifier(random_state=2021,
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=2019
)
xgb2.fit(X_train,y_train)
pred = xgb2.predict(X_test)
pred_proba = xgb2.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
[[ 2847 13011]
 [  928 16343]]
정확도:0.579251, 정밀도:0.556755, 재현율:0.946268,F1 : 0.701040, ROC AUC : 0.664578


임계값 0.35
[[ 3992 11866]
 [ 1571 15700]]
정확도:0.594404, 정밀도:0.569542, 재현율:0.909038,F1 : 0.700314, ROC AUC : 0.664578


임계값 0.38
[[ 4820 11038]
 [ 2087 15184]]
정확도:0.603821, 정밀도:0.579056, 재현율:0.879162,F1 : 0.698227, ROC AUC : 0.664578


임계값 0.4
[[ 5429 10429]
 [ 2501 14770]]
정확도:0.609708, 정밀도:0.586134, 재현율:0.855191,F1 : 0.695550, ROC AUC : 0.664578


임계값 0.45
[[ 6997  8861]
 [ 3741 13530]]
정확도:0.619608, 정밀도:0.604261, 재현율:0.783394,F1 : 0.682265, ROC AUC : 0.664578


임계값 0.5
[[ 8733  7125]
 [ 5396 11875]]
정확도:0.622053, 정밀도:0.625000, 재현율:0.687569,F1 : 0.654793, ROC AUC : 0.664578




In [20]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier(random_state=2021)
cat.fit(X_train,y_train)      
pred = cat.predict(X_test) 
pred_proba = cat.predict_proba(X_test)[:,1] 
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

Learning rate set to 0.065946
0:	learn: 0.6910186	total: 168ms	remaining: 2m 48s
1:	learn: 0.6889011	total: 196ms	remaining: 1m 37s
2:	learn: 0.6872175	total: 220ms	remaining: 1m 13s
3:	learn: 0.6856444	total: 244ms	remaining: 1m
4:	learn: 0.6843018	total: 267ms	remaining: 53.1s
5:	learn: 0.6830897	total: 289ms	remaining: 47.8s
6:	learn: 0.6820006	total: 308ms	remaining: 43.7s
7:	learn: 0.6809315	total: 327ms	remaining: 40.5s
8:	learn: 0.6798803	total: 345ms	remaining: 38s
9:	learn: 0.6790574	total: 368ms	remaining: 36.4s
10:	learn: 0.6781366	total: 387ms	remaining: 34.8s
11:	learn: 0.6773098	total: 408ms	remaining: 33.6s
12:	learn: 0.6765546	total: 426ms	remaining: 32.3s
13:	learn: 0.6759555	total: 443ms	remaining: 31.2s
14:	learn: 0.6753838	total: 460ms	remaining: 30.2s
15:	learn: 0.6748432	total: 476ms	remaining: 29.3s
16:	learn: 0.6743014	total: 494ms	remaining: 28.6s
17:	learn: 0.6737676	total: 511ms	remaining: 27.9s
18:	learn: 0.6733910	total: 528ms	remaining: 27.3s
19:	learn: 0.

161:	learn: 0.6533072	total: 3.33s	remaining: 17.2s
162:	learn: 0.6532561	total: 3.35s	remaining: 17.2s
163:	learn: 0.6531818	total: 3.37s	remaining: 17.2s
164:	learn: 0.6530840	total: 3.4s	remaining: 17.2s
165:	learn: 0.6529888	total: 3.42s	remaining: 17.2s
166:	learn: 0.6528877	total: 3.43s	remaining: 17.1s
167:	learn: 0.6528093	total: 3.45s	remaining: 17.1s
168:	learn: 0.6527507	total: 3.47s	remaining: 17s
169:	learn: 0.6526515	total: 3.48s	remaining: 17s
170:	learn: 0.6525271	total: 3.5s	remaining: 17s
171:	learn: 0.6524234	total: 3.52s	remaining: 16.9s
172:	learn: 0.6523308	total: 3.53s	remaining: 16.9s
173:	learn: 0.6522104	total: 3.55s	remaining: 16.8s
174:	learn: 0.6520934	total: 3.56s	remaining: 16.8s
175:	learn: 0.6520125	total: 3.58s	remaining: 16.8s
176:	learn: 0.6519316	total: 3.6s	remaining: 16.7s
177:	learn: 0.6518618	total: 3.62s	remaining: 16.7s
178:	learn: 0.6517646	total: 3.63s	remaining: 16.7s
179:	learn: 0.6516434	total: 3.65s	remaining: 16.6s
180:	learn: 0.6515834

324:	learn: 0.6387809	total: 6.66s	remaining: 13.8s
325:	learn: 0.6386996	total: 6.69s	remaining: 13.8s
326:	learn: 0.6386314	total: 6.71s	remaining: 13.8s
327:	learn: 0.6385753	total: 6.74s	remaining: 13.8s
328:	learn: 0.6384759	total: 6.76s	remaining: 13.8s
329:	learn: 0.6383943	total: 6.78s	remaining: 13.8s
330:	learn: 0.6383229	total: 6.81s	remaining: 13.8s
331:	learn: 0.6382445	total: 6.84s	remaining: 13.8s
332:	learn: 0.6381736	total: 6.88s	remaining: 13.8s
333:	learn: 0.6380852	total: 6.91s	remaining: 13.8s
334:	learn: 0.6380271	total: 6.93s	remaining: 13.8s
335:	learn: 0.6379458	total: 6.95s	remaining: 13.7s
336:	learn: 0.6378680	total: 6.97s	remaining: 13.7s
337:	learn: 0.6378171	total: 6.99s	remaining: 13.7s
338:	learn: 0.6377249	total: 7.01s	remaining: 13.7s
339:	learn: 0.6376704	total: 7.03s	remaining: 13.7s
340:	learn: 0.6375904	total: 7.05s	remaining: 13.6s
341:	learn: 0.6375131	total: 7.08s	remaining: 13.6s
342:	learn: 0.6374544	total: 7.11s	remaining: 13.6s
343:	learn: 

488:	learn: 0.6276170	total: 11.2s	remaining: 11.7s
489:	learn: 0.6275517	total: 11.2s	remaining: 11.7s
490:	learn: 0.6274810	total: 11.2s	remaining: 11.7s
491:	learn: 0.6274066	total: 11.3s	remaining: 11.6s
492:	learn: 0.6273485	total: 11.3s	remaining: 11.6s
493:	learn: 0.6272912	total: 11.3s	remaining: 11.6s
494:	learn: 0.6272314	total: 11.3s	remaining: 11.6s
495:	learn: 0.6271553	total: 11.3s	remaining: 11.5s
496:	learn: 0.6270940	total: 11.4s	remaining: 11.5s
497:	learn: 0.6270346	total: 11.4s	remaining: 11.5s
498:	learn: 0.6269688	total: 11.4s	remaining: 11.5s
499:	learn: 0.6269226	total: 11.4s	remaining: 11.4s
500:	learn: 0.6268686	total: 11.5s	remaining: 11.4s
501:	learn: 0.6267931	total: 11.5s	remaining: 11.4s
502:	learn: 0.6267485	total: 11.5s	remaining: 11.4s
503:	learn: 0.6267087	total: 11.5s	remaining: 11.3s
504:	learn: 0.6266564	total: 11.6s	remaining: 11.3s
505:	learn: 0.6265871	total: 11.6s	remaining: 11.3s
506:	learn: 0.6265319	total: 11.6s	remaining: 11.3s
507:	learn: 

653:	learn: 0.6180882	total: 15s	remaining: 7.92s
654:	learn: 0.6180300	total: 15s	remaining: 7.89s
655:	learn: 0.6179611	total: 15s	remaining: 7.87s
656:	learn: 0.6178944	total: 15s	remaining: 7.85s
657:	learn: 0.6178217	total: 15.1s	remaining: 7.82s
658:	learn: 0.6177896	total: 15.1s	remaining: 7.8s
659:	learn: 0.6176946	total: 15.1s	remaining: 7.78s
660:	learn: 0.6176578	total: 15.1s	remaining: 7.76s
661:	learn: 0.6176232	total: 15.2s	remaining: 7.74s
662:	learn: 0.6175671	total: 15.2s	remaining: 7.72s
663:	learn: 0.6175117	total: 15.2s	remaining: 7.7s
664:	learn: 0.6174576	total: 15.2s	remaining: 7.68s
665:	learn: 0.6173784	total: 15.3s	remaining: 7.67s
666:	learn: 0.6173415	total: 15.3s	remaining: 7.65s
667:	learn: 0.6172675	total: 15.4s	remaining: 7.63s
668:	learn: 0.6172114	total: 15.4s	remaining: 7.61s
669:	learn: 0.6171281	total: 15.4s	remaining: 7.59s
670:	learn: 0.6170692	total: 15.4s	remaining: 7.57s
671:	learn: 0.6169954	total: 15.5s	remaining: 7.55s
672:	learn: 0.6169374	

815:	learn: 0.6094954	total: 19s	remaining: 4.29s
816:	learn: 0.6094604	total: 19s	remaining: 4.26s
817:	learn: 0.6094131	total: 19.1s	remaining: 4.24s
818:	learn: 0.6093655	total: 19.1s	remaining: 4.21s
819:	learn: 0.6093391	total: 19.1s	remaining: 4.19s
820:	learn: 0.6092832	total: 19.1s	remaining: 4.17s
821:	learn: 0.6092483	total: 19.1s	remaining: 4.14s
822:	learn: 0.6092248	total: 19.1s	remaining: 4.12s
823:	learn: 0.6091757	total: 19.2s	remaining: 4.09s
824:	learn: 0.6091179	total: 19.2s	remaining: 4.07s
825:	learn: 0.6090858	total: 19.2s	remaining: 4.04s
826:	learn: 0.6090380	total: 19.2s	remaining: 4.02s
827:	learn: 0.6089783	total: 19.2s	remaining: 4s
828:	learn: 0.6089254	total: 19.3s	remaining: 3.97s
829:	learn: 0.6088786	total: 19.3s	remaining: 3.95s
830:	learn: 0.6088077	total: 19.3s	remaining: 3.92s
831:	learn: 0.6087638	total: 19.3s	remaining: 3.9s
832:	learn: 0.6087227	total: 19.3s	remaining: 3.88s
833:	learn: 0.6086700	total: 19.4s	remaining: 3.85s
834:	learn: 0.608621

981:	learn: 0.6017775	total: 22.8s	remaining: 417ms
982:	learn: 0.6017427	total: 22.8s	remaining: 394ms
983:	learn: 0.6016843	total: 22.8s	remaining: 371ms
984:	learn: 0.6016451	total: 22.8s	remaining: 347ms
985:	learn: 0.6016070	total: 22.8s	remaining: 324ms
986:	learn: 0.6015668	total: 22.9s	remaining: 301ms
987:	learn: 0.6015114	total: 22.9s	remaining: 278ms
988:	learn: 0.6014643	total: 22.9s	remaining: 255ms
989:	learn: 0.6014005	total: 22.9s	remaining: 231ms
990:	learn: 0.6013482	total: 22.9s	remaining: 208ms
991:	learn: 0.6013009	total: 22.9s	remaining: 185ms
992:	learn: 0.6012364	total: 23s	remaining: 162ms
993:	learn: 0.6012177	total: 23s	remaining: 139ms
994:	learn: 0.6011734	total: 23s	remaining: 116ms
995:	learn: 0.6011408	total: 23s	remaining: 92.4ms
996:	learn: 0.6011015	total: 23s	remaining: 69.3ms
997:	learn: 0.6010487	total: 23.1s	remaining: 46.2ms
998:	learn: 0.6010052	total: 23.1s	remaining: 23.1ms
999:	learn: 0.6009477	total: 23.1s	remaining: 0us
임계값 0.3
[[ 1892 1396

In [21]:
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli

In [22]:
ngb = NGBClassifier(Dist=Bernoulli,random_state=2021)
ngb.fit(X_train,y_train)      
pred = ngb.predict(X_test)
pred_proba = ngb.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

[iter 0] loss=0.6925 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.6719 val_loss=0.0000 scale=2.0000 norm=3.9571
[iter 200] loss=0.6674 val_loss=0.0000 scale=2.0000 norm=3.9550
[iter 300] loss=0.6646 val_loss=0.0000 scale=1.0000 norm=1.9765
[iter 400] loss=0.6625 val_loss=0.0000 scale=1.0000 norm=1.9753
임계값 0.3
[[  859 14999]
 [  180 17091]]
정확도:0.541821, 정밀도:0.532596, 재현율:0.989578,F1 : 0.692490, ROC AUC : 0.638200


임계값 0.35
[[ 1494 14364]
 [  412 16859]]
정확도:0.553986, 정밀도:0.539955, 재현율:0.976145,F1 : 0.695303, ROC AUC : 0.638200


임계값 0.38
[[ 2191 13667]
 [  723 16548]]
정확도:0.565637, 정밀도:0.547675, 재현율:0.958138,F1 : 0.696963, ROC AUC : 0.638200


임계값 0.4
[[ 2942 12916]
 [ 1159 16112]]
정확도:0.575146, 정밀도:0.555050, 재현율:0.932893,F1 : 0.695998, ROC AUC : 0.638200


임계값 0.45
[[ 5396 10462]
 [ 2962 14309]]
정확도:0.594796, 정밀도:0.577651, 재현율:0.828499,F1 : 0.680700, ROC AUC : 0.638200


임계값 0.5
[[ 7877  7981]
 [ 5294 11977]]
정확도:0.599294, 정밀도:0.600110, 재현율:0.693475,F1 : 0.643423, ROC 

### Test 데이터 불러오기 

In [23]:
test = pd.read_csv("데이콘/test.csv")

In [24]:
drop_columns = ['id','person_rn','contents_rn','contents_open_dt']
test = test.drop(drop_columns,axis=1)

In [25]:
list1 = ['d_l_match_yn','d_m_match_yn','d_s_match_yn','h_l_match_yn','h_m_match_yn','h_s_match_yn']
for i in list1:
    test[i] = encoder.fit_transform(test[i])

### 최종 예측값 출력

In [26]:
final_pred = ngb.predict_proba(test)[:,1]
final_pred

array([0.38927834, 0.31945302, 0.53151043, ..., 0.70033679, 0.83110351,
       0.73521683])

In [27]:
threshold=0.4
final_pred = np.where(final_pred >= threshold , 1, 0)
final_pred

array([0, 0, 1, ..., 1, 1, 1])

In [28]:
sample_submission = pd.read_csv("데이콘/sample_submission.csv")
sample_submission['target'] = final_pred

In [29]:
sample_submission['target'].value_counts()

1    34223
0    12181
Name: target, dtype: int64

In [30]:
sample_submission.to_csv('sample_submission.csv', index=False, encoding='utf-8-sig')