In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
def get_clf_eval(y_test,pred=None,pred_proba=None):
    f1 = f1_score(y_test,pred)
    print('F1:',f1.round(3))

In [3]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test,pred_proba_c1,thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임계값',custom_threshold)
        get_clf_eval(y_test,custom_predict,pred_proba_c1)
        print('\n')

### Train 데이터 불러오기

In [4]:
df = pd.read_csv('데이콘/train.csv')
test = pd.read_csv('데이콘/test.csv')
d_code=pd.read_csv('데이콘/속성_D_코드.csv').set_index(keys='속성 D 코드')
h_code=pd.read_csv('데이콘/속성_H_코드.csv').set_index(keys='속성 H 코드')
l_code=pd.read_csv('데이콘/속성_L_코드.csv').set_index(keys='속성 L 코드')

In [5]:
df = df.drop(['id', 'contents_open_dt','person_rn', 'contents_rn','person_prefer_f','person_prefer_g'], axis=1) 
test = test.drop(['id', 'contents_open_dt','person_rn', 'contents_rn','person_prefer_f','person_prefer_g'], axis=1) 

In [6]:
def add_code(df,d_code=d_code,h_code=h_code,l_code=l_code): #n:세분류,s:소분류,m:중분류,l:대분류
    df=df.copy()
#     df 복사본 만들기

    
    
    df["person_prefer_d_1_n"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df["person_prefer_d_1_s"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df["person_prefer_d_1_m"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df["person_prefer_d_1_l"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    df["person_prefer_d_2_n"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df["person_prefer_d_2_s"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df["person_prefer_d_2_m"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df["person_prefer_d_2_l"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    df["person_prefer_d_3_n"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df["person_prefer_d_3_s"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df["person_prefer_d_3_m"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df["person_prefer_d_3_l"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    df['person_prefer_h_1_m']=df['person_prefer_h_1'].apply(lambda x: h_code.loc[x,'속성 H 중분류코드'])
    df['person_prefer_h_2_m']=df['person_prefer_h_2'].apply(lambda x: h_code.loc[x,'속성 H 중분류코드'])
    df['person_prefer_h_3_m']=df['person_prefer_h_3'].apply(lambda x: h_code.loc[x,'속성 H 중분류코드'])
   
    
    df['contents_attribute_l_n']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 세분류코드"])
    df['contents_attribute_l_s']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 소분류코드"])
    df['contents_attribute_l_m']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 중분류코드"])
    df['contents_attribute_l_l']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 대분류코드"])
    
    df['contents_attribute_d_n']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df['contents_attribute_d_s']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df['contents_attribute_d_m']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df['contents_attribute_d_l']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])

    df['contents_attribute_h_m']=df['contents_attribute_h'].apply(lambda x: h_code.loc[x,'속성 H 대분류코드'])

    
    return df
    
df = add_code(df, d_code, h_code, l_code)
test = add_code(test, d_code, h_code, l_code)

In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
list1 = ['d_l_match_yn','d_m_match_yn','d_s_match_yn','h_l_match_yn','h_m_match_yn','h_s_match_yn']
for i in list1:
    df[i] = encoder.fit_transform(df[i])

In [8]:
print('df shape',df.shape)
print('test shape',test.shape)

df shape (501951, 53)
test shape (46404, 52)


### Isolation Forest 이상탐지

In [9]:
from sklearn.ensemble import IsolationForest

In [10]:
clf=IsolationForest(contamination=0.78,max_samples=100,random_state=2021) # 0.42, 0.45 , 0.78 
clf.fit(df)

IsolationForest(contamination=0.78, max_samples=100, random_state=2021)

In [11]:
pred = clf.predict(df)
df['anomaly']=pred
outliers=df.loc[df['anomaly']==-1]
outlier_index=list(outliers.index)
print(df['anomaly'].value_counts()) # -1 --> 이상치, 1 --> 정상값

-1    391521
 1    110430
Name: anomaly, dtype: int64


In [12]:
train = df[df['anomaly']==1]
train.shape  

(110430, 54)

In [13]:
X = train.drop(['target','anomaly'],axis=1,inplace=False)
y = train['target']

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2021)

In [15]:
lgbm = LGBMClassifier(random_state=2021)
lgbm.fit(X_train,y_train)      
pred = lgbm.predict(X_test)
pred_proba = lgbm.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
F1: 0.72


임계값 0.35
F1: 0.721


임계값 0.38
F1: 0.722


임계값 0.4
F1: 0.722


임계값 0.45
F1: 0.717


임계값 0.5
F1: 0.701




In [16]:
lgbm = LGBMClassifier(random_state=2021,
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=-1,
    scale_pos_weight=1,
    seed=2019
)
lgbm.fit(X_train,y_train)      
pred = lgbm.predict(X_test)
pred_proba = lgbm.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
F1: 0.721


임계값 0.35
F1: 0.721


임계값 0.38
F1: 0.72


임계값 0.4
F1: 0.719


임계값 0.45
F1: 0.707


임계값 0.5
F1: 0.687




In [17]:
xgb = XGBClassifier(random_state=2021)
xgb.fit(X_train,y_train)
pred = xgb.predict(X_test)
pred_proba = xgb.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
F1: 0.72


임계값 0.35
F1: 0.722


임계값 0.38
F1: 0.721


임계값 0.4
F1: 0.718


임계값 0.45
F1: 0.709


임계값 0.5
F1: 0.684




In [18]:
xgb = XGBClassifier(random_state=2021,
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=2019
)
xgb.fit(X_train,y_train)
pred = xgb.predict(X_test)
pred_proba = xgb.predict_proba(X_test)[:,1]
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

임계값 0.3
F1: 0.723


임계값 0.35
F1: 0.722


임계값 0.38
F1: 0.72


임계값 0.4
F1: 0.719


임계값 0.45
F1: 0.708


임계값 0.5
F1: 0.687




In [19]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier(random_state=2021)
cat.fit(X_train,y_train)      
pred = cat.predict(X_test)
pred_proba = cat.predict_proba(X_test)[:,1] 
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred_proba.reshape(-1,1),thresholds)

Learning rate set to 0.065946
0:	learn: 0.6907487	total: 162ms	remaining: 2m 41s
1:	learn: 0.6885805	total: 199ms	remaining: 1m 39s
2:	learn: 0.6866368	total: 236ms	remaining: 1m 18s
3:	learn: 0.6847686	total: 270ms	remaining: 1m 7s
4:	learn: 0.6832733	total: 302ms	remaining: 1m
5:	learn: 0.6818119	total: 336ms	remaining: 55.7s
6:	learn: 0.6803962	total: 376ms	remaining: 53.4s
7:	learn: 0.6791537	total: 441ms	remaining: 54.7s
8:	learn: 0.6780775	total: 526ms	remaining: 57.9s
9:	learn: 0.6772299	total: 576ms	remaining: 57s
10:	learn: 0.6762716	total: 625ms	remaining: 56.2s
11:	learn: 0.6754542	total: 672ms	remaining: 55.3s
12:	learn: 0.6747393	total: 723ms	remaining: 54.9s
13:	learn: 0.6740336	total: 763ms	remaining: 53.8s
14:	learn: 0.6734765	total: 804ms	remaining: 52.8s
15:	learn: 0.6728884	total: 854ms	remaining: 52.5s
16:	learn: 0.6723210	total: 905ms	remaining: 52.3s
17:	learn: 0.6718298	total: 954ms	remaining: 52.1s
18:	learn: 0.6714151	total: 1.01s	remaining: 52.1s
19:	learn: 0.

162:	learn: 0.6517239	total: 8.32s	remaining: 42.7s
163:	learn: 0.6516483	total: 8.35s	remaining: 42.6s
164:	learn: 0.6515547	total: 8.4s	remaining: 42.5s
165:	learn: 0.6514832	total: 8.45s	remaining: 42.5s
166:	learn: 0.6513974	total: 8.49s	remaining: 42.4s
167:	learn: 0.6513107	total: 8.54s	remaining: 42.3s
168:	learn: 0.6512146	total: 8.59s	remaining: 42.2s
169:	learn: 0.6511346	total: 8.63s	remaining: 42.1s
170:	learn: 0.6510603	total: 8.67s	remaining: 42s
171:	learn: 0.6509682	total: 8.7s	remaining: 41.9s
172:	learn: 0.6509001	total: 8.75s	remaining: 41.8s
173:	learn: 0.6508202	total: 8.79s	remaining: 41.7s
174:	learn: 0.6507416	total: 8.84s	remaining: 41.7s
175:	learn: 0.6506752	total: 8.87s	remaining: 41.5s
176:	learn: 0.6506237	total: 8.9s	remaining: 41.4s
177:	learn: 0.6505384	total: 8.95s	remaining: 41.3s
178:	learn: 0.6504693	total: 8.99s	remaining: 41.2s
179:	learn: 0.6503804	total: 9.04s	remaining: 41.2s
180:	learn: 0.6503054	total: 9.08s	remaining: 41.1s
181:	learn: 0.650

321:	learn: 0.6384091	total: 15.3s	remaining: 32.2s
322:	learn: 0.6383473	total: 15.3s	remaining: 32.1s
323:	learn: 0.6382508	total: 15.4s	remaining: 32.1s
324:	learn: 0.6382218	total: 15.4s	remaining: 32s
325:	learn: 0.6381593	total: 15.5s	remaining: 32s
326:	learn: 0.6381052	total: 15.5s	remaining: 31.9s
327:	learn: 0.6380358	total: 15.5s	remaining: 31.9s
328:	learn: 0.6379489	total: 15.6s	remaining: 31.8s
329:	learn: 0.6378705	total: 15.6s	remaining: 31.8s
330:	learn: 0.6378135	total: 15.7s	remaining: 31.7s
331:	learn: 0.6377462	total: 15.7s	remaining: 31.6s
332:	learn: 0.6376707	total: 15.7s	remaining: 31.5s
333:	learn: 0.6376515	total: 15.8s	remaining: 31.5s
334:	learn: 0.6375694	total: 15.8s	remaining: 31.4s
335:	learn: 0.6374793	total: 15.9s	remaining: 31.4s
336:	learn: 0.6374075	total: 15.9s	remaining: 31.3s
337:	learn: 0.6373366	total: 16s	remaining: 31.2s
338:	learn: 0.6372764	total: 16s	remaining: 31.2s
339:	learn: 0.6371947	total: 16s	remaining: 31.1s
340:	learn: 0.6371207	

480:	learn: 0.6282606	total: 22.2s	remaining: 24s
481:	learn: 0.6281807	total: 22.2s	remaining: 23.9s
482:	learn: 0.6281228	total: 22.3s	remaining: 23.8s
483:	learn: 0.6280524	total: 22.3s	remaining: 23.8s
484:	learn: 0.6280120	total: 22.4s	remaining: 23.7s
485:	learn: 0.6279506	total: 22.4s	remaining: 23.7s
486:	learn: 0.6279018	total: 22.4s	remaining: 23.6s
487:	learn: 0.6278277	total: 22.5s	remaining: 23.6s
488:	learn: 0.6277920	total: 22.5s	remaining: 23.5s
489:	learn: 0.6277236	total: 22.6s	remaining: 23.5s
490:	learn: 0.6276599	total: 22.6s	remaining: 23.4s
491:	learn: 0.6275906	total: 22.6s	remaining: 23.4s
492:	learn: 0.6275158	total: 22.7s	remaining: 23.3s
493:	learn: 0.6274650	total: 22.7s	remaining: 23.3s
494:	learn: 0.6274089	total: 22.8s	remaining: 23.2s
495:	learn: 0.6273324	total: 22.8s	remaining: 23.2s
496:	learn: 0.6272946	total: 22.9s	remaining: 23.2s
497:	learn: 0.6272469	total: 22.9s	remaining: 23.1s
498:	learn: 0.6271749	total: 23s	remaining: 23.1s
499:	learn: 0.62

639:	learn: 0.6193942	total: 29.3s	remaining: 16.5s
640:	learn: 0.6193249	total: 29.3s	remaining: 16.4s
641:	learn: 0.6192385	total: 29.4s	remaining: 16.4s
642:	learn: 0.6191895	total: 29.4s	remaining: 16.3s
643:	learn: 0.6191401	total: 29.4s	remaining: 16.3s
644:	learn: 0.6190794	total: 29.5s	remaining: 16.2s
645:	learn: 0.6190352	total: 29.5s	remaining: 16.2s
646:	learn: 0.6189880	total: 29.6s	remaining: 16.1s
647:	learn: 0.6189444	total: 29.6s	remaining: 16.1s
648:	learn: 0.6188716	total: 29.7s	remaining: 16s
649:	learn: 0.6188143	total: 29.7s	remaining: 16s
650:	learn: 0.6187387	total: 29.8s	remaining: 16s
651:	learn: 0.6187019	total: 29.8s	remaining: 15.9s
652:	learn: 0.6186820	total: 29.9s	remaining: 15.9s
653:	learn: 0.6186607	total: 29.9s	remaining: 15.8s
654:	learn: 0.6186173	total: 29.9s	remaining: 15.8s
655:	learn: 0.6185724	total: 30s	remaining: 15.7s
656:	learn: 0.6185107	total: 30s	remaining: 15.7s
657:	learn: 0.6184755	total: 30s	remaining: 15.6s
658:	learn: 0.6184325	to

799:	learn: 0.6110448	total: 36.5s	remaining: 9.14s
800:	learn: 0.6109927	total: 36.6s	remaining: 9.09s
801:	learn: 0.6109599	total: 36.6s	remaining: 9.04s
802:	learn: 0.6109229	total: 36.7s	remaining: 8.99s
803:	learn: 0.6108749	total: 36.7s	remaining: 8.95s
804:	learn: 0.6108375	total: 36.7s	remaining: 8.9s
805:	learn: 0.6107981	total: 36.8s	remaining: 8.85s
806:	learn: 0.6107492	total: 36.8s	remaining: 8.8s
807:	learn: 0.6107153	total: 36.8s	remaining: 8.76s
808:	learn: 0.6106479	total: 36.9s	remaining: 8.71s
809:	learn: 0.6105915	total: 36.9s	remaining: 8.66s
810:	learn: 0.6105386	total: 36.9s	remaining: 8.61s
811:	learn: 0.6104880	total: 37s	remaining: 8.56s
812:	learn: 0.6104267	total: 37s	remaining: 8.52s
813:	learn: 0.6103689	total: 37.1s	remaining: 8.47s
814:	learn: 0.6103198	total: 37.1s	remaining: 8.42s
815:	learn: 0.6102574	total: 37.1s	remaining: 8.37s
816:	learn: 0.6102120	total: 37.2s	remaining: 8.33s
817:	learn: 0.6101646	total: 37.2s	remaining: 8.28s
818:	learn: 0.6101

960:	learn: 0.6034286	total: 43.6s	remaining: 1.77s
961:	learn: 0.6033764	total: 43.6s	remaining: 1.72s
962:	learn: 0.6033529	total: 43.7s	remaining: 1.68s
963:	learn: 0.6033210	total: 43.7s	remaining: 1.63s
964:	learn: 0.6032738	total: 43.7s	remaining: 1.59s
965:	learn: 0.6032193	total: 43.8s	remaining: 1.54s
966:	learn: 0.6031858	total: 43.8s	remaining: 1.5s
967:	learn: 0.6031550	total: 43.8s	remaining: 1.45s
968:	learn: 0.6030918	total: 43.9s	remaining: 1.4s
969:	learn: 0.6030378	total: 43.9s	remaining: 1.36s
970:	learn: 0.6030265	total: 44s	remaining: 1.31s
971:	learn: 0.6029965	total: 44s	remaining: 1.27s
972:	learn: 0.6029672	total: 44.1s	remaining: 1.22s
973:	learn: 0.6029257	total: 44.1s	remaining: 1.18s
974:	learn: 0.6028871	total: 44.1s	remaining: 1.13s
975:	learn: 0.6028473	total: 44.2s	remaining: 1.09s
976:	learn: 0.6028047	total: 44.3s	remaining: 1.04s
977:	learn: 0.6027701	total: 44.3s	remaining: 997ms
978:	learn: 0.6026973	total: 44.4s	remaining: 952ms
979:	learn: 0.6026

### Test 데이터 조정 

In [20]:
for i in list1:
    test[i] = encoder.fit_transform(test[i])

### 최종 예측값 출력

In [38]:
final_pred = xgb.predict_proba(test)[:,1]
final_pred

array([0.4078637 , 0.06542364, 0.44975764, ..., 0.5329437 , 0.59279513,
       0.41029385], dtype=float32)

In [39]:
threshold=0.3
final_pred = np.where(final_pred >= threshold , 1, 0)
final_pred

array([1, 0, 1, ..., 1, 1, 1])

In [40]:
sample_submission = pd.read_csv("데이콘/sample_submission.csv")
sample_submission['target'] = final_pred

In [41]:
sample_submission['target'].value_counts()

1    35083
0    11321
Name: target, dtype: int64

In [25]:
sample_submission.to_csv('sample_submission.csv', index=False, encoding='utf-8-sig')