In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
def get_clf_eval(y_test,pred=None,pred_proba=None):
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    f1 = f1_score(y_test,pred)
    roc_auc = roc_auc_score(y_test,pred_proba)
    print('정확도:{0:4f}, 정밀도:{1:4f}, 재현율:{2:4f},F1 : {3:4f}, ROC AUC : {4:4f}'.format(accuracy,precision,recall,f1,roc_auc))

In [3]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test,pred_proba_c1,thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임계값',custom_threshold)
        get_clf_eval(y_test,custom_predict,pred_proba_c1)
        print('\n')

### Train 데이터 불러오기

In [4]:
df = pd.read_csv('데이콘/train.csv')
test = pd.read_csv('데이콘/test.csv')
d_code=pd.read_csv('데이콘/속성_D_코드.csv').set_index(keys='속성 D 코드')
h_code=pd.read_csv('데이콘/속성_H_코드.csv').set_index(keys='속성 H 코드')
l_code=pd.read_csv('데이콘/속성_L_코드.csv').set_index(keys='속성 L 코드')

In [5]:
df = df.drop(['id', 'contents_open_dt','person_rn', 'contents_rn','person_prefer_f','person_prefer_g'], axis=1) 
test = test.drop(['id', 'contents_open_dt','person_rn', 'contents_rn','person_prefer_f','person_prefer_g'], axis=1) 

In [6]:
def add_code(df,d_code=d_code,h_code=h_code,l_code=l_code): #n:세분류,s:소분류,m:중분류,l:대분류
    df=df.copy()
#     df 복사본 만들기

    
    
    df["person_prefer_d_1_n"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df["person_prefer_d_1_s"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df["person_prefer_d_1_m"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df["person_prefer_d_1_l"]=df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    df["person_prefer_d_2_n"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df["person_prefer_d_2_s"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df["person_prefer_d_2_m"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df["person_prefer_d_2_l"]=df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    df["person_prefer_d_3_n"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df["person_prefer_d_3_s"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df["person_prefer_d_3_m"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df["person_prefer_d_3_l"]=df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    df['person_prefer_h_1_m']=df['person_prefer_h_1'].apply(lambda x: h_code.loc[x,'속성 H 중분류코드'])
    df['person_prefer_h_2_m']=df['person_prefer_h_2'].apply(lambda x: h_code.loc[x,'속성 H 중분류코드'])
    df['person_prefer_h_3_m']=df['person_prefer_h_3'].apply(lambda x: h_code.loc[x,'속성 H 중분류코드'])
   
    
    df['contents_attribute_l_n']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 세분류코드"])
    df['contents_attribute_l_s']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 소분류코드"])
    df['contents_attribute_l_m']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 중분류코드"])
    df['contents_attribute_l_l']=df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 대분류코드"])
    
    df['contents_attribute_d_n']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    df['contents_attribute_d_s']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    df['contents_attribute_d_m']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    df['contents_attribute_d_l']=df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])

    df['contents_attribute_h_m']=df['contents_attribute_h'].apply(lambda x: h_code.loc[x,'속성 H 대분류코드'])

    
    return df
    
df = add_code(df, d_code, h_code, l_code)
test = add_code(test, d_code, h_code, l_code)

### 전처리 

#### 1. label

In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
list1 = ['d_l_match_yn','d_m_match_yn','d_s_match_yn','h_l_match_yn','h_m_match_yn','h_s_match_yn']
for i in list1:
    df[i] = encoder.fit_transform(df[i])

In [8]:
df.shape

(501951, 53)

###  Isolation Forest 이상탐지 

In [9]:
from sklearn.ensemble import IsolationForest

In [10]:
clf=IsolationForest(contamination=0.8,max_samples=100,random_state=2021) # 0.42 , 0.45, 0.78 성공
clf.fit(df)

IsolationForest(contamination=0.8, max_samples=100, random_state=2021)

In [11]:
pred = clf.predict(df)
df['anomaly']=pred
outliers=df.loc[df['anomaly']==-1]
outlier_index=list(outliers.index)
print(df['anomaly'].value_counts()) # -1 --> 이상치, 1 --> 정상값

-1    401560
 1    100391
Name: anomaly, dtype: int64


In [12]:
df = df[df['anomaly']==1]
df.shape 

(100391, 54)

In [13]:
df = df.reset_index(drop=True)
df.head() # concat 전에 인덱스 초기화 필수 

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,contents_attribute_l_n,contents_attribute_l_s,contents_attribute_l_m,contents_attribute_l_l,contents_attribute_d_n,contents_attribute_d_s,contents_attribute_d_m,contents_attribute_d_l,contents_attribute_h_m,anomaly
0,0,0,0,1,1,0,1,3,4,1,...,1607,1606,1605,2016,275,274,274,216,94,1
1,0,0,0,1,0,0,1,1,2,5,...,1607,1606,1605,2016,275,274,274,216,71,1
2,1,1,0,1,0,0,1,1,2,5,...,529,528,527,2006,1027,1026,1000,926,94,1
3,1,0,0,1,0,0,1,5,3,1,...,529,528,527,2006,92,91,56,1,94,1
4,1,0,0,1,0,0,1,1,2,1,...,1460,1459,1458,2013,97,91,56,1,3,1


In [14]:
X = df.drop(['target','anomaly'],axis=1,inplace=False)
y = df['target']

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2021)

In [16]:
X_train.shape

(70273, 52)

### DNN 

In [1]:
import tensorflow 
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.models import Sequential

In [19]:
model=models.Sequential()
model.add(layers.Dense(128, activation='relu',input_dim=X_train.shape[1]))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(layers.Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(layers.Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(layers.Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(layers.Dense(8, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(layers.Dense(1,activation='sigmoid'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               6784      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 128)               512       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2

In [20]:
from tensorflow.keras import backend as K

In [21]:
def recall(y_target, y_pred):
    y_target_yn = K.round(K.clip(y_target, 0, 1)) 
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) 
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 
    count_true_positive_false_negative = K.sum(y_target_yn)
    recall = count_true_positive / (count_true_positive_false_negative + K.epsilon())
    return recall

In [22]:
def precision(y_target, y_pred):
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) 
    y_target_yn = K.round(K.clip(y_target, 0, 1)) 
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 
    count_true_positive_false_positive = K.sum(y_pred_yn)
    precision = count_true_positive / (count_true_positive_false_positive + K.epsilon())
    return precision

In [23]:
def f1score(y_target, y_pred):
    _recall = recall(y_target, y_pred)
    _precision = precision(y_target, y_pred)
    _f1score = ( 2 * _recall * _precision) / (_recall + _precision+ K.epsilon())
    return _f1score

In [24]:
model.compile(tensorflow.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy',f1score])

In [25]:
from tensorflow.python.keras.callbacks import EarlyStopping

In [26]:
early_stopping = EarlyStopping(patience=20)

In [27]:
history=model.fit(X_train,y_train,epochs=100,batch_size=200,validation_split=0.3,callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [28]:
pred = model.predict(X_test)
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred,thresholds)

임계값 0.3
정확도:0.564081, 정밀도:0.560683, 재현율:0.990246,F1 : 0.715976, ROC AUC : 0.575922


임계값 0.35
정확도:0.572780, 정밀도:0.566533, 재현율:0.979355,F1 : 0.717823, ROC AUC : 0.575922


임계값 0.38
정확도:0.575968, 정밀도:0.568910, 재현율:0.973251,F1 : 0.718073, ROC AUC : 0.575922


임계값 0.4
정확도:0.577462, 정밀도:0.570102, 재현율:0.969661,F1 : 0.718040, ROC AUC : 0.575922


임계값 0.45
정확도:0.579421, 정밀도:0.571921, 재현율:0.962181,F1 : 0.717412, ROC AUC : 0.575922


임계값 0.5
정확도:0.582210, 정밀도:0.576774, 재현율:0.927892,F1 : 0.711366, ROC AUC : 0.575922




### 1D CNN

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Conv1D,MaxPooling1D,BatchNormalization,Dropout

In [31]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

In [32]:
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [33]:
X_train=X_train.reshape(X_train.shape[0],X_train.shape[1],1)
X_test=X_test.reshape(X_test.shape[0],X_test.shape[1],1)
X_train.shape, X_test.shape

((70273, 52, 1), (30118, 52, 1))

In [34]:
input_shape=X_train[0].shape
input_shape

(52, 1)

In [35]:
cnn=Sequential()
# Conv + Pooling Layer1
cnn.add(Conv1D(filters=128, kernel_size=2, activation='relu', input_shape=X_train[0].shape))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(BatchNormalization())
cnn.add(Dropout(0.2))

# Conv + Pooling Layer2
cnn.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(BatchNormalization())
cnn.add(Dropout(0.2))

# Conv + Pooling Layer3
cnn.add(Conv1D(filters=32, kernel_size=2, activation='relu'))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(BatchNormalization())
cnn.add(Dropout(0.2))

# Fully Connected Layer 
cnn.add(Flatten())
cnn.add(Dense(256, activation='relu'))
cnn.add(Dropout(0.2))
cnn.add(Dense(128, activation='relu'))
cnn.add(Dropout(0.2))
cnn.add(Dense(64, activation='relu'))
cnn.add(Dropout(0.2))
cnn.add(Dense(1, activation='relu'))
cnn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 51, 128)           384       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 25, 128)           0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 25, 128)           0         
_________________________________________________________________
batch_normalization_5 (Batch (None, 25, 128)           512       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 24, 64)            16448     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 64)            0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 12, 64)           

In [36]:
cnn.compile(tensorflow.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy',f1score])

In [37]:
history=cnn.fit(X_train,y_train,epochs=100,batch_size=200,validation_split=0.3,callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [38]:
pred = cnn.predict(X_test)
thresholds = [0.3,0.35,0.38,0.4,0.45,0.5]
get_eval_by_threshold(y_test,pred,thresholds)

임계값 0.3
정확도:0.554851, 정밀도:0.554851, 재현율:1.000000,F1 : 0.713703, ROC AUC : 0.585114


임계값 0.35
정확도:0.554851, 정밀도:0.554851, 재현율:1.000000,F1 : 0.713703, ROC AUC : 0.585114


임계값 0.38
정확도:0.557507, 정밀도:0.556408, 재현율:0.998743,F1 : 0.714668, ROC AUC : 0.585114


임계값 0.4
정확도:0.562222, 정밀도:0.559420, 재현율:0.993238,F1 : 0.715724, ROC AUC : 0.585114


임계값 0.45
정확도:0.579421, 정밀도:0.571819, 재현율:0.963377,F1 : 0.717664, ROC AUC : 0.585114


임계값 0.5
정확도:0.580019, 정밀도:0.573268, 재현율:0.950931,F1 : 0.715311, ROC AUC : 0.585114




### Test 데이터 불러오기 

In [39]:
for i in list1:
    test[i] = encoder.fit_transform(test[i])

### 최종 예측값 출력 

In [40]:
test = test.to_numpy()

In [41]:
test =test.reshape(test.shape[0],test.shape[1],1)
test.shape

(46404, 52, 1)

In [51]:
final_pred = cnn.predict(test)

In [52]:
threshold=0.4
final_pred = np.where(final_pred >= threshold , 1, 0)

In [53]:
sample_submission = pd.read_csv("데이콘/sample_submission.csv")
sample_submission['target'] = final_pred

In [54]:
sample_submission['target'].value_counts()

1    44415
0     1989
Name: target, dtype: int64

In [46]:
sample_submission.to_csv('cnn_submission.csv', index=False, encoding='utf-8-sig')