In [None]:
# Please run this notebook on Google Colab with the runtime type set to GPU.
# Set the data path inside the preprocess_data function.
# Set model_path inside the save_pred function.

# Mount Google Drive so files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.utils.data as data_utils
import os

In [None]:
def seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and switches cuDNN to deterministic
    mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # The standard-library generator must be seeded as well; otherwise any
    # code that draws from ``random`` stays non-reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only
    # affects Python processes started after this call, not the current one.
    os.environ["PYTHONHASHSEED"] = str(seed)

def seed_worker(_worker_id):
    """Seed NumPy and ``random`` inside a DataLoader worker process.

    Intended to be passed as ``worker_init_fn``. The seed is derived from
    ``torch.initial_seed()`` and reduced to 32 bits because NumPy's legacy
    seeding only accepts values below 2**32.

    Args:
        _worker_id: Worker index supplied by the DataLoader (unused).
    """
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    # Seed the standard-library generator too, so both Python-level RNGs are
    # reproducible inside the worker.
    random.seed(worker_seed)

def _sum_server_triples(frame, value_cols, prefix, n_groups):
    """Collapse consecutive column triples of ``frame`` into summed columns.

    Returns a new frame that keeps the columns of ``frame`` not listed in
    ``value_cols`` and appends ``<prefix>_svr1`` .. ``<prefix>_svr<n_groups>``,
    where the j-th new column is the sum of the j-th group of three columns
    in ``value_cols``. ``frame`` itself is left untouched.
    """
    out = frame.drop(columns=value_cols)
    for j in range(n_groups):
        first, second, third = value_cols[3 * j:3 * j + 3]
        out['{}_svr{}'.format(prefix, j + 1)] = frame[first] + frame[second] + frame[third]
    return out


def preprocess_data(path='/content/drive/MyDrive/kt 이상탐지/data/Media', window_size=12):
    """Load the raw CSV files and build sliding-window train/test arrays.

    Steps:
      1. Read every regular file in ``path`` as CSV.
      2. Parse the ``Timestamp`` column and split each file right after the
         row stamped 2017-12-31 23:55:00 (rows up to it -> train, rest -> test).
      3. Aggregate columns: a 13-column file becomes ``menu_svr*`` sums, a
         16-column file becomes ``login_svr*`` sums, the file whose first data
         column is ``INFO-01-Request`` becomes ``info_svr1``, and the file whose
         first data column is ``STREAM-01-Session`` is kept as is.
      4. Min-max normalise with statistics from the train split only, smooth
         with an exponentially weighted mean (alpha=0.9) and cut windows.

    Args:
        path: Directory holding the CSV files. Check this path carefully.
        window_size: Number of consecutive rows per window.

    Returns:
        ``(windows_train, windows_test)`` NumPy arrays shaped
        ``(n_windows, window_size, n_features)``. The test array has one window
        per test row; windows running past the end repeat the last row.

    Raises:
        ValueError: If one of the four expected datasets is not found.
        IndexError: If a file has no row stamped 2017-12-31 23:55:00.
    """
    # Skip directories (e.g. ".ipynb_checkpoints") that would crash read_csv,
    # and sort so the load order does not depend on the file system.
    file_names = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )
    frames = [pd.read_csv(os.path.join(path, name)) for name in file_names]

    # Train/test split used by the example code; other splits are possible.
    train_set = []
    test_set = []
    for data in frames:
        raw = data['Timestamp']
        # Rebuild "YYYY-MM-DD HH:MM:00" from fixed character positions; the
        # character at position 8 (the separator) is deliberately skipped.
        data['Timestamp'] = pd.to_datetime(
            raw.str[:4] + '-' + raw.str[4:6] + '-' + raw.str[6:8] + ' '
            + raw.str[9:11] + ':' + raw.str[11:13] + ':' + '00'
        )
        end_of_year = data.index[data['Timestamp'] == pd.Timestamp('2017-12-31 23:55:00')].tolist()[0]
        # Copy the slices: they are modified below, and mutating a slice of
        # ``data`` is what triggered pandas' SettingWithCopyWarning.
        train_set.append(data.iloc[:end_of_year + 1].copy())   # 2017-01-01 .. 2017-12-31
        test_set.append(data.iloc[end_of_year + 1:].copy())    # 2018-01-01 .. 2018-12-31

    # Identify each dataset by its structure and aggregate its columns.
    parts = {}
    for train, test in zip(train_set, test_set):
        cn = len(train.columns)
        cl = train.columns[1:]  # every column except Timestamp
        if cn == 13:
            train = _sum_server_triples(train, cl, 'menu', cn // 3)
            test = _sum_server_triples(test, cl, 'menu', cn // 3)
            parts['menu'] = (train, test)
        if cn == 16:
            train = _sum_server_triples(train, cl, 'login', cn // 3)
            test = _sum_server_triples(test, cl, 'login', cn // 3)
            parts['login'] = (train, test)
        if cl[0] == 'INFO-01-Request':
            train = _sum_server_triples(train, cl, 'info', 1)
            test = _sum_server_triples(test, cl, 'info', 1)
            parts['info'] = (train, test)
        if cl[0] == 'STREAM-01-Session':
            parts['stream'] = (train, test)

    order = ('menu', 'login', 'info', 'stream')
    missing = [name for name in order if name not in parts]
    if missing:
        raise ValueError(
            'Could not find the following dataset(s) in {!r}: {}'.format(path, ', '.join(missing))
        )

    # Drop the Timestamp column of every part and join the features side by side.
    preprocessed_train_set = pd.concat(
        [parts[name][0].iloc[:, 1:] for name in order], axis=1).fillna(0)
    # Fill gaps in the test features exactly as for the train features so both
    # splits go through the same preprocessing.
    preprocessed_test_set = pd.concat(
        [parts[name][1].iloc[:, 1:] for name in order], axis=1).fillna(0)

    valid_columns = preprocessed_train_set.columns

    # Normalisation statistics come from the train split only.
    tag_min = preprocessed_train_set[valid_columns].min()
    tag_max = preprocessed_train_set[valid_columns].max()

    def normalize(df):
        """Min-max scale ``df``; constant columns are only shifted by their minimum."""
        span = tag_max - tag_min
        return (df - tag_min) / span.where(span != 0, 1)

    def boundary_check(df):
        """Return flags: (any value > 1, any value < 0, any NaN)."""
        x = np.array(df, dtype=np.float32)
        return np.any(x > 1.0), np.any(x < 0), np.any(np.isnan(x))

    TRAIN_DF = normalize(preprocessed_train_set[valid_columns]).ewm(alpha=0.9).mean()
    TEST_DF = normalize(preprocessed_test_set[valid_columns]).ewm(alpha=0.9).mean()

    print(boundary_check(TRAIN_DF))
    print(boundary_check(TEST_DF))

    offsets = np.arange(window_size)[None, :]

    n_train = TRAIN_DF.shape[0]
    windows_train = TRAIN_DF.values[offsets + np.arange(n_train - window_size)[:, None]]

    # One window per test row; indices past the end are clamped to the last
    # row so the number of windows equals the number of test rows.
    n_test = TEST_DF.shape[0]
    test_index = np.minimum(offsets + np.arange(n_test)[:, None], n_test - 1)
    windows_test = TEST_DF.values[test_index]

    return windows_train, windows_test

#--------------------------------------------------------------------------------------------------------------------------
class Encoder(nn.Module):
    """Three-layer fully connected encoder with ReLU after every layer.

    Layer widths shrink from ``in_size`` to ``in_size/2`` to ``in_size/4``
    (both truncated to int) and finally to ``latent_size``.
    """

    def __init__(self, in_size, latent_size):
        super().__init__()
        half_width = int(in_size / 2)
        quarter_width = int(in_size / 4)
        self.linear1 = nn.Linear(in_size, half_width)
        self.linear2 = nn.Linear(half_width, quarter_width)
        self.linear3 = nn.Linear(quarter_width, latent_size)
        self.relu = nn.ReLU(True)

    def forward(self, w):
        """Map a flattened window ``w`` to its latent code."""
        hidden = w
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = self.relu(layer(hidden))
        return hidden
    
class Decoder(nn.Module):
    """Three-layer fully connected decoder ending in a sigmoid.

    Layer widths grow from ``latent_size`` to ``out_size/4`` to ``out_size/2``
    (both truncated to int) and finally to ``out_size``. ReLU follows the first
    two layers; the sigmoid keeps every output value inside (0, 1).
    """

    def __init__(self, latent_size, out_size):
        super().__init__()
        quarter_width = int(out_size / 4)
        half_width = int(out_size / 2)
        self.linear1 = nn.Linear(latent_size, quarter_width)
        self.linear2 = nn.Linear(quarter_width, half_width)
        self.linear3 = nn.Linear(half_width, out_size)
        self.relu = nn.ReLU(True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, z):
        """Reconstruct a flattened window from the latent code ``z``."""
        hidden = self.relu(self.linear1(z))
        hidden = self.relu(self.linear2(hidden))
        return self.sigmoid(self.linear3(hidden))
    
class UsadModel(nn.Module):
    """Autoencoder pair sharing one encoder and using two decoders.

    ``decoder1`` and ``decoder2`` both reconstruct the input from the shared
    latent code; ``decoder2`` additionally reconstructs from the re-encoded
    output of ``decoder1``. The two losses weight these reconstruction errors
    by ``1/n`` and ``1 - 1/n``.
    """

    def __init__(self, w_size, z_size):
        super().__init__()
        self.encoder = Encoder(w_size, z_size)
        self.decoder1 = Decoder(z_size, w_size)
        self.decoder2 = Decoder(z_size, w_size)

    def _usad_losses(self, batch, n):
        """Compute ``(loss1, loss2)`` for ``batch`` with weighting index ``n``."""
        latent = self.encoder(batch)
        recon1 = self.decoder1(latent)
        recon2 = self.decoder2(latent)
        # decoder2 applied to the re-encoded output of decoder1.
        recon_chain = self.decoder2(self.encoder(recon1))
        err1 = torch.mean((batch - recon1) ** 2)
        err2 = torch.mean((batch - recon2) ** 2)
        err_chain = torch.mean((batch - recon_chain) ** 2)
        # The chained error is added to loss1 and subtracted from loss2.
        loss1 = 1 / n * err1 + (1 - 1 / n) * err_chain
        loss2 = 1 / n * err2 - (1 - 1 / n) * err_chain
        return loss1, loss2

    def training_step(self, batch, n):
        """Return the two training losses as a ``(loss1, loss2)`` tuple."""
        return self._usad_losses(batch, n)

    def validation_step(self, batch, n):
        """Return the two losses as a dict with keys ``val_loss1``/``val_loss2``."""
        first_loss, second_loss = self._usad_losses(batch, n)
        return {'val_loss1': first_loss, 'val_loss2': second_loss}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses into plain Python floats."""
        averaged = {}
        for key in ('val_loss1', 'val_loss2'):
            per_batch = [entry[key] for entry in outputs]
            averaged[key] = torch.stack(per_batch).mean().item()
        return averaged

    def epoch_end(self, epoch, result):
        """Print the validation losses of one epoch."""
        message = "Epoch [{}], val_loss1: {:.4f}, val_loss2: {:.4f}".format(
            epoch, result['val_loss1'], result['val_loss2'])
        print(message)
    
def testing(model, test_loader, alpha=.5, beta=.5):
    results=[]
    for [batch] in test_loader:
        batch=to_device(batch,device)
        w1=model.decoder1(model.encoder(batch))
        w2=model.decoder2(model.encoder(w1))
        results.append(alpha*torch.mean((batch-w1)**2,axis=1)+beta*torch.mean((batch-w2)**2,axis=1))
    return results

def to_device(data, device):
    """Move ``data`` to ``device``, recursing into lists and tuples.

    A list or tuple comes back as a list whose items were moved one by one;
    anything else is moved with ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

#--------------------------------------------------------------------------------------------------------------------------

def save_pred(test_data, model_path='/content/drive/MyDrive/kt 이상탐지/model/usad_model.pt',
              output_path='Media_answer.csv'):
    """Score the test windows with a saved USAD model and write the predictions.

    Loads the model weights from ``model_path``, computes one anomaly score per
    test window, flags scores above the 0.9985 quantile of all scores as
    anomalies (1) and writes a single ``Prediction`` column to ``output_path``.

    Reads the module-level globals ``hidden_size`` and ``BATCH_SIZE``.

    Args:
        test_data: Array shaped ``(n_windows, window_size, n_features)``.
        model_path: Path of the saved model ``state_dict``.
        output_path: Destination CSV file for the predictions.
    """
    w_size = test_data.shape[1] * test_data.shape[2]
    z_size = test_data.shape[1] * hidden_size

    # Use the GPU when present but stay runnable on CPU-only runtimes.
    run_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = UsadModel(w_size, z_size).to(run_device)
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    # map_location lets a checkpoint saved on GPU be restored on any device.
    model.load_state_dict(torch.load(model_path, map_location=run_device))
    model.eval()

    # Flatten every (window_size, n_features) window into one vector.
    flat_windows = torch.from_numpy(test_data).float().view(test_data.shape[0], w_size)
    test_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(flat_windows),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=0, worker_init_fn=seed_worker)

    results = testing(model, test_loader)

    # Concatenate the per-batch score vectors; unlike stacking all but the
    # last batch, this also works when the loader yields a single batch.
    anomalyscore = torch.cat(results).detach().cpu().numpy()

    THRESHOLD = pd.Series(anomalyscore).quantile(0.9985)
    print(THRESHOLD)

    pred = (anomalyscore > THRESHOLD).astype(int)

    # Format the predictions for submission.
    answer = pd.DataFrame(pred, columns=['Prediction'])
    print(f'예측 결과. \n{answer}\n')  # TODO: confirm the row count is 105120 before submitting
    print(answer['Prediction'].value_counts())
    answer.to_csv(output_path, index=False)  # saved for submission


# TODO: 제출 파일은 2018년 1월 1일 00시 00분-05분 부터 2018년 12월 31일 23시 55분-00분 구간의 이상 이벤트를 예측한
# .csv 형식으로 저장해야 합니다.
# 예측 데이터프레임의 크기는 [105120 * 1]입니다.


# Script entry point: seed the RNGs, define the run settings, preprocess the
# data and write the prediction file.
if __name__ == '__main__':

    # Seed the random number generators before any data or model work.
    seed(15)
    # Use the first CUDA GPU when one is available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    # Module-level settings; save_pred reads BATCH_SIZE and hidden_size as globals.
    BATCH_SIZE =  256
    # NOTE(review): N_EPOCHS is not used anywhere in the code visible here.
    N_EPOCHS = 50
    hidden_size = 10

    # Data preprocessing (train_data is returned but not used below).
    train_data, test_data = preprocess_data()

    # Save the prediction results
    save_pred(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(False, False, False)
(True, False, False)
0.1981596269905579
예측 결과. 
        Prediction
0                0
1                0
2                0
3                0
4                0
...            ...
105115           1
105116           1
105117           1
105118           1
105119           1

[105120 rows x 1 columns]

0    104962
1       158
Name: Prediction, dtype: int64
