In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Load the original data set
origin_df = pd.read_csv(filepath_or_buffer="./04_hashed.csv")

In [None]:
# IP preprocessing
def transform_ip(ip):
    """Return ``ip`` with every dot-separated group left-padded to 3 characters.

    The groups are zero-padded with ``str.zfill(3)`` and concatenated without
    separators, e.g. ``"1.2.3.4"`` becomes ``"001002003004"``.

    Args:
        ip: Dotted string to normalise.

    Returns:
        The concatenation of the padded groups, as a string.
    """
    return "".join(part.zfill(3) for part in ip.split("."))

# Feature preprocessing
def preprocess(df):
    """Encode and scale the feature columns, returning a new DataFrame.

    Steps, in order:
      1. ``src_ip`` / ``dst_ip`` are normalised with ``transform_ip``.
      2. ``src_country`` / ``dst_country`` are label-encoded.
      3. Every column is scaled with ``RobustScaler``.

    Args:
        df: DataFrame containing at least ``src_ip``, ``dst_ip``,
            ``src_country`` and ``dst_country``. Every column is handed to the
            scaler, so after steps 1-2 the values presumably need to be
            numeric or numeric-looking strings -- confirm against the data.

    Returns:
        A new DataFrame with the same columns and index as ``df`` holding the
        scaled values. The caller's frame is left unmodified.
    """
    # Work on a copy: the column assignments below would otherwise write the
    # encoded values back into the frame the caller passed in.
    df = df.copy()

    # IP preprocessing
    df['src_ip'] = df['src_ip'].apply(transform_ip)
    df['dst_ip'] = df['dst_ip'].apply(transform_ip)

    # Country preprocessing. fit_transform refits the encoder on each call, so
    # the source and destination columns receive independent encodings.
    label_encoder = LabelEncoder()
    df['src_country'] = label_encoder.fit_transform(df['src_country'])
    df['dst_country'] = label_encoder.fit_transform(df['dst_country'])

    # Apply the scaler to all features and rebuild a frame with the original
    # column labels and index.
    robust_scaler = RobustScaler()
    feature_trans = robust_scaler.fit_transform(df)
    return pd.DataFrame(feature_trans, columns=df.columns, index=df.index)

In [None]:
# Model training and inference
def model(input_df, model_name):
    """Fit a One-Class SVM on the feature columns and predict on the same rows.

    Args:
        input_df: DataFrame containing the eight feature columns selected
            below; any other column (such as Rdate) is ignored.
        model_name: Prefix of the prediction column, which is named
            ``{model_name}_y_pred``.

    Returns:
        A tuple ``(result_df, feature_preprocess_df, y_pred)``: the features
        with the prediction column appended, the features alone, and the raw
        prediction array returned by the estimator.
    """
    # Select every feature except Rdate
    feature_columns = [
        "src_ip", "dst_ip", "Proto", "src_port", "dst_port",
        "Action", "src_country", "dst_country",
    ]
    feature_preprocess_df = input_df[feature_columns]

    # Train the One-Class SVM, then predict on the training rows themselves
    detector = OneClassSVM(gamma='auto')
    detector.fit(feature_preprocess_df)
    y_pred = detector.predict(feature_preprocess_df)

    # Wrap the predictions in a frame that shares the feature index so the
    # column-wise concat lines up row by row
    prediction_column = f'{model_name}_y_pred'
    y_pred_df = pd.DataFrame(
        y_pred,
        columns=[prediction_column],
        index=feature_preprocess_df.index,
    )
    result_df = pd.concat([feature_preprocess_df, y_pred_df], axis=1)
    return result_df, feature_preprocess_df, y_pred

In [None]:
# Distinct Rdate values
unique_time = origin_df['Rdate'].unique()

# Scale every feature except Rdate; the columns are cast to str before
# being handed to preprocess
preprocess_df = preprocess(
    origin_df[
        ["src_ip", "dst_ip", "Proto", "src_port", "dst_port", "Action", "src_country", "dst_country"]
    ].astype(str)
)

# Add the Rdate feature back
preprocess_df['Rdate'] = origin_df['Rdate']
preprocess_df

In [None]:
# Number of distinct Rdate values processed per model fit
window_size = 240

count = 0
model_name = 'one_class_svm'

# Per-window detection results; concatenated once after the loop instead of
# re-copying an ever-growing DataFrame on every iteration
window_results = []

# Walk the distinct Rdate values one window at a time. Every start offset is a
# valid index, so the shorter final window falls out of ordinary slicing rather
# than relying on a swallowed IndexError, and no empty trailing window is
# produced when the number of Rdate values is an exact multiple of window_size.
for window_start in range(0, len(unique_time), window_size):
    window_times = unique_time[window_start:window_start + window_size]

    # Collect the rows of each Rdate in the window, in the window's Rdate order
    window_df = pd.concat(
        [preprocess_df[preprocess_df['Rdate'] == rdate] for rdate in window_times]
    )

    # A window can still be empty if none of its Rdate values match a row
    # (e.g. a missing value never compares equal); the model cannot be fitted
    # on zero rows, so such a window is skipped.
    if not window_df.empty:
        # Run the model
        result_df, feature_df, y_pred = model(window_df, model_name)
        window_results.append(result_df)

        scatter_matrix(feature_df,
                       c=y_pred,
                       alpha=0.5,
                       figsize=(20, 20),
                       diagonal='kde')

        plt.show()

    # Cap at the total so the final partial window does not over-report
    count = min(window_start + window_size, len(unique_time))
    print(" @@@@@@@@@@@@@@@@@@@@ %d 개 Rdate 중, %d 개 확인 완료 @@@@@@@@@@@@@@@@@@@@ " % (len(unique_time), count))

# Detection results for the whole data set
total_result_df = pd.concat(window_results) if window_results else pd.DataFrame()
    

In [None]:
# Save the detection results to a file
prediction_column = f'{model_name}_y_pred'
# Assignment aligns on the index, attaching each prediction to its source row
origin_df[prediction_column] = total_result_df[prediction_column]
origin_df.to_csv(path_or_buf=f"{model_name}_result.csv", index=False)
origin_df