In [1]:
import glob
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# 폴더 경로 지정
folder_path = r'C:\Users\pc\Desktop\CNN\CIC-AndMal2017'

In [2]:
# 랜섬웨어 클래스별 파일에 있는 샘플 개수를 확인하기 위한 딕셔너리
samples_per_class = {}

# Ransomware 폴더 내의 10개 폴더 경로 리스트 받아오기
ransomware_folder_list = glob.glob(folder_path + '\\Ransomware\\*')

# 모든 파일 로드 및 샘플 개수 확인
for folder in ransomware_folder_list:
    ransomware_class = folder.split('\\')[-1]  # 랜섬웨어 클래스명 추출

    file_paths = glob.glob(folder + '\\*.csv')
    total_samples = 0  # 클래스별 전체 샘플 개수 초기화
    for file_path in file_paths:
        data = pd.read_csv(file_path)
        total_samples += data.shape[0]  # 데이터프레임의 행 수 / 샘플 개수 더하기

    samples_per_class[ransomware_class] = total_samples

# 클래스별 샘플 개수 출력
for ransomware_class, num_samples in samples_per_class.items():
    print(f"{ransomware_class} 클래스의 샘플 개수: {num_samples}")

Charger 클래스의 샘플 개수: 39551
Jisut 클래스의 샘플 개수: 25672
Koler 클래스의 샘플 개수: 44555
Lockerpin 클래스의 샘플 개수: 25307
Pletor 클래스의 샘플 개수: 4715
PornDroid 클래스의 샘플 개수: 46082
RansomBO 클래스의 샘플 개수: 39859
Simplocker 클래스의 샘플 개수: 36340
SVpeng 클래스의 샘플 개수: 54161
WannaLocker 클래스의 샘플 개수: 32701


In [3]:
benign_folder_path = r'C:\Users\pc\Desktop\CNN\CIC-AndMal2017\Benign'

# 'Benign' 클래스의 파일 경로 리스트 받아오기
benign_file_paths = glob.glob(benign_folder_path + '/*.csv')

# 'Benign' 클래스의 데이터프레임 초기화
benign_data = pd.DataFrame()

# 'Benign' 클래스의 모든 파일 로드
for file_path in benign_file_paths:
    data = pd.read_csv(file_path)
    benign_data = pd.concat([benign_data, data], ignore_index=True)

# 'Benign' 클래스에서 200,000개 무작위 샘플 추출
num_samples_benign = 200000
if len(benign_data) > num_samples_benign:
    benign_subset = benign_data.sample(n=num_samples_benign, random_state=42)
else:
    benign_subset = benign_data.copy()

print("benign_subset의 샘플 개수:", len(benign_subset))

benign_subset의 샘플 개수: 200000


In [4]:
ransomware_folder_list = glob.glob(folder_path + '\\Ransomware\\*')
all_ransomware_data = pd.DataFrame()

for ransomware_folder_path in ransomware_folder_list:
    ransomware_file_paths = glob.glob(ransomware_folder_path + '/*.csv')
    
    # 각 랜섬웨어 클래스의 모든 파일 로드
    for file_path in ransomware_file_paths:
        data = pd.read_csv(file_path)
        all_ransomware_data = pd.concat([all_ransomware_data, data], ignore_index=True)

# 전체 랜섬웨어 데이터에서 200,000개 샘플 무작위로 추출
if len(all_ransomware_data) > 200000:
    all_ransomware_subset = all_ransomware_data.sample(n=200000, random_state=42, replace=False)
else:
    all_ransomware_subset = all_ransomware_data.copy()

print("all_ransomware_subset의 샘플 개수:", len(all_ransomware_subset))

all_ransomware_subset의 샘플 개수: 200000


In [5]:
# 'Benign' 클래스와 랜섬웨어 클래스의 데이터프레임들을 합치기
sub_dataset = pd.concat([benign_subset, all_ransomware_subset], ignore_index=True)

print("최종 특성추출 데이터세트의 샘플 개수:", len(sub_dataset))

최종 특성추출 데이터세트의 샘플 개수: 400000


In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# IP 주소를 수치형 데이터로 변환하는 함수
def ip_to_numeric(ip):
    if not isinstance(ip, str):
        return np.nan
    return sum(int(octet) * 256 ** (3 - index) for index, octet in enumerate(ip.split('.')))

# Apply the ip_to_numeric function to the 'Destination IP' column
sub_dataset['Destination IP'] = sub_dataset[' Destination IP'].apply(ip_to_numeric)
benign_subset['Destination IP'] = benign_subset[' Destination IP'].apply(ip_to_numeric)
all_ransomware_subset['Destination IP'] = all_ransomware_subset[' Destination IP'].apply(ip_to_numeric)


# 상관분석으로 선택된 특성들만 사용
selected_features = [
    'Destination IP',
    ' ACK Flag Count',
    ' Active Max',
    ' Active Min',
    ' Active Std',
    ' Average Packet Size',
    ' Avg Bwd Segment Size',
    ' Avg Fwd Segment Size',
    ' Bwd Header Length',
    ' Bwd IAT Max',
    ' Bwd IAT Mean',
    ' Bwd Packet Length Mean',
    ' Bwd Packet Length Std',
    ' Bwd Packets/s',
    ' Flow Duration',
    ' Flow Packets/s',
    ' Fwd Header Length',
    ' Fwd IAT Mean',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Mean',
    ' Fwd Packet Length Std',
    ' Idle Max',
    ' Idle Min',
    ' Idle Std',
    ' Max Packet Length',
    ' Min Packet Length',
    ' PSH Flag Count',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' Protocol',
    ' SYN Flag Count',
    ' Subflow Bwd Bytes',
    ' Subflow Bwd Packets',
    ' Subflow Fwd Bytes',
    ' Total Backward Packets',
    ' Total Fwd Packets',
    ' Total Length of Bwd Packets',
    ' URG Flag Count',
    ' act_data_pkt_fwd',
    ' min_seg_size_forward',
    'Active Mean',
    'Bwd IAT Total',
    'Bwd Packet Length Max',
    'Flow Bytes/s',
    'Fwd IAT Total',
    'Fwd PSH Flags',
    'Fwd Packets/s',
    'Idle Mean',
    'Init_Win_bytes_forward',
    'Subflow Fwd Packets',
    'Total Length of Fwd Packets'
]

# 선택된 특성만 포함하는 데이터프레임 생성
X_selected = sub_dataset[selected_features]

# 결과 출력
print("정규화된 선택된 특성들의 데이터프레임:")
print(X_selected.head())

정규화된 선택된 특성들의 데이터프레임:
   Destination IP   ACK Flag Count   Active Max   Active Min   Active Std  \
0       170524673                0          0.0          0.0          0.0   
1      3029698754                1          0.0          0.0          0.0   
2      1760832259                0     152071.0     152071.0          0.0   
3      2362304008                0          0.0          0.0          0.0   
4      2899904270                1          0.0          0.0          0.0   

    Average Packet Size   Avg Bwd Segment Size   Avg Fwd Segment Size  \
0             90.500000                 103.00                   39.0   
1              0.000000                   0.00                    0.0   
2             75.500000                  37.40                  113.6   
3            214.142857                 130.25                  326.0   
4             34.500000                   0.00                   23.0   

    Bwd Header Length   Bwd IAT Max  ...  Bwd IAT Total  \
0                

In [12]:
from sklearn.model_selection import train_test_split

# 학습 및 임시 데이터 분할 (80% 학습, 20% 임시)
train_benign, temp_benign = train_test_split(benign_subset, test_size=0.20, random_state=42)
train_ransomware, temp_ransomware = train_test_split(all_ransomware_subset, test_size=0.20, random_state=42)

# 임시 데이터를 검증 및 테스트 데이터로 분할 (각각 50%, 합쳐서 20%를 10% 검증, 10% 테스트로 분할)
val_benign, test_benign = train_test_split(temp_benign, test_size=0.50, random_state=42)
val_ransomware, test_ransomware = train_test_split(temp_ransomware, test_size=0.50, random_state=42)

# 각 분할된 데이터 세트를 병합하여 최종 학습, 검증, 테스트 데이터 세트 생성
train_data = pd.concat([train_benign, train_ransomware], ignore_index=True)
val_data = pd.concat([val_benign, val_ransomware], ignore_index=True)
test_data = pd.concat([test_benign, test_ransomware], ignore_index=True)

print("Train samples:", len(train_data))
print("Validation samples:", len(val_data))
print("Test samples:", len(test_data))

Train samples: 320000
Validation samples: 40000
Test samples: 40000


In [14]:
selected_feature_columns = X_selected.columns

# 학습, 검증, 테스트 데이터 추출
X_train = train_data[selected_feature_columns]
X_val = val_data[selected_feature_columns]
X_test = test_data[selected_feature_columns]

# 결과 출력
print("학습 데이터:")
print(X_train.head())
print("\n검증 데이터:")
print(X_val.head())
print("\n테스트 데이터:")
print(X_test.head())

학습 데이터:
   Destination IP   ACK Flag Count   Active Max   Active Min   Active Std  \
0      3627736035                0          0.0          0.0          0.0   
1      3075412331                0     859589.0     859589.0          0.0   
2      2899905258                1          0.0          0.0          0.0   
3      3414308222                0          0.0          0.0          0.0   
4      2899905742                1    8074491.0    8074491.0          0.0   

    Average Packet Size   Avg Bwd Segment Size   Avg Fwd Segment Size  \
0            188.142857             315.333333              92.750000   
1            286.157895             375.800000             186.555556   
2             31.000000               0.000000              15.500000   
3              0.000000               0.000000               0.000000   
4             25.555556               0.000000              23.000000   

    Bwd Header Length   Bwd IAT Max  ...  Bwd IAT Total  \
0                 104       455

In [15]:
# 학습, 검증, 테스트 데이터의 타겟 변수 추출
y_train = train_data[' Label']
y_val = val_data[' Label']
y_test = test_data[' Label']

# 타겟 변수에서 'benign'이 아닌 모든 값들을 'ransomware'로 변경
y_train = y_train.apply(lambda x: 'RANSOMWARE' if x != 'BENIGN' else 'BENIGN')
y_val = y_val.apply(lambda x: 'RANSOMWARE' if x != 'BENIGN' else 'BENIGN')
y_test = y_test.apply(lambda x: 'RANSOMWARE' if x != 'BENIGN' else 'BENIGN')

In [16]:
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 시작 시간 기록
start_time = time.time()

# 2. Decision Tree 학습 및 평가
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

dt_val_predictions = dt_classifier.predict(X_val)
print("Decision Tree - Validation Accuracy:", accuracy_score(y_val, dt_val_predictions))
print("Decision Tree - Validation Report:\n", classification_report(y_val, dt_val_predictions))

dt_test_predictions = dt_classifier.predict(X_test)
print("Decision Tree - Test Accuracy:", accuracy_score(y_test, dt_test_predictions))
print("Decision Tree - Test Report:\n", classification_report(y_test, dt_test_predictions))

# 종료 시간 기록 및 소요 시간 계산
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Decision Tree training and evaluation took {elapsed_time:.2f} seconds")

Decision Tree - Validation Accuracy: 0.71035
Decision Tree - Validation Report:
               precision    recall  f1-score   support

      BENIGN       0.71      0.71      0.71     20000
  RANSOMWARE       0.71      0.71      0.71     20000

    accuracy                           0.71     40000
   macro avg       0.71      0.71      0.71     40000
weighted avg       0.71      0.71      0.71     40000

Decision Tree - Test Accuracy: 0.712225
Decision Tree - Test Report:
               precision    recall  f1-score   support

      BENIGN       0.71      0.71      0.71     20000
  RANSOMWARE       0.71      0.71      0.71     20000

    accuracy                           0.71     40000
   macro avg       0.71      0.71      0.71     40000
weighted avg       0.71      0.71      0.71     40000

Decision Tree training and evaluation took 12.27 seconds


In [17]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 시작 시간 기록
start_time = time.time()

# 3. Random Forest 학습 및 평가
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

rf_val_predictions = rf_classifier.predict(X_val)
print("Random Forest - Validation Accuracy:", accuracy_score(y_val, rf_val_predictions))
print("Random Forest - Validation Report:\n", classification_report(y_val, rf_val_predictions))

rf_test_predictions = rf_classifier.predict(X_test)
print("Random Forest - Test Accuracy:", accuracy_score(y_test, rf_test_predictions))
print("Random Forest - Test Report:\n", classification_report(y_test, rf_test_predictions))

# 종료 시간 기록 및 소요 시간 계산
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Random Forest training and evaluation took {elapsed_time:.2f} seconds")

Random Forest - Validation Accuracy: 0.7274
Random Forest - Validation Report:
               precision    recall  f1-score   support

      BENIGN       0.73      0.73      0.73     20000
  RANSOMWARE       0.73      0.73      0.73     20000

    accuracy                           0.73     40000
   macro avg       0.73      0.73      0.73     40000
weighted avg       0.73      0.73      0.73     40000

Random Forest - Test Accuracy: 0.729275
Random Forest - Test Report:
               precision    recall  f1-score   support

      BENIGN       0.73      0.73      0.73     20000
  RANSOMWARE       0.73      0.73      0.73     20000

    accuracy                           0.73     40000
   macro avg       0.73      0.73      0.73     40000
weighted avg       0.73      0.73      0.73     40000

Random Forest training and evaluation took 116.02 seconds


In [18]:
from sklearn.neighbors import KNeighborsClassifier

# 시작 시간 기록
start_time = time.time()

# 4. K-Nearest Neighbors 학습 및 평가
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

knn_val_predictions = knn_classifier.predict(X_val)
print("KNN - Validation Accuracy:", accuracy_score(y_val, knn_val_predictions))
print("KNN - Validation Report:\n", classification_report(y_val, knn_val_predictions))

knn_test_predictions = knn_classifier.predict(X_test)
print("KNN - Test Accuracy:", accuracy_score(y_test, knn_test_predictions))
print("KNN - Test Report:\n", classification_report(y_test, knn_test_predictions))

# 종료 시간 기록 및 소요 시간 계산
end_time = time.time()
elapsed_time = end_time - start_time

print(f"KNN training and evaluation took {elapsed_time:.2f} seconds")

KNN - Validation Accuracy: 0.667675
KNN - Validation Report:
               precision    recall  f1-score   support

      BENIGN       0.67      0.67      0.67     20000
  RANSOMWARE       0.67      0.66      0.67     20000

    accuracy                           0.67     40000
   macro avg       0.67      0.67      0.67     40000
weighted avg       0.67      0.67      0.67     40000

KNN - Test Accuracy: 0.665575
KNN - Test Report:
               precision    recall  f1-score   support

      BENIGN       0.66      0.67      0.67     20000
  RANSOMWARE       0.67      0.66      0.66     20000

    accuracy                           0.67     40000
   macro avg       0.67      0.67      0.67     40000
weighted avg       0.67      0.67      0.67     40000

KNN training and evaluation took 22.79 seconds
