In [1]:
import glob
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# 폴더 경로 지정
folder_path = r'C:\Users\pc\Desktop\CNN\CIC-AndMal2017'

In [2]:
# 랜섬웨어 클래스별 파일에 있는 샘플 개수를 확인하기 위한 딕셔너리
samples_per_class = {}

# Ransomware 폴더 내의 10개 폴더 경로 리스트 받아오기
ransomware_folder_list = glob.glob(folder_path + '\\Ransomware\\*')

# 모든 파일 로드 및 샘플 개수 확인
for folder in ransomware_folder_list:
    ransomware_class = folder.split('\\')[-1]  # 랜섬웨어 클래스명 추출

    file_paths = glob.glob(folder + '\\*.csv')
    total_samples = 0  # 클래스별 전체 샘플 개수 초기화
    for file_path in file_paths:
        data = pd.read_csv(file_path)
        total_samples += data.shape[0]  # 데이터프레임의 행 수 / 샘플 개수 더하기

    samples_per_class[ransomware_class] = total_samples

# 클래스별 샘플 개수 출력
for ransomware_class, num_samples in samples_per_class.items():
    print(f"{ransomware_class} 클래스의 샘플 개수: {num_samples}")

Charger 클래스의 샘플 개수: 39551
Jisut 클래스의 샘플 개수: 25672
Koler 클래스의 샘플 개수: 44555
Lockerpin 클래스의 샘플 개수: 25307
Pletor 클래스의 샘플 개수: 4715
PornDroid 클래스의 샘플 개수: 46082
RansomBO 클래스의 샘플 개수: 39859
Simplocker 클래스의 샘플 개수: 36340
SVpeng 클래스의 샘플 개수: 54161
WannaLocker 클래스의 샘플 개수: 32701


In [3]:
benign_folder_path = r'C:\Users\pc\Desktop\CNN\CIC-AndMal2017\Benign'

# 'Benign' 클래스의 파일 경로 리스트 받아오기
benign_file_paths = glob.glob(benign_folder_path + '/*.csv')

# 'Benign' 클래스의 데이터프레임 초기화
benign_data = pd.DataFrame()

# 'Benign' 클래스의 모든 파일 로드
for file_path in benign_file_paths:
    data = pd.read_csv(file_path)
    benign_data = pd.concat([benign_data, data], ignore_index=True)

# 'Benign' 클래스에서 200,000개 무작위 샘플 추출
num_samples_benign = 200000
if len(benign_data) > num_samples_benign:
    benign_subset = benign_data.sample(n=num_samples_benign, random_state=42)
else:
    benign_subset = benign_data.copy()

print("benign_subset의 샘플 개수:", len(benign_subset))

benign_subset의 샘플 개수: 200000


In [4]:
ransomware_folder_list = glob.glob(folder_path + '\\Ransomware\\*')
all_ransomware_data = pd.DataFrame()

for ransomware_folder_path in ransomware_folder_list:
    ransomware_file_paths = glob.glob(ransomware_folder_path + '/*.csv')
    
    # 각 랜섬웨어 클래스의 모든 파일 로드
    for file_path in ransomware_file_paths:
        data = pd.read_csv(file_path)
        all_ransomware_data = pd.concat([all_ransomware_data, data], ignore_index=True)

# 전체 랜섬웨어 데이터에서 200,000개 샘플 무작위로 추출
if len(all_ransomware_data) > 200000:
    all_ransomware_subset = all_ransomware_data.sample(n=200000, random_state=42, replace=False)
else:
    all_ransomware_subset = all_ransomware_data.copy()

print("all_ransomware_subset의 샘플 개수:", len(all_ransomware_subset))

all_ransomware_subset의 샘플 개수: 200000


In [5]:
# 'Benign' 클래스와 랜섬웨어 클래스의 데이터프레임들을 합치기
sub_dataset = pd.concat([benign_subset, all_ransomware_subset], ignore_index=True)

print("최종 특성추출 데이터세트의 샘플 개수:", len(sub_dataset))

최종 특성추출 데이터세트의 샘플 개수: 400000


In [6]:
from sklearn.model_selection import train_test_split

# 제거할 비수치형 열 목록
non_numeric_columns = ['Flow ID', ' Source IP', ' Destination IP', ' Timestamp']

# 학습 및 임시 데이터 분할 (80% 학습, 20% 임시)
train_benign, temp_benign = train_test_split(benign_subset, test_size=0.20, random_state=42)
train_ransomware, temp_ransomware = train_test_split(all_ransomware_subset, test_size=0.20, random_state=42)

# 임시 데이터를 검증 및 테스트 데이터로 분할 (각각 50%, 합쳐서 20%를 10% 검증, 10% 테스트로 분할)
val_benign, test_benign = train_test_split(temp_benign, test_size=0.50, random_state=42)
val_ransomware, test_ransomware = train_test_split(temp_ransomware, test_size=0.50, random_state=42)

# 각 분할된 데이터 세트를 병합하여 최종 학습, 검증, 테스트 데이터 세트 생성
train_data = pd.concat([train_benign, train_ransomware], ignore_index=True)
val_data = pd.concat([val_benign, val_ransomware], ignore_index=True)
test_data = pd.concat([test_benign, test_ransomware], ignore_index=True)

print("Train samples:", len(train_data))
print("Validation samples:", len(val_data))
print("Test samples:", len(test_data))

Train samples: 320000
Validation samples: 40000
Test samples: 40000


In [7]:
# 학습, 검증, 테스트 데이터의 특성 추출
X_train = train_data.drop([' Label'] + non_numeric_columns, axis=1)
X_val = val_data.drop([' Label'] + non_numeric_columns, axis=1)
X_test = test_data.drop([' Label'] + non_numeric_columns, axis=1)

# 결과 출력
print("학습 데이터:")
print(X_train.head())
print("\n검증 데이터:")
print(X_val.head())
print("\n테스트 데이터:")
print(X_test.head())

학습 데이터:
    Source Port   Destination Port   Protocol   Flow Duration  \
0         37241                 80          6          128557   
1         47209                443          6         9164305   
2         58552                443          6            1044   
3         54797                 80          6        19693155   
4         58626                443          6        66444433   

    Total Fwd Packets   Total Backward Packets  Total Length of Fwd Packets  \
0                   4                        3                        371.0   
1                   9                       10                       1679.0   
2                   2                        0                         31.0   
3                   3                        1                          0.0   
4                   9                        0                        207.0   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                         946.0                   371.0   
1       

In [8]:
# 학습, 검증, 테스트 데이터의 타겟 변수 추출
y_train = train_data[' Label']
y_val = val_data[' Label']
y_test = test_data[' Label']

In [9]:
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 시작 시간 기록
start_time = time.time()

# 2. Decision Tree 학습 및 평가
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

dt_val_predictions = dt_classifier.predict(X_val)
print("Decision Tree - Validation Accuracy:", accuracy_score(y_val, dt_val_predictions))
print("Decision Tree - Validation Report:\n", classification_report(y_val, dt_val_predictions))

dt_test_predictions = dt_classifier.predict(X_test)
print("Decision Tree - Test Accuracy:", accuracy_score(y_test, dt_test_predictions))
print("Decision Tree - Test Report:\n", classification_report(y_test, dt_test_predictions))

# 종료 시간 기록 및 소요 시간 계산
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Decision Tree training and evaluation took {elapsed_time:.2f} seconds")

Decision Tree - Validation Accuracy: 0.479325
Decision Tree - Validation Report:
                         precision    recall  f1-score   support

                BENIGN       0.70      0.68      0.69     20000
    RANSOMWARE_CHARGER       0.23      0.25      0.24      2230
      RANSOMWARE_JISUT       0.18      0.18      0.18      1465
      RANSOMWARE_KOLER       0.32      0.32      0.32      2574
  RANSOMWARE_LOCKERPIN       0.18      0.19      0.18      1419
     RANSOMWARE_PLETOR       0.45      0.47      0.46       279
  RANSOMWARE_PORNDROID       0.29      0.28      0.28      2706
   RANSOMWARE_RANSOMBO       0.25      0.26      0.25      2260
 RANSOMWARE_SIMPLOCKER       0.22      0.23      0.23      2054
     RANSOMWARE_SVPENG       0.38      0.38      0.38      3169
RANSOMWARE_WANNALOCKER       0.22      0.22      0.22      1844

              accuracy                           0.48     40000
             macro avg       0.31      0.32      0.31     40000
          weighted a

In [11]:
# 시작 시간 기록
start_time = time.time()

# 3. Random Forest 학습 및 평가
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

rf_val_predictions = rf_classifier.predict(X_val)
print("Random Forest - Validation Accuracy:", accuracy_score(y_val, rf_val_predictions))
print("Random Forest - Validation Report:\n", classification_report(y_val, rf_val_predictions))

rf_test_predictions = rf_classifier.predict(X_test)
print("Random Forest - Test Accuracy:", accuracy_score(y_test, rf_test_predictions))
print("Random Forest - Test Report:\n", classification_report(y_test, rf_test_predictions))

# 종료 시간 기록 및 소요 시간 계산
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Random Forest training and evaluation took {elapsed_time:.2f} seconds")

Random Forest - Validation Accuracy: 0.548
Random Forest - Validation Report:
                         precision    recall  f1-score   support

                BENIGN       0.64      0.86      0.74     20000
    RANSOMWARE_CHARGER       0.29      0.20      0.23      2230
      RANSOMWARE_JISUT       0.23      0.13      0.16      1465
      RANSOMWARE_KOLER       0.40      0.28      0.33      2574
  RANSOMWARE_LOCKERPIN       0.25      0.13      0.18      1419
     RANSOMWARE_PLETOR       0.58      0.48      0.52       279
  RANSOMWARE_PORNDROID       0.35      0.24      0.29      2706
   RANSOMWARE_RANSOMBO       0.32      0.22      0.26      2260
 RANSOMWARE_SIMPLOCKER       0.29      0.20      0.24      2054
     RANSOMWARE_SVPENG       0.50      0.34      0.40      3169
RANSOMWARE_WANNALOCKER       0.34      0.20      0.25      1844

              accuracy                           0.55     40000
             macro avg       0.38      0.30      0.33     40000
          weighted avg 

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# 시작 시간 기록
start_time = time.time()

# 4. K-Nearest Neighbors 학습 및 평가
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

knn_val_predictions = knn_classifier.predict(X_val)
print("KNN - Validation Accuracy:", accuracy_score(y_val, knn_val_predictions))
print("KNN - Validation Report:\n", classification_report(y_val, knn_val_predictions))

knn_test_predictions = knn_classifier.predict(X_test)
print("KNN - Test Accuracy:", accuracy_score(y_test, knn_test_predictions))
print("KNN - Test Report:\n", classification_report(y_test, knn_test_predictions))

# 종료 시간 기록 및 소요 시간 계산
end_time = time.time()
elapsed_time = end_time - start_time

print(f"KNN training and evaluation took {elapsed_time:.2f} seconds")

KNN - Validation Accuracy: 0.493875
KNN - Validation Report:
                         precision    recall  f1-score   support

                BENIGN       0.56      0.86      0.68     20000
    RANSOMWARE_CHARGER       0.22      0.13      0.17      2230
      RANSOMWARE_JISUT       0.16      0.06      0.09      1465
      RANSOMWARE_KOLER       0.35      0.19      0.25      2574
  RANSOMWARE_LOCKERPIN       0.16      0.06      0.08      1419
     RANSOMWARE_PLETOR       0.47      0.26      0.34       279
  RANSOMWARE_PORNDROID       0.29      0.14      0.19      2706
   RANSOMWARE_RANSOMBO       0.21      0.10      0.13      2260
 RANSOMWARE_SIMPLOCKER       0.22      0.09      0.13      2054
     RANSOMWARE_SVPENG       0.35      0.17      0.22      3169
RANSOMWARE_WANNALOCKER       0.23      0.07      0.11      1844

              accuracy                           0.49     40000
             macro avg       0.29      0.19      0.22     40000
          weighted avg       0.41      0