In [1]:
import glob
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Root directory of the dataset; the ransomware-loading cells build their glob patterns from it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running on another machine.
folder_path = r'C:\Users\pc\Desktop\CNN\CIC-AndMal2017'

In [2]:
import os

# Number of samples (CSV rows) found for each ransomware class, keyed by class name.
samples_per_class = {}

# Every sub-directory of <folder_path>/Ransomware is treated as one ransomware class.
# os.path is used instead of hand-written '\\' separators so the pattern and the class-name
# extraction do not depend on the path separator of the platform.
ransomware_folder_list = [
    path
    for path in glob.glob(os.path.join(folder_path, 'Ransomware', '*'))
    if os.path.isdir(path)  # skip stray files sitting next to the class folders
]

# Load every CSV of every class and accumulate the row counts.
for folder in ransomware_folder_list:
    # The last path component is the class name; normpath drops any trailing separator first.
    ransomware_class = os.path.basename(os.path.normpath(folder))

    file_paths = glob.glob(os.path.join(folder, '*.csv'))
    # Each file is parsed only to obtain its row count (one row == one sample).
    total_samples = sum(len(pd.read_csv(file_path)) for file_path in file_paths)

    samples_per_class[ransomware_class] = total_samples

# Report the number of samples per class.
for ransomware_class, num_samples in samples_per_class.items():
    print(f"{ransomware_class} 클래스의 샘플 개수: {num_samples}")

Charger 클래스의 샘플 개수: 39551
Jisut 클래스의 샘플 개수: 25672
Koler 클래스의 샘플 개수: 44555
Lockerpin 클래스의 샘플 개수: 25307
Pletor 클래스의 샘플 개수: 4715
PornDroid 클래스의 샘플 개수: 46082
RansomBO 클래스의 샘플 개수: 39859
Simplocker 클래스의 샘플 개수: 36340
SVpeng 클래스의 샘플 개수: 54161
WannaLocker 클래스의 샘플 개수: 32701


In [3]:
# The benign CSVs live in the 'Benign' sub-folder of the dataset root (folder_path, first cell).
# Deriving the path keeps a single place to edit when the dataset is moved.
benign_folder_path = folder_path + '\\Benign'

# Paths of all CSV files of the 'Benign' class.
benign_file_paths = glob.glob(benign_folder_path + '/*.csv')

# Read every file first and concatenate once: calling pd.concat inside the loop re-copies all
# previously loaded rows on each iteration. pd.concat raises on an empty list, so fall back to
# an empty frame when no file matched.
benign_frames = [pd.read_csv(file_path) for file_path in benign_file_paths]
benign_data = pd.concat(benign_frames, ignore_index=True) if benign_frames else pd.DataFrame()

# Draw 200,000 random benign samples (seeded); keep everything if fewer rows are available.
num_samples_benign = 200000
if len(benign_data) > num_samples_benign:
    benign_subset = benign_data.sample(n=num_samples_benign, random_state=42)
else:
    benign_subset = benign_data.copy()

print("benign_subset의 샘플 개수:", len(benign_subset))

benign_subset의 샘플 개수: 200000


In [4]:
# One entry per item found directly under <folder_path>\Ransomware.
ransomware_folder_list = glob.glob(folder_path + '\\Ransomware\\*')

# Read all CSVs of all ransomware classes (same folder/file order as a nested loop) and
# concatenate once: calling pd.concat inside the loop re-copies all previously loaded rows on
# each iteration. pd.concat raises on an empty list, so fall back to an empty frame.
ransomware_frames = [
    pd.read_csv(file_path)
    for ransomware_folder_path in ransomware_folder_list
    for file_path in glob.glob(ransomware_folder_path + '/*.csv')
]
all_ransomware_data = (
    pd.concat(ransomware_frames, ignore_index=True) if ransomware_frames else pd.DataFrame()
)

# Draw 200,000 random ransomware samples without replacement (seeded); keep everything if
# fewer rows are available.
if len(all_ransomware_data) > 200000:
    all_ransomware_subset = all_ransomware_data.sample(n=200000, random_state=42, replace=False)
else:
    all_ransomware_subset = all_ransomware_data.copy()

print("all_ransomware_subset의 샘플 개수:", len(all_ransomware_subset))

all_ransomware_subset의 샘플 개수: 200000


In [5]:
# Stack the benign sample on top of the ransomware sample; ignore_index renumbers the rows
# so the combined frame has a fresh 0..n-1 index.
frames_to_merge = [benign_subset, all_ransomware_subset]
sub_dataset = pd.concat(frames_to_merge, ignore_index=True)

print("최종 특성추출 데이터세트의 샘플 개수:", sub_dataset.shape[0])

최종 특성추출 데이터세트의 샘플 개수: 400000


In [6]:
# NOTE(review): MinMaxScaler and SimpleImputer are imported here but not used in any cell
# visible in this part of the notebook.
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Feature columns to keep; per the original author's note they were chosen by a correlation
# analysis (not shown here). The leading spaces are part of the column names.
selected_features = [
    ' Bwd Header Length',
    ' Fwd Header Length',
    ' Total Fwd Packets',
    ' act_data_pkt_fwd',
    ' min_seg_size_forward',
    'Subflow Fwd Packets'
]

# Frame restricted to the selected feature columns.
X_selected = sub_dataset[selected_features]

# Preview the result. No scaling is applied in this cell, so the label does not claim the
# values are normalized.
print("선택된 특성들의 데이터프레임:")
print(X_selected.head())

정규화된 선택된 특성들의 데이터프레임:
    Bwd Header Length   Fwd Header Length   Total Fwd Packets  \
0                  20                  20                   1   
1                  20                  20                   1   
2                 168                 168                   5   
3                 100                  80                   3   
4                   0                  64                   2   

    act_data_pkt_fwd   min_seg_size_forward  Subflow Fwd Packets  
0                  0                     20                    1  
1                  0                     20                    1  
2                  2                     32                    5  
3                  1                     20                    3  
4                  1                     32                    2  


In [7]:
from sklearn.model_selection import train_test_split

# Stage 1 - per class, hold out 20% of the rows; the remaining 80% is the training part.
train_benign, temp_benign = train_test_split(
    benign_subset, random_state=42, test_size=0.20
)
train_ransomware, temp_ransomware = train_test_split(
    all_ransomware_subset, random_state=42, test_size=0.20
)

# Stage 2 - cut each held-out part in half: validation and test (10% of the class each).
val_benign, test_benign = train_test_split(
    temp_benign, random_state=42, test_size=0.50
)
val_ransomware, test_ransomware = train_test_split(
    temp_ransomware, random_state=42, test_size=0.50
)

# Stage 3 - put the two classes back together for each split, renumbering the rows.
train_data, val_data, test_data = (
    pd.concat(class_parts, ignore_index=True)
    for class_parts in (
        [train_benign, train_ransomware],
        [val_benign, val_ransomware],
        [test_benign, test_ransomware],
    )
)

print("Train samples:", len(train_data))
print("Validation samples:", len(val_data))
print("Test samples:", len(test_data))

Train samples: 320000
Validation samples: 40000
Test samples: 40000


In [9]:
# Reuse the column set of X_selected (the selected feature columns) for every split.
selected_feature_columns = X_selected.columns

# Feature matrices for the training, validation and test splits.
X_train, X_val, X_test = (
    split_frame[selected_feature_columns]
    for split_frame in (train_data, val_data, test_data)
)

# Preview the first rows of each feature matrix.
train_preview = X_train.head()
val_preview = X_val.head()
test_preview = X_test.head()

print("학습 데이터:")
print(train_preview)
print("\n검증 데이터:")
print(val_preview)
print("\n테스트 데이터:")
print(test_preview)

학습 데이터:
    Bwd Header Length   Fwd Header Length   Total Fwd Packets  \
0                 104                 136                   4   
1                 328                 296                   9   
2                   0                  64                   2   
3                  32                  80                   3   
4                   0                 288                   9   

    act_data_pkt_fwd   min_seg_size_forward  Subflow Fwd Packets  
0                  1                     32                    4  
1                  3                     32                    9  
2                  0                     32                    2  
3                  0                     20                    3  
4                  8                     32                    9  

검증 데이터:
    Bwd Header Length   Fwd Header Length   Total Fwd Packets  \
0                   0                  64                   2   
1                  40                  20                   

In [10]:
def _to_binary_label(labels):
    """Collapse a label Series to two classes.

    Values equal to 'BENIGN' are kept; every other value (any ransomware family name,
    and also missing values) becomes 'RANSOMWARE'. Vectorised replacement for a per-row
    lambda; index and Series name are preserved.
    """
    return labels.where(labels == 'BENIGN', 'RANSOMWARE')


# Binary targets for the training, validation and test splits, taken from the ' Label' column
# (the leading space is part of the column name).
y_train = _to_binary_label(train_data[' Label'])
y_val = _to_binary_label(val_data[' Label'])
y_test = _to_binary_label(test_data[' Label'])

In [11]:
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Timer start: the measured span covers fitting plus both evaluation passes below.
start_time = time.time()

# 2. Decision Tree - fit on the training split with a fixed seed.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Score the validation split.
dt_val_predictions = dt_classifier.predict(X_val)
dt_val_accuracy = accuracy_score(y_val, dt_val_predictions)
dt_val_report = classification_report(y_val, dt_val_predictions)
print("Decision Tree - Validation Accuracy:", dt_val_accuracy)
print("Decision Tree - Validation Report:\n", dt_val_report)

# Score the test split.
dt_test_predictions = dt_classifier.predict(X_test)
dt_test_accuracy = accuracy_score(y_test, dt_test_predictions)
dt_test_report = classification_report(y_test, dt_test_predictions)
print("Decision Tree - Test Accuracy:", dt_test_accuracy)
print("Decision Tree - Test Report:\n", dt_test_report)

# Timer stop and elapsed seconds.
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Decision Tree training and evaluation took {elapsed_time:.2f} seconds")

Decision Tree - Validation Accuracy: 0.571725
Decision Tree - Validation Report:
               precision    recall  f1-score   support

      BENIGN       0.56      0.67      0.61     20000
  RANSOMWARE       0.59      0.48      0.53     20000

    accuracy                           0.57     40000
   macro avg       0.57      0.57      0.57     40000
weighted avg       0.57      0.57      0.57     40000

Decision Tree - Test Accuracy: 0.571675
Decision Tree - Test Report:
               precision    recall  f1-score   support

      BENIGN       0.56      0.67      0.61     20000
  RANSOMWARE       0.59      0.48      0.53     20000

    accuracy                           0.57     40000
   macro avg       0.57      0.57      0.57     40000
weighted avg       0.57      0.57      0.57     40000

Decision Tree training and evaluation took 2.61 seconds


In [12]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Timer start: the measured span covers fitting plus both evaluation passes below.
start_time = time.time()

# 3. Random Forest - 100 trees, fixed seed, fitted on the training split.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train, y_train)

# Score the validation split.
rf_val_predictions = rf_classifier.predict(X_val)
rf_val_accuracy = accuracy_score(y_val, rf_val_predictions)
rf_val_report = classification_report(y_val, rf_val_predictions)
print("Random Forest - Validation Accuracy:", rf_val_accuracy)
print("Random Forest - Validation Report:\n", rf_val_report)

# Score the test split.
rf_test_predictions = rf_classifier.predict(X_test)
rf_test_accuracy = accuracy_score(y_test, rf_test_predictions)
rf_test_report = classification_report(y_test, rf_test_predictions)
print("Random Forest - Test Accuracy:", rf_test_accuracy)
print("Random Forest - Test Report:\n", rf_test_report)

# Timer stop and elapsed seconds.
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Random Forest training and evaluation took {elapsed_time:.2f} seconds")

Random Forest - Validation Accuracy: 0.572
Random Forest - Validation Report:
               precision    recall  f1-score   support

      BENIGN       0.56      0.66      0.61     20000
  RANSOMWARE       0.59      0.48      0.53     20000

    accuracy                           0.57     40000
   macro avg       0.57      0.57      0.57     40000
weighted avg       0.57      0.57      0.57     40000

Random Forest - Test Accuracy: 0.573075
Random Forest - Test Report:
               precision    recall  f1-score   support

      BENIGN       0.56      0.66      0.61     20000
  RANSOMWARE       0.59      0.48      0.53     20000

    accuracy                           0.57     40000
   macro avg       0.58      0.57      0.57     40000
weighted avg       0.58      0.57      0.57     40000

Random Forest training and evaluation took 15.05 seconds


In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Timer start: the measured span covers fitting plus both evaluation passes below.
# (time, accuracy_score and classification_report come from the imports of earlier cells.)
start_time = time.time()

# 4. K-Nearest Neighbors - 5 neighbours, fitted on the training split.
# NOTE(review): the feature matrices are passed in as selected, without any scaling step.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Score the validation split.
knn_val_predictions = knn_classifier.predict(X_val)
knn_val_accuracy = accuracy_score(y_val, knn_val_predictions)
knn_val_report = classification_report(y_val, knn_val_predictions)
print("KNN - Validation Accuracy:", knn_val_accuracy)
print("KNN - Validation Report:\n", knn_val_report)

# Score the test split.
knn_test_predictions = knn_classifier.predict(X_test)
knn_test_accuracy = accuracy_score(y_test, knn_test_predictions)
knn_test_report = classification_report(y_test, knn_test_predictions)
print("KNN - Test Accuracy:", knn_test_accuracy)
print("KNN - Test Report:\n", knn_test_report)

# Timer stop and elapsed seconds.
end_time = time.time()
elapsed_time = end_time - start_time

print(f"KNN training and evaluation took {elapsed_time:.2f} seconds")

KNN - Validation Accuracy: 0.517475
KNN - Validation Report:
               precision    recall  f1-score   support

      BENIGN       0.51      0.96      0.67     20000
  RANSOMWARE       0.65      0.08      0.14     20000

    accuracy                           0.52     40000
   macro avg       0.58      0.52      0.40     40000
weighted avg       0.58      0.52      0.40     40000

KNN - Test Accuracy: 0.517025
KNN - Test Report:
               precision    recall  f1-score   support

      BENIGN       0.51      0.96      0.66     20000
  RANSOMWARE       0.64      0.08      0.14     20000

    accuracy                           0.52     40000
   macro avg       0.58      0.52      0.40     40000
weighted avg       0.58      0.52      0.40     40000

KNN training and evaluation took 21.58 seconds


In [14]:
import xgboost as xgb

# Numeric ids for the two classes (BENIGN: 0, RANSOMWARE: 1).
label_to_id = {'BENIGN': 0, 'RANSOMWARE': 1}


def _encode_labels(labels):
    """Return `labels` with 'BENIGN'/'RANSOMWARE' replaced by 0/1 as int64.

    Values that are already 0/1 pass through unchanged, so running this cell a second time
    does not turn the targets into NaN (which a plain dict `.map` on already-encoded values
    would do). Any other value makes the int64 cast fail instead of silently becoming NaN.
    """
    return labels.map(lambda value: label_to_id.get(value, value)).astype('int64')


# Convert the targets to numeric ids before fitting XGBoost. The names are rebound on purpose:
# the following cell reads y_train / y_val / y_test in this numeric form.
y_train = _encode_labels(y_train)
y_val = _encode_labels(y_val)
y_test = _encode_labels(y_test)

# Timer start: the measured span covers fitting plus both evaluation passes below.
start_time = time.time()

# 5. Gradient Boosting (using XGBoost) - default hyper-parameters, fixed seed.
xg_classifier = xgb.XGBClassifier(random_state=42)
xg_classifier.fit(X_train, y_train)

# Score the validation split.
xg_val_predictions = xg_classifier.predict(X_val)
print("XGBoost - Validation Accuracy:", accuracy_score(y_val, xg_val_predictions))
print("XGBoost - Validation Report:\n", classification_report(y_val, xg_val_predictions))

# Score the test split.
xg_test_predictions = xg_classifier.predict(X_test)
print("XGBoost - Test Accuracy:", accuracy_score(y_test, xg_test_predictions))
print("XGBoost - Test Report:\n", classification_report(y_test, xg_test_predictions))

# Timer stop and elapsed seconds.
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Gradient Boosting training and evaluation took {elapsed_time:.2f} seconds")

XGBoost - Validation Accuracy: 0.56995
XGBoost - Validation Report:
               precision    recall  f1-score   support

           0       0.55      0.74      0.63     20000
           1       0.61      0.40      0.48     20000

    accuracy                           0.57     40000
   macro avg       0.58      0.57      0.56     40000
weighted avg       0.58      0.57      0.56     40000

XGBoost - Test Accuracy: 0.56855
XGBoost - Test Report:
               precision    recall  f1-score   support

           0       0.55      0.73      0.63     20000
           1       0.60      0.40      0.48     20000

    accuracy                           0.57     40000
   macro avg       0.58      0.57      0.56     40000
weighted avg       0.58      0.57      0.56     40000

Gradient Boosting training and evaluation took 0.48 seconds


In [15]:
from sklearn.naive_bayes import GaussianNB

# Timer start: the measured span covers fitting plus both evaluation passes below.
start_time = time.time()

# 6. Gaussian Naive Bayes - fitted on the training split with default settings.
# y_train / y_val / y_test are used in whatever form the preceding cells left them.
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

# Score the validation split.
gnb_val_predictions = gnb_classifier.predict(X_val)
print("Gaussian Naive Bayes - Validation Accuracy:", accuracy_score(y_val, gnb_val_predictions))
print("Gaussian Naive Bayes - Validation Report:\n", classification_report(y_val, gnb_val_predictions))

# Score the test split.
gnb_test_predictions = gnb_classifier.predict(X_test)
print("Gaussian Naive Bayes - Test Accuracy:", accuracy_score(y_test, gnb_test_predictions))
print("Gaussian Naive Bayes - Test Report:\n", classification_report(y_test, gnb_test_predictions))

# Timer stop and elapsed seconds.
end_time = time.time()
elapsed_time = end_time - start_time

# The model name is spelled "Naive" here, matching the other messages printed by this cell.
print(f"Gaussian Naive Bayes training and evaluation took {elapsed_time:.2f} seconds")

Gaussian Naive Bayes - Validation Accuracy: 0.49995
Gaussian Naive Bayes - Validation Report:
               precision    recall  f1-score   support

           0       0.25      0.00      0.00     20000
           1       0.50      1.00      0.67     20000

    accuracy                           0.50     40000
   macro avg       0.37      0.50      0.33     40000
weighted avg       0.37      0.50      0.33     40000

Gaussian Naive Bayes - Test Accuracy: 0.49995
Gaussian Naive Bayes - Test Report:
               precision    recall  f1-score   support

           0       0.33      0.00      0.00     20000
           1       0.50      1.00      0.67     20000

    accuracy                           0.50     40000
   macro avg       0.42      0.50      0.33     40000
weighted avg       0.42      0.50      0.33     40000

Gaussian Navie Bayes training and evaluation took 0.11 seconds
