<a href="https://colab.research.google.com/github/juhumkwon/Data/blob/main/02_%EC%9B%B9%EC%85%80(deeplearning_lstm).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ✅ 웹셸 탐지 딥러닝 예제 (LSTM 기반)
# 코랩에서 실행 가능
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# 1. Sample data: benign PHP snippets and PHP web-shell snippets
normal_php = [
    '<?php echo "Hello, world!"; ?>',
    '<?php include("menu.php"); echo $content; ?>',
    '<?php if ($_POST["id"] == "admin") { echo "hi"; } ?>',
]

webshell_php = [
    '<?php eval($_GET["cmd"]); ?>',
    '<?php system("ls"); ?>',
    '<?php echo base64_decode($_REQUEST["x"]); ?>',
    '<?php passthru($_GET["exec"]); ?>',
    '<?php eval(base64_decode($_POST["x"])); ?>',
]

# Benign samples come first, so the labels are a run of 0s followed by a run of 1s.
texts = [*normal_php, *webshell_php]
labels = [0 for _ in normal_php] + [1 for _ in webshell_php]  # 0 = benign, 1 = web shell

# 2. Fit a word-level tokenizer on the samples and turn each snippet into a
#    sequence of integer ids; '<OOV>' is the placeholder for unseen words.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=1000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# 3. Pad every sequence at the end ('post') up to the length of the longest sample
max_len = max(map(len, sequences))
padded = pad_sequences(sequences, padding='post', maxlen=max_len)

# 4. Define the LSTM classifier
model = Sequential([
    # Declare the fixed sequence length with an explicit Input layer; the
    # Embedding `input_length` argument is deprecated in Keras 3.
    tf.keras.Input(shape=(max_len,)),
    # Size the embedding table from the tokenizer's vocabulary cap so the two
    # settings cannot drift apart (every emitted id is below num_words).
    Embedding(input_dim=tokenizer.num_words, output_dim=32),
    LSTM(64),  # recurrent encoder over the token sequence
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),  # single probability for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Uncomment to inspect layer output shapes and parameter counts.
# model.summary()

# 5. Train on the whole sample set; verbose=0 suppresses the per-epoch log
X_train, y_train = np.array(padded), np.array(labels)
model.fit(x=X_train, y=y_train, epochs=100, verbose=0)

# 6. Example snippets to classify (the intended class is noted on each line)
test_codes = [
    '<?php echo "관리자 페이지입니다."; ?>',  # benign
    '<?php eval(base64_decode($_GET["cmd"])); ?>',  # web shell
    '<?php echo shell_exec($_POST["cmd"]); ?>',  # web shell
    '<?php include("footer.php"); ?>',  # benign
]

# Reuse the tokenizer fitted on the training samples and the same max_len so
# the test inputs are encoded and padded exactly like the training inputs.
test_seq = tokenizer.texts_to_sequences(test_codes)
test_pad = pad_sequences(test_seq, padding='post', maxlen=max_len)
pred = model.predict(test_pad)

# 7. Print the predicted class for each test snippet
# The model ends in Dense(1), so each prediction row holds a single score.
# Flatten the array so every score is a plain scalar instead of relying on the
# truth value of a one-element array, and pair scores with snippets via zip.
for code, score in zip(test_codes, pred.ravel()):
    # Scores above the 0.5 threshold are reported as "웹셸" (web shell),
    # everything else as "정상" (benign).
    label = "웹셸" if score > 0.5 else "정상"
    # Show at most the first 60 characters of the snippet.
    print(f"[{label}] → {code.strip()[:60]}")


1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 217ms/step
[정상] → <?php echo "관리자 페이지입니다."; ?>
[웹셸] → <?php eval(base64_decode($_GET["cmd"])); ?>
[정상] → <?php echo shell_exec($_POST["cmd"]); ?>
[정상] → <?php include("footer.php"); ?>
