<a href="https://colab.research.google.com/github/juhumkwon/Data/blob/main/04_%EC%9B%B9%EC%85%80(self_attention).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import tensorflow as tf
from tensorflow.keras.layers import Layer, Input, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# 1. Sample data
# Benign PHP snippets (label 0).
normal_php = [
    '<?php echo "Hello, world!"; ?>',
    '<?php include("menu.php"); echo $content; ?>',
    '<?php if ($_POST["id"] == "admin") { echo "hi"; } ?>',
]

# Web-shell style PHP snippets (label 1): they execute or decode
# attacker-controlled request parameters.
webshell_php = [
    '<?php eval($_GET["cmd"]); ?>',
    '<?php system("ls"); ?>',
    '<?php echo base64_decode($_REQUEST["x"]); ?>',
    '<?php passthru($_GET["exec"]); ?>',
    '<?php eval(base64_decode($_POST["x"])); ?>',
]

# Benign samples first, then web shells; `labels` follows the same order.
texts = [*normal_php, *webshell_php]
labels = [0 for _ in normal_php] + [1 for _ in webshell_php]

# 2. Tokenization and padding
# Word-level tokenizer limited to 1000 ids; words not seen during fitting
# are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad every sequence at the end ('post') up to the longest training sample.
max_len = max(map(len, sequences))
padded = pad_sequences(sequences, maxlen=max_len, padding='post')

# Training arrays consumed by model.fit.
X_train = np.array(padded)
y_train = np.array(labels)

# 3. Scaled dot-product self-attention layer
class SelfAttention(Layer):
    """Single-head scaled dot-product self-attention with mean pooling.

    The layer projects its input into query/key/value spaces of width
    ``d_model``, applies ``softmax(Q K^T / sqrt(d_model)) V`` and then
    averages over the sequence axis, producing one vector per sample.

    Args:
        d_model: Width of the query/key/value projections and of the output.
        **kwargs: Forwarded to ``keras.layers.Layer``.

    Input shape:
        ``(batch, seq_len, embedding_dim)``.

    Output shape:
        ``(batch, d_model)``.

    Masking:
        If a boolean mask of shape ``(batch, seq_len)`` reaches the layer
        (for example from ``Embedding(mask_zero=True)``), masked positions
        are excluded both as attention keys and from the pooled mean.
        Without a mask every position is treated as valid.
    """

    def __init__(self, d_model, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        # Declare that an incoming mask is consumed here on purpose.
        self.supports_masking = True

    def build(self, input_shape):
        feature_dim = input_shape[-1]

        def projection(name):
            # One (embedding_dim, d_model) projection matrix.
            return self.add_weight(
                shape=(feature_dim, self.d_model),
                initializer='glorot_uniform',
                trainable=True,
                name=name,
            )

        self.Wq = projection("Wq")
        self.Wk = projection("Wk")
        self.Wv = projection("Wv")

    def call(self, inputs, mask=None):
        # inputs: (batch, seq_len, embedding_dim)
        Q = tf.matmul(inputs, self.Wq)  # (batch, seq_len, d_model)
        K = tf.matmul(inputs, self.Wk)  # (batch, seq_len, d_model)
        V = tf.matmul(inputs, self.Wv)  # (batch, seq_len, d_model)

        # Scaled dot-product attention. The scale uses the dtype of the
        # logits so the division also works for non-float32 compute dtypes.
        matmul_qk = tf.matmul(Q, K, transpose_b=True)  # (batch, seq_len, seq_len)
        dk = tf.cast(tf.shape(K)[-1], matmul_qk.dtype)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        valid = None
        if mask is not None:
            valid = tf.cast(mask, scaled_attention_logits.dtype)  # (batch, seq_len)
            # Large negative bias on masked key positions so that softmax
            # assigns them (numerically) zero weight.
            scaled_attention_logits += (1.0 - valid[:, tf.newaxis, :]) * -1e9

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (batch, seq_len, seq_len)
        output = tf.matmul(attention_weights, V)  # (batch, seq_len, d_model)

        # Compress to a sentence vector via mean pooling.
        if valid is None:
            return tf.reduce_mean(output, axis=1)  # (batch, d_model)

        # Masked mean: average over valid positions only. The count is
        # clamped to 1 so a fully masked sample yields zeros, not NaN.
        weights = valid[:, :, tf.newaxis]  # (batch, seq_len, 1)
        summed = tf.reduce_sum(output * weights, axis=1)
        count = tf.maximum(tf.reduce_sum(weights, axis=1), 1.0)
        return summed / count  # (batch, d_model)

    def compute_mask(self, inputs, mask=None):
        # The sequence axis is pooled away, so no mask applies downstream.
        return None

    def get_config(self):
        # Serialize d_model so a saved model can rebuild this layer.
        config = super().get_config()
        config.update({"d_model": self.d_model})
        return config

# 4. Model definition
vocab_size = 1000  # same limit as the tokenizer's num_words
embedding_dim = 64
d_model = 64

inputs = Input(shape=(max_len,))
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input layer, and Keras 3 deprecates that argument.
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
x = SelfAttention(d_model=d_model)(x)  # (batch, d_model) sentence vector
x = Dense(32, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)  # probability of "web shell"

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.summary()

# 5. Training
model.fit(X_train, y_train, epochs=200, verbose=0)

# 6. Test on unseen snippets
test_codes = [
    '<?php echo "관리자 페이지입니다."; ?>',         # benign
    '<?php eval(base64_decode($_GET["cmd"])); ?>',   # web shell
    '<?php echo shell_exec($_POST["cmd"]); ?>',      # web shell
    '<?php include("footer.php"); ?>',               # benign
]

# Reuse the fitted tokenizer and the training-time sequence length.
test_seq = tokenizer.texts_to_sequences(test_codes)
test_pad = pad_sequences(test_seq, maxlen=max_len, padding='post')
pred = model.predict(test_pad)

# Threshold the sigmoid outputs at 0.5: above means "web shell".
is_webshell = pred > 0.5

for i, code in enumerate(test_codes):
    if is_webshell[i]:
        label = "웹셸"
    else:
        label = "정상"
    print(f"[{label}] → {code.strip()[:60]}")

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 76ms/step
[정상] → <?php echo "관리자 페이지입니다."; ?>
[웹셸] → <?php eval(base64_decode($_GET["cmd"])); ?>
[웹셸] → <?php echo shell_exec($_POST["cmd"]); ?>
[정상] → <?php include("footer.php"); ?>
