In [1]:
import pandas as pd
import numpy as np
import spacy as sp
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.language import Language
from keras.preprocessing.text import Tokenizer

In [2]:
# Load the invocation/command pairs. With orient='index' every top-level JSON
# key becomes one row of the frame.
data = pd.read_json("CS663_Project_Data.json", orient='index')

In [3]:
# Work on an independent (deep) copy so the freshly loaded frame stays untouched.
df = data.copy(deep=True)
df

Unnamed: 0,invocation,cmd
1,"Copy loadable kernel module ""mymodule.ko"" to t...",sudo cp mymodule.ko /lib/modules/$(uname -r)/k...
2,"Display all lines containing ""IP_MROUTE"" in th...",cat /boot/config-`uname -r` | grep IP_MROUTE
3,Display current running kernel's compile-time ...,cat /boot/config-`uname -r`
4,"Find all loadable modules for current kernel, ...",find /lib/modules/`uname -r` -regex .*perf.*
5,"Look for any instance of ""HIGHMEM"" in the curr...",grep “HIGHMEM” /boot/config-`uname -r`
...,...,...
10343,using exec in find command to dispaly the sear...,find . ... -exec cat {} \; -exec echo \;
10344,verbosely create intermediate directoriy tmp a...,mkdir -pv /tmp/boostinst
10345,view the manual page of find,man find
10346,"wait 2 seconds and then print ""hello""","echo ""hello `sleep 2 &`"""


In [4]:
from spacy.tokens import Doc


# Only lexical token flags (is_stop, is_punct, like_num, ...) are read by the
# cleaning component below, so the dependency parser and the entity recogniser
# are switched off: tokenisation is unaffected and each call gets much cheaper.
nlp = sp.load('en_core_web_sm', disable=['parser', 'ner'])

@Language.component('clean_inv')
def clean_inv(doc):
    """Pipeline component that rebuilds ``doc`` from its informative tokens only.

    Punctuation, whitespace, digits, number-like tokens, currency symbols and
    stop words are dropped; every surviving token is lower-cased.

    Args:
        doc: The Doc handed over by the preceding pipeline step.

    Returns:
        A new Doc that shares ``doc.vocab`` and holds the kept, lower-cased words.
    """
    def _is_noise(tok):
        # True for every token category that must be discarded.
        return (tok.is_punct or tok.is_space or tok.is_digit or
                tok.like_num or tok.is_currency or tok.is_stop)

    kept_words = [tok.lower_ for tok in doc if not _is_noise(tok)]
    return Doc(doc.vocab, words=kept_words)

# Register the cleaning step as the final pipeline stage (the component name
# defaults to the factory name, 'clean_inv').
nlp.add_pipe('clean_inv', last=True)

def tokenize_inv(inv):
    """Run ``inv`` through the ``nlp`` pipeline and return the token texts as a list."""
    return [tok.text for tok in nlp(inv)]


In [5]:
# Tokenise every invocation and keep the resulting word lists next to the raw text.
df['inv_tokens'] = df['invocation'].map(tokenize_inv)
df

Unnamed: 0,invocation,cmd,inv_tokens
1,"Copy loadable kernel module ""mymodule.ko"" to t...",sudo cp mymodule.ko /lib/modules/$(uname -r)/k...,"[copy, loadable, kernel, module, mymodule.ko, ..."
2,"Display all lines containing ""IP_MROUTE"" in th...",cat /boot/config-`uname -r` | grep IP_MROUTE,"[display, lines, containing, ip_mroute, curren..."
3,Display current running kernel's compile-time ...,cat /boot/config-`uname -r`,"[display, current, running, kernel, compile, t..."
4,"Find all loadable modules for current kernel, ...",find /lib/modules/`uname -r` -regex .*perf.*,"[find, loadable, modules, current, kernel, inc..."
5,"Look for any instance of ""HIGHMEM"" in the curr...",grep “HIGHMEM” /boot/config-`uname -r`,"[look, instance, highmem, current, kernel, com..."
...,...,...,...
10343,using exec in find command to dispaly the sear...,find . ... -exec cat {} \; -exec echo \;,"[exec, find, command, dispaly, searched, files]"
10344,verbosely create intermediate directoriy tmp a...,mkdir -pv /tmp/boostinst,"[verbosely, create, intermediate, directoriy, ..."
10345,view the manual page of find,man find,"[view, manual, page, find]"
10346,"wait 2 seconds and then print ""hello""","echo ""hello `sleep 2 &`""","[wait, seconds, print, hello]"


In [6]:
# Upper bound on the vocabulary kept by each tokenizer (most frequent words win).
max_words = 10000

# --- Encoder side: invocations are already lists of tokens -------------------
invocations = df['inv_tokens'].values
inv_tokenizer = Tokenizer(num_words=max_words)
inv_tokenizer.fit_on_texts(invocations)
inv_sequences = inv_tokenizer.texts_to_sequences(invocations)

# --- Decoder side: raw command strings wrapped in start/end markers ----------
# NOTE(review): the default Tokenizer filters remove characters such as
# '-', '/', '.', '|' and lower-case the text, so flags and paths in the
# commands are lost (see the "True command" lines printed at the end of the
# notebook). Consider filters='\t\n', lower=False once the one-hot target
# tensor can afford the larger vocabulary.
commands = [" ".join(("startseq", cmd, "endseq")) for cmd in df['cmd'].values]
cmd_tokenizer = Tokenizer(num_words=max_words)
cmd_tokenizer.fit_on_texts(commands)
cmd_sequences = cmd_tokenizer.texts_to_sequences(commands)

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The padded width on each side is the length of the longest encoded sequence.
# (The previous version concatenated each length list with itself before
# taking the maximum, which was redundant.)
max_input_length = max(len(seq) for seq in inv_sequences)
max_output_length = max(len(seq) for seq in cmd_sequences)

# Zeros are appended after the real tokens ('post' padding).
inv_padded = pad_sequences(inv_sequences, maxlen=max_input_length, padding='post')
cmd_padded = pad_sequences(cmd_sequences, maxlen=max_output_length, padding='post')

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the pairs for evaluation. A fixed random_state keeps the
# split identical across kernel restarts so results are reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    inv_padded, cmd_padded, test_size=0.1, random_state=42
)

In [9]:
# Index 0 is reserved for padding, hence the +1. The tokenizers were built with
# num_words=max_words and therefore never emit an index >= max_words, so the
# layer sizes are capped at max_words instead of growing with rare words that
# can never appear in the sequences.
input_vocab_size = min(len(inv_tokenizer.word_index) + 1, max_words)
output_vocab_size = min(len(cmd_tokenizer.word_index) + 1, max_words)
latent_dim = 128  # Adjust this to the desired size of LSTM/GRU hidden state

In [32]:
# Display the encoder-side vocabulary size computed above.
input_vocab_size

5712

In [11]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Model

embedding_dim = 128  # Adjust this to the desired size of the embeddings

# Encoder input: a batch of padded invocation id sequences of any length.
encoder_inputs = Input(shape=(None,))
# mask_zero=True marks the post-padding zeros as "no token", so the LSTM skips
# them and its final state reflects the last real word rather than a long run
# of padding steps.
encoder_embedding = Embedding(
    input_vocab_size, embedding_dim, mask_zero=True
)(encoder_inputs)

# Only the final hidden/cell states are needed; the per-step outputs are discarded.
encoder_lstm = LSTM(latent_dim, return_state=True)
_, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)

# Save the encoder states for use as the decoder's initial state.
encoder_states = [encoder_state_h, encoder_state_c]

In [None]:
# Decoder input: the command id sequence (teacher forcing during training).
decoder_inputs = Input(shape=(None,))
# mask_zero=True propagates a padding mask through the LSTM and Dense layers so
# that padded timesteps do not contribute to the loss or the accuracy metric.
decoder_embedding = Embedding(
    output_vocab_size, embedding_dim, mask_zero=True
)(decoder_inputs)

# The decoder starts from the encoder's final states and emits one output per step.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Project every step onto a probability distribution over the command vocabulary.
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (invocation ids, command ids) -> per-step token distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [12]:
# (Re)build the end-to-end training model from the encoder/decoder tensors and
# compile it for one-hot targets.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# One-hot targets for teacher forcing: the target at step t is the decoder
# input token at step t + 1, i.e. the sequence shifted left by one so the
# leading start token is never a target. The final timestep has no successor
# and stays all-zero.
decoder_target_data = np.zeros(
    (len(train_y), max_output_length, output_vocab_size), dtype='float32'
)
next_tokens = train_y[:, 1:]
sample_idx = np.arange(len(train_y))[:, None]
step_idx = np.arange(next_tokens.shape[1])[None, :]
decoder_target_data[sample_idx, step_idx, next_tokens] = 1.0

In [17]:
# Training hyper-parameters
batch_size = 8
epochs = 10
validation_split = 0.2  # fraction of the training pairs held back for validation

fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
)

# Teacher forcing: the decoder is fed the full command sequence (train_y) and
# is trained against the shifted one-hot targets.
history = model.fit(x=[train_x, train_y], y=decoder_target_data, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Encoder model for inference: invocation ids -> [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# At inference time the decoder states are fed in explicitly so that decoding
# can proceed one token at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# The trained decoder layers are reused. The result is bound to a separate
# name so the training graph's `decoder_outputs` is not overwritten; otherwise
# re-running the training-model cell would fail with a disconnected graph.
decoder_inference_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_inference_outputs = decoder_dense(decoder_inference_outputs)

# Decoder model: (token ids, h, c) -> (token distribution, new h, new c).
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_inference_outputs] + decoder_states
)


In [30]:
def predict(input_seq):
    """Greedily decode a command for one encoded invocation.

    Args:
        input_seq: Padded invocation id sequence accepted by ``encoder_model``
            (presumably shape ``(1, timesteps)`` - confirm against the caller).

    Returns:
        The predicted command tokens joined by single spaces. The ``endseq``
        marker is never part of the result, and at most ``max_output_length``
        tokens are produced.
    """
    # Encode the input sequence; list() tolerates predict() returning a tuple.
    states = list(encoder_model.predict(input_seq, verbose=0))

    # Decoding starts from the start-of-sequence marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = cmd_tokenizer.word_index['startseq']

    decoded_sentence = []
    while len(decoded_sentence) < max_output_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states, verbose=0)

        # Sample the most probable token.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is the padding id and has no entry in index_word; treat it
        # (and any other unknown id) as end of sequence instead of raising.
        sampled_word = cmd_tokenizer.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == 'endseq':
            break
        decoded_sentence.append(sampled_word)

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states = [h, c]

    # No trailing token is stripped: the end marker is never appended, so a
    # sequence cut off by the length cap keeps all of its real tokens.
    return ' '.join(decoded_sentence)


In [31]:
def _ids_to_text(tokenizer, ids):
    """Translate the non-zero token ids back to words and join them with spaces."""
    return ' '.join(tokenizer.index_word[token_id] for token_id in ids if token_id != 0)


# Compare prediction and reference for the first held-out examples.
for i in range(10):  # Change this to the number of examples you want to test
    input_seq = test_x[i:i + 1]
    decoded_sentence = predict(input_seq)
    print('Input:', _ids_to_text(inv_tokenizer, input_seq[0]))
    print('Predicted command:', decoded_sentence)
    print('True command:', _ids_to_text(cmd_tokenizer, test_y[i]))
    print('\n')

Input: search files modified months days current folder display total disk usage mb
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq find tmp type f mtime 0 exec du ks cut f1 awk ' total total 1 end print total 1024 ' endseq


Input: delete files current folder
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq find delete endseq




Input: read line standard input argument interactive shell prompt >
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq read e p ' ' 1 endseq


Input: find .log files current directory contain string exception
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq find name ' log' mtime 2 exec grep hc exception grep v 0 endseq




Input: replace newlines testing\r_tested display named characters
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq echo e testing r tested awk v rs ' print 0 ' od a endseq


Input: find files named new current directory tree display contents
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq find name new print exec cat endseq




Input: generates randomly sorted list numbers
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq seq 1 10 sort r tee tmp lst cat cat tmp lst echo ' ' tac endseq


Input: read lookup requests text file 1.txt uses fetch txt records
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq dig txt f 1 txt endseq




Input: run rm process recursively remove cache making immune sighup signals sent
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq nohup rm rf cache endseq


Input: recursively copies files current directory ones names match pattern dirtoexclude|targetdir targetdir directory printing info message operation
Predicted command: find name ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '
True command: startseq cp rv ls a grep ve dirtoexclude targetdir targetdir endseq


