In [89]:
import tensorflow as tf
import numpy as np
from sklearn.ensemble import IsolationForest

In [90]:
# Load the source text of the program under analysis into one string.
# The encoding is pinned to UTF-8 (the default encoding of Python source
# files) so the text read does not depend on the platform's locale settings.
with open('add.py', 'r', encoding='utf-8') as file:
    program = file.read()

In [91]:
# Create a Keras text tokenizer with all-default settings.
# NOTE(review): the Keras defaults are understood to lowercase the text and
# filter out punctuation; for source code that would drop operators and
# brackets — confirm this is the intended tokenization.
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [92]:
# Fit the tokenizer on a one-document corpus: the program text itself.
# The resulting vocabulary is read later through `tokenizer.word_index`.
tokenizer.fit_on_texts([program])

In [93]:
# Encode the program as token ids. One text goes in, so exactly one
# sequence comes back; unpack that single element.
[sequence] = tokenizer.texts_to_sequences([program])

In [94]:
# Wrap the single sequence in a batch and run it through pad_sequences.
# NOTE(review): with only one sequence there is nothing to pad against, so
# this presumably just yields a 2-D (1, seq_len) array — confirm.
padded_sequence = tf.keras.preprocessing.sequence.pad_sequences([sequence])

In [95]:
# One-hot encode every token id. The depth is the vocabulary size plus one;
# presumably the extra slot covers id 0, which the tokenizer does not assign
# to any word — TODO confirm.
vectorized_program = tf.one_hot(
    indices=padded_sequence,
    depth=1 + len(tokenizer.word_index),
)

In [96]:
# Inspect the one-hot tensor; the recorded output reports
# shape=(1, 47, 22) and dtype=float32.
print(vectorized_program)

tf.Tensor(
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]]], shape=(1, 47, 22), dtype=float32)


In [97]:
# Flatten the leading dimensions so each row is one token's one-hot vector;
# the last dimension is kept as the column count. The tensor is converted to
# a NumPy array first (requires eager execution).
vectorized_program_2d = vectorized_program.numpy().reshape(
    -1, vectorized_program.shape[-1]
)

In [98]:
# Inspect the flattened 2-D array (one row per token position).
print(vectorized_program_2d)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]]


In [99]:
# Sequential 80/20 split: the first 80% of rows (rounded down) become the
# training set and the remaining rows the test set. No shuffling is applied.
train_size = int(0.8 * len(vectorized_program_2d))
train_data, test_data = np.split(vectorized_program_2d, [train_size])

In [100]:
# Report the (rows, columns) of each split, one shape per line.
print(train_data.shape, test_data.shape, sep="\n")

(37, 22)
(10, 22)


In [101]:
# Configure the isolation forest: 100 trees, an assumed outlier share of 1%,
# and a fixed seed so repeated runs build the same forest.
model = IsolationForest(
    random_state=42,
    contamination=0.01,
    n_estimators=100,
)
# Fit on the training rows only; the test rows are held back for prediction.
model.fit(train_data)

In [102]:
# Label each held-out row with the fitted model; entries equal to -1 are
# the ones counted as anomalies in the following cell.
predictions = model.predict(test_data)

In [103]:
# Count the test rows labelled -1 (anomalous) and report the total.
print("Number of anomalies detected:", np.count_nonzero(predictions == -1))

Number of anomalies detected: 3
