In [9]:
import numpy as np
import tensorflow as tf
from sklearn.ensemble import IsolationForest

def vectorize_program(program):
    """Encode the text of a program as a 2-D one-hot matrix.

    The vocabulary is learned from ``program`` itself, so the encoding is
    only meaningful within a single call.

    Args:
        program: Source text to tokenize (Keras ``Tokenizer`` defaults).

    Returns:
        A 2-D NumPy array with one row per token and one column per
        vocabulary index (``len(word_index) + 1`` columns). Text that
        yields no tokens produces a single 1x1 zero matrix.
    """
    text_tools = tf.keras.preprocessing.text
    seq_tools = tf.keras.preprocessing.sequence

    # Learn the vocabulary from this text and map the text to token ids.
    tok = text_tools.Tokenizer()
    tok.fit_on_texts([program])
    token_ids = tok.texts_to_sequences([program])[0]

    # One extra column on top of the vocabulary size
    # (presumably for the reserved index 0 -- confirm against Keras docs).
    vocab_width = len(tok.word_index) + 1

    # Wrap the single sequence as a batch of one, then one-hot encode it.
    batch = seq_tools.pad_sequences([token_ids])
    one_hot = tf.one_hot(batch, depth=vocab_width)

    # The target length equals the token count (at least 1), so this pass
    # adds no padding for non-empty input; it hands back a NumPy array in
    # pad_sequences' default dtype.
    target_len = max(len(token_ids), 1)
    padded = seq_tools.pad_sequences(
        one_hot, maxlen=target_len, padding='post'
    )

    # Drop the leading batch axis: rows are tokens, columns are vocab slots.
    return np.reshape(padded, (-1, padded.shape[-1]))

# Read both programs from files.
# The encoding is pinned to UTF-8 (the default encoding of Python source
# files) so the result does not depend on the platform's locale settings.
with open('add1.py', 'r', encoding='utf-8') as file:
    program1 = file.read()
with open('kruskal1.py', 'r', encoding='utf-8') as file:
    program2 = file.read()

# Concatenate the programs so both are tokenized with one shared vocabulary
combined_program = program1 + '\n\n' + program2

# Vectorize the combined program (one one-hot row per token)
vectorized_program = vectorize_program(combined_program)

# Split the data into training and testing sets.
# NOTE: the split is positional, not shuffled: the first 80% of the rows
# train the model and the remaining rows (the tail of the combined text)
# are used for testing.
train_size = int(0.8 * vectorized_program.shape[0])
train_data = vectorized_program[:train_size]
test_data = vectorized_program[train_size:]

# With fewer than two rows the 80% split leaves nothing to train on; fail
# with an explicit message instead of an opaque error from inside fit().
if train_size == 0:
    raise ValueError(
        "Not enough tokens to build a training set: "
        f"got {vectorized_program.shape[0]} row(s)"
    )

# Fit the IsolationForest model on the training set
model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
model.fit(train_data)

# Use the trained model to predict anomalies in the test set
predictions = model.predict(test_data)

# Print the number of anomalies detected by the model.
# Rows equal to -1 are the ones counted as anomalies; count them in one
# vectorized pass and print a plain int.
anomaly_count = int(np.count_nonzero(predictions == -1))
print("Number of anomalies detected:", anomaly_count)


Number of anomalies detected: 0
