In [19]:
import numpy as np
import tensorflow as tf
from sklearn.ensemble import IsolationForest

# Define the vectorize_program function
def vectorize_program(program):
    """Encode a program's text as a 2-D one-hot token matrix.

    A new Keras ``Tokenizer`` is fitted on ``program`` alone, so the token
    ids (and therefore the column meaning) are specific to this one text and
    are not comparable between different programs.

    Args:
        program: The program's source text as a single string.

    Returns:
        A 2-D array whose last axis has ``len(word_index) + 1`` entries (the
        one-hot depth); every leading axis of the padded one-hot tensor is
        merged into the row axis.
    """
    preprocessing = tf.keras.preprocessing

    # Build a vocabulary from this program only and map the text to token ids.
    tok = preprocessing.text.Tokenizer()
    tok.fit_on_texts([program])
    token_ids = tok.texts_to_sequences([program])[0]

    # Wrap the ids in a batch of one and one-hot encode them.
    id_batch = preprocessing.sequence.pad_sequences([token_ids])
    vocab_depth = len(tok.word_index) + 1
    one_hot = tf.one_hot(id_batch, depth=vocab_depth)

    # Re-pad along the token axis (post-padding); the target length is the
    # token count, but never less than one.
    row_count = max(len(token_ids), 1)
    padded = preprocessing.sequence.pad_sequences(
        one_hot, maxlen=row_count, padding='post'
    )

    # Collapse the batch axis so the result is (rows, one-hot depth).
    feature_width = padded.shape[-1]
    return np.reshape(padded, (-1, feature_width))

# Define the list of filenames: code1.py ... code60.py, in numeric order.
filenames = [f'code{i}.py' for i in range(1, 61)]

# For each file: vectorize it, fit an IsolationForest on the first 80% of its
# rows, and report how many of the remaining rows are flagged as anomalies.
# NOTE: `model` is rebound on every iteration, so after the loop it refers
# only to the model fitted on the last file that was actually trained on.
for filename in filenames:
    # Read the program from the file. Decode explicitly as UTF-8 (the default
    # encoding of Python source) instead of the platform's locale encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        program = file.read()

    # Vectorize the program
    vectorized_program = vectorize_program(program)

    # Split the data into training and testing sets (first 80% / last 20%)
    train_size = int(0.8 * vectorized_program.shape[0])
    train_data = vectorized_program[:train_size]
    test_data = vectorized_program[train_size:]

    # With fewer than two rows the training slice is empty, and fitting an
    # estimator on zero samples raises; skip the file rather than abort the
    # whole run.
    if train_size == 0:
        print(f"Skipping {filename}: not enough rows to train on")
        continue

    # Fit the IsolationForest model on the training set
    model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
    model.fit(train_data)

    # Use the trained model to predict anomalies in the test set
    # (the code below treats a prediction of -1 as an anomaly).
    predictions = model.predict(test_data)

    # Print the number of anomalies detected by the model
    anomaly_count = int((predictions == -1).sum())
    print(f"Number of anomalies detected in {filename}: {anomaly_count}")


Number of anomalies detected in code1.py: 0
Number of anomalies detected in code2.py: 0
Number of anomalies detected in code3.py: 0
Number of anomalies detected in code4.py: 0
Number of anomalies detected in code5.py: 0
Number of anomalies detected in code6.py: 0
Number of anomalies detected in code7.py: 0
Number of anomalies detected in code8.py: 0
Number of anomalies detected in code9.py: 1
Number of anomalies detected in code10.py: 1
Number of anomalies detected in code11.py: 0
Number of anomalies detected in code12.py: 0
Number of anomalies detected in code13.py: 1
Number of anomalies detected in code14.py: 0
Number of anomalies detected in code15.py: 0
Number of anomalies detected in code16.py: 0
Number of anomalies detected in code17.py: 1
Number of anomalies detected in code18.py: 0
Number of anomalies detected in code19.py: 0
Number of anomalies detected in code20.py: 0
Number of anomalies detected in code21.py: 0
Number of anomalies detected in code22.py: 0
Number of anomalies

In [20]:
import os
import pickle


IF_pkl_filename = r'SavedModels/IsolationForest.pkl'
# open() does not create missing parent directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(IF_pkl_filename), exist_ok=True)
# NOTE(review): `model` is not defined in this cell; it must already exist in
# the kernel. The preceding cell rebinds it once per file, so this appears to
# save only the model fitted on the last file - confirm that is intended.
# The `with` block closes the file even if pickle.dump() raises.
with open(IF_pkl_filename, 'wb') as IF_Model_pkl:
    pickle.dump(model, IF_Model_pkl)
