In [None]:
import os
import gzip
import shutil
from pathlib import Path

# Global variables - directories where the cfg.json and .dot files generated by our code will be stored
part1_output_directory = "output/part_1/"
part2_output_directory = "output/part_2/"

# Utility functions taken from TP1
def get_json_files(extension, directory):
    """Recursively list the files under ``directory`` whose name matches a glob pattern.

    Args:
        extension: Glob pattern handed to ``Path.rglob`` (e.g. ``'*.json'``).
        directory: Root directory to search (string or path-like).

    Returns:
        list[str]: Paths of the matching regular files, as strings, in sorted order.
    """
    root = Path(directory)
    # rglob yields entries in an OS-dependent order and also matches directories
    # whose name fits the pattern; keep regular files only and sort the result so
    # that repeated runs report files in the same order.
    return sorted(str(path) for path in root.rglob(extension) if path.is_file())

def create_output_file(filename, directory):
    """Create a fresh, empty output file and return it opened for appending.

    The directory is created when it is missing, and any existing file with the
    same name is deleted first so that each run starts from an empty file.

    Args:
        filename: Name of the file to create inside ``directory``.
        directory: Directory that will contain the file; a trailing path
            separator is optional.

    Returns:
        A text file object opened in append mode. The caller is responsible for
        closing it.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(directory, exist_ok=True)

    # Remove the output of a previous run, if any
    file_path = os.path.join(directory, filename)
    if os.path.exists(file_path):
        os.remove(file_path)

    # Open the joined path (not `directory + filename`) so the file that is opened
    # is the same one that was checked above, even when `directory` has no trailing
    # separator. "a" mode lets successive writes accumulate.
    return open(file_path, "a")

def close_output_file(file):
    """Close the given file object."""
    file.close()

# The following function has been adapted from the following sources:
## https://www.tutorialspoint.com/How-to-scan-through-a-directory-recursively-in-Python
## https://bito.ai/resources/unzip-gz-file-python-python-explained/
## https://stackoverflow.com/questions/42445831/python-3-creating-files-in-relative-directories
def extract_zipped_files(source_directory, target_directory):
    """Decompress every ``.gz`` file found under ``source_directory`` into ``target_directory``.

    The source tree is walked recursively and its sub-directory layout is
    reproduced below the target directory. Each extracted file keeps the name of
    its archive minus the trailing ``.gz``. Files without that suffix are ignored.
    """
    # The root of the output tree must exist before anything is written
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    for walk_dir, _subdirs, entries in os.walk(source_directory):
        # Location below the target root that mirrors the directory being walked
        mirrored_dir = os.path.join(
            target_directory, os.path.relpath(walk_dir, source_directory)
        )

        for entry in entries:
            # Anything that is not a gzip archive is left alone
            if not entry.endswith('.gz'):
                continue

            archive_path = os.path.join(walk_dir, entry)
            extracted_path = os.path.join(mirrored_dir, entry[:-3])

            # Sub-directories are created lazily, only once an archive is found in them
            if not os.path.exists(mirrored_dir):
                os.makedirs(mirrored_dir)

            # Stream the decompressed bytes straight into the output file
            with gzip.open(archive_path, 'rb') as compressed, open(extracted_path, 'wb') as extracted:
                shutil.copyfileobj(compressed, extracted)


In [None]:
import numpy

def __compute_manhatan_distance(vector_a, vector_b):
    """Return the Manhattan (L1) distance between two vectors.

    Args:
        vector_a: First vector (numpy array or any array-like of numbers).
        vector_b: Second vector, broadcast-compatible with ``vector_a``.

    Returns:
        The sum of the absolute element-wise differences.
    """
    arrays = []
    for vector in (vector_a, vector_b):
        # asarray lets plain lists/tuples through; numpy arrays are passed as-is
        array = numpy.asarray(vector)
        # Unsigned integers wrap around on subtraction (e.g. 2 - 5 == 253 for uint8),
        # which would silently inflate the distance; widen them to a signed type first.
        if array.dtype.kind == "u":
            array = array.astype(numpy.int64)
        arrays.append(array)

    # Vectorised |b - a| summed over all components
    return numpy.abs(arrays[1] - arrays[0]).sum()

def is_similar_vector(vector_a, vector_b, similarity_treshold):
    """Tell whether two vectors are similar under a relative Manhattan-distance threshold.

    The distance between the vectors is compared against
    ``similarity_treshold * sum(vector_a)``. Only ``vector_a`` is used to scale
    the threshold, so swapping the two arguments can change the result.

    Args:
        vector_a: Reference vector (array-like of numbers).
        vector_b: Vector compared against ``vector_a``.
        similarity_treshold: Fraction of ``sum(vector_a)`` the distance may reach.

    Returns:
        A truthy value when the distance is within the threshold, falsy otherwise.
    """
    return __compute_manhatan_distance(vector_a, vector_b) <= similarity_treshold * numpy.sum(vector_a)

In [None]:
# Prepare the shared AST reader; find_similar_files uses it to load each JSON AST file
from code_analysis import ASTReader
reader = ASTReader ()

def find_similar_files(directory, min_nodes, similarity_treshold, enable_identical_skip=False):
    """Print every pair of AST files under ``directory`` whose vectors are similar.

    Each ``*.json`` file found recursively under ``directory`` is loaded with the
    module-level ``reader``. ASTs with more than ``min_nodes`` nodes are vectorized,
    then every unordered pair of vectors is tested with ``is_similar_vector`` and
    matching pairs are printed to standard output.

    Args:
        directory: Root directory containing the extracted AST files.
        min_nodes: An AST is considered only if it has strictly more nodes than this.
        similarity_treshold: Threshold forwarded to ``is_similar_vector``.
        enable_identical_skip: NOTE: the behaviour is the opposite of what the name
            suggests and is kept for backward compatibility. When ``False``
            (default), a pair with identical vectors located in the same kit is
            skipped. When ``True``, every pair is compared.

    Returns:
        None. Results are reported with ``print``.
    """
    root_directory = Path(directory)

    # Parallel lists: the same index refers to the same file in all of them. They are
    # filled together and never modified afterwards. (Lists were preferred over a
    # dict, which seemed slower with ~900 entries.)
    filename_array = []
    vector_array = []
    kit_array = []

    # Visit every AST once and keep those that are large enough
    for filename in get_json_files('*.json', directory):
        ast = reader.read_ast(filename)

        if len(ast.get_node_ids()) > min_nodes:
            filename_array.append(filename)
            vector_array.append(ast.vectorize())
            # A file's kit is taken to be its first path component below `directory`.
            # Deriving it from the path relative to the root (instead of a fixed
            # split('/')[:3]) works for any root depth and any path separator.
            # NOTE(review): assumes each kit is a top-level sub-directory of
            # `directory` - confirm against the dataset layout.
            kit_array.append(Path(filename).relative_to(root_directory).parts[0])

    # Compare every unordered pair of vectors
    for v_a in range(len(vector_array)):
        for v_b in range(v_a + 1, len(vector_array)):

            if not enable_identical_skip:
                # array_equal returns a single bool for lists and numpy arrays alike;
                # `==` on numpy arrays is element-wise and cannot be used in a condition.
                if kit_array[v_a] == kit_array[v_b] and numpy.array_equal(vector_array[v_a], vector_array[v_b]):
                    continue

            if is_similar_vector(vector_array[v_a], vector_array[v_b], similarity_treshold):
                print(f"File {filename_array[v_a]} is similar to {filename_array[v_b]}")
            
# Part 1 driver: decompress every gzipped AST found under ./ast into ./output/extracted_ast,
# then report the similar pairs among the extracted ASTs having more than 100 nodes,
# using a similarity threshold of 0.3.
source_dir_part_1 = './ast'
target_dir_part_1 = './output/extracted_ast'
extract_zipped_files(source_dir_part_1, target_dir_part_1)
find_similar_files(target_dir_part_1, 100, 0.3)
# Alternative run that compares every pair, including identical vectors within the same kit:
# find_similar_files(target_dir_part_1, 100, 0.3, enable_identical_skip=True)