In [14]:
import os
import glob
# Function to read .bc files
def read_bc_files(directory, file_index):
    """Print and return the text of the `file_index`-th .bc file under `directory`.

    Only the immediate subdirectories of `directory` are searched; files that
    sit directly in `directory` or deeper than one level are not considered.
    Subdirectories, and the matches inside each one, are visited in sorted
    order so that a given `file_index` always designates the same file
    (`os.listdir` and `glob.glob` make no ordering promise by themselves).

    Args:
        directory: Folder whose subdirectories contain the .bc files.
        file_index: 1-based position of the file to read.

    Returns:
        The file content as a string, or, when `file_index` is out of range,
        a message string naming the valid range.
    """
    # Collect the immediate subdirectories in a stable order.
    subdirs = sorted(
        path
        for path in (os.path.join(directory, entry) for entry in os.listdir(directory))
        if os.path.isdir(path)
    )

    # Gather the .bc files of every subdirectory, again in a stable order.
    bc_files = []
    for subdir in subdirs:
        bc_files.extend(sorted(glob.glob(os.path.join(subdir, '*.bc'))))

    # Out-of-range indices are reported through the return value, not an exception.
    if not 1 <= file_index <= len(bc_files):
        return "Invalid file index. Please choose a number between 1 and {}.".format(len(bc_files))

    bc_file = bc_files[file_index - 1]  # file_index is 1-based
    with open(bc_file, 'r') as file:
        content = file.read()

    print(f"Contents of {bc_file}:")
    print(content)
    print("\n")
    return content


In [None]:
# Example usage: print and capture the first .bc file found under the CFG data folder.
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
base_directory = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR-1/processed_data/CFG'
file_number = 1  # 1-based index: valid values run from 1 to the total number of .bc files
ir_code = read_bc_files(base_directory, file_number)
# read_bc_files already prints the content on success, so this shows it a second time
# (or shows the "Invalid file index" message when the index is out of range).
print(ir_code)

In [1]:
import os
import openai
import json
# Retrieve the OpenAI API key from the environment variable
#api_key = os.getenv('OPENAI_API_KEY')


In [19]:
# Smoke test: send a single fixed user message to gpt-4o and keep the raw response object.
chat_completion = openai.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-4o",
)

In [20]:
# Display the response object returned by the smoke-test request.
chat_completion 

ChatCompletion(id='chatcmpl-9n3tPaY4qmGh8sej9eWhfbwDp8BaM', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='This is a test.', role='assistant', function_call=None, tool_calls=None))], created=1721480063, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_400f27fa1f', usage=CompletionUsage(completion_tokens=5, prompt_tokens=12, total_tokens=17))

In [23]:
def generate_cfg_decompile(file_content, output_file_path, model="gpt-4o", client=None):
    """Ask a chat model to decompile LLVM IR into C and save the C code to disk.

    The full reply is printed and returned unchanged. Because the model
    typically wraps the code in a Markdown fence with prose around it, only
    the body of the first fenced block is written to `output_file_path`; when
    the reply contains no fence, the whole reply is written.

    Args:
        file_content: Text of the LLVM IR to decompile; inserted into the prompt.
        output_file_path: Path of the .c file to create or overwrite.
        model: Chat model name passed to the API.
        client: Object exposing `chat.completions.create(...)`; defaults to
            the module-level `openai` package.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    import re  # local import: keeps this cell independent of the notebook's import cells

    # NOTE(review): step 2 reads "either C format"; the wording is left as-is so that
    # prompts stay comparable with earlier runs.
    prompt = f"""
    You are an expert in high-performance computation programs. I will provide you with an LLVM IR (Intermediate Representation) `.bc` code file, which has been compiled from source code using LLVM.

    You will follow the instructions below and output a file for me:
    1. Decompile the LLVM IR code into equivalent C code.
    2. Generate the source code file in either C format.

The input file content is as follows:

    ```
    {file_content}
    ```
    

    """
    if client is None:
        client = openai

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        max_tokens=4096,
        temperature=0.7,
    )
    # The message content can be None; treat that as an empty reply.
    analysis = (response.choices[0].message.content or "").strip()
    print(analysis)

    # Keep only the first fenced block; an unterminated fence (reply cut off at
    # max_tokens) is accepted and read up to the end of the reply.
    fenced = re.search(r"```[^\n]*\n(.*?)(?:```|\Z)", analysis, re.DOTALL)
    c_source = fenced.group(1).rstrip() + "\n" if fenced else analysis

    # Save to a .c file
    with open(output_file_path, 'w') as c_file:
        c_file.write(c_source)

    return analysis


In [27]:
# Destination of the .c file written by generate_cfg_decompile.
# NOTE(review): absolute, machine-specific path.
output_file_path = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR-1/processed_data/CFG_LLM/counting_sort2/countingsort2.c'

In [24]:
# Specify the path to the .bc file
# NOTE(review): absolute, machine-specific path.
file_path = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR-1/processed_data/CFG_LLM/counting_sort2/test_countingsort2.bc'

# Read the content of the file into a variable
# NOTE(review): any failure is only printed, so `content_file` is left undefined (or keeps
# a value from an earlier run) and the problem only surfaces where it is used later.
try:
    with open(file_path, 'r') as file:
        content_file = file.read()
        print("File read successfully.")
except Exception as e:
    print(f"Failed to read file: {e}")

# On success, content_file now holds the text of the .bc file


File read successfully.


In [25]:
# Display the IR text that was just read (the whole string is echoed as the cell output).
content_file

'%"class.std::ios_base::Init" = type { i8 }\n%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, %struct._IO_codecvt*, %struct._IO_wide_data*, %struct._IO_FILE*, i8*, i64, i32, [20 x i8] }\n%struct._IO_marker = type opaque\n%struct._IO_codecvt = type opaque\n%struct._IO_wide_data = type opaque\n%"class.std::basic_istream" = type { i32 (...)**, i64, %"class.std::basic_ios" }\n%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* }\n%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" }\n%"struct.std::ios_base::_Callback_list" = type { %"struct.

In [28]:
# Decompile the IR read above and write the result to output_file_path; the returned
# reply is also echoed as the cell output because the call is the cell's last expression.
generate_cfg_decompile(content_file, output_file_path)

To convert the provided LLVM IR code to equivalent C code, we need to carefully follow the structure and data flow within the LLVM IR and then translate it into a high-level C representation. Here is the decompiled C code based on the provided LLVM IR:

```c
#include <stdio.h>
#include <stdlib.h>

int main() {
    int n, value;
    int *count_array;
    FILE *file;

    // Open the input file
    file = freopen("../input_files/counting_sort2", "r", stdin);
    if (!file) {
        perror("Failed to open file");
        return 1;
    }

    // Read the number of elements
    if (scanf("%d", &n) != 1) {
        perror("Failed to read the number of elements");
        return 1;
    }

    // Allocate memory for the counting array
    count_array = (int *)calloc(100, sizeof(int));
    if (!count_array) {
        perror("Failed to allocate memory");
        return 1;
    }

    // Read n integers and count their occurrences
    for (int i = 0; i < n; ++i) {
        if (scanf("%d", &value) !

'To convert the provided LLVM IR code to equivalent C code, we need to carefully follow the structure and data flow within the LLVM IR and then translate it into a high-level C representation. Here is the decompiled C code based on the provided LLVM IR:\n\n```c\n#include <stdio.h>\n#include <stdlib.h>\n\nint main() {\n    int n, value;\n    int *count_array;\n    FILE *file;\n\n    // Open the input file\n    file = freopen("../input_files/counting_sort2", "r", stdin);\n    if (!file) {\n        perror("Failed to open file");\n        return 1;\n    }\n\n    // Read the number of elements\n    if (scanf("%d", &n) != 1) {\n        perror("Failed to read the number of elements");\n        return 1;\n    }\n\n    // Allocate memory for the counting array\n    count_array = (int *)calloc(100, sizeof(int));\n    if (!count_array) {\n        perror("Failed to allocate memory");\n        return 1;\n    }\n\n    // Read n integers and count their occurrences\n    for (int i = 0; i < n; ++i) {\

In [13]:
# Function to read .bc files, generate CFG, and save as .dot files
def read_and_process_bc_files(directory, cfg_generator=None):
    """Generate a CFG .dot file next to every .bc file one level below `directory`.

    For each immediate subdirectory of `directory`, every `*.bc` file is read,
    its text is passed to `cfg_generator`, and the returned string is written
    beside the input as `<name>_LLMs_cfg.dot`. Subdirectories and files are
    processed in sorted order.

    Args:
        directory: Folder whose immediate subdirectories hold the .bc files.
        cfg_generator: Callable taking the IR text and returning the .dot
            text. Defaults to `generate_cfg_llms`, which is expected to be
            defined elsewhere in the notebook.
    """
    if cfg_generator is None:
        cfg_generator = generate_cfg_llms  # looked up at call time

    # Get a stable list of all subdirectories
    subdirs = sorted(
        path
        for path in (os.path.join(directory, entry) for entry in os.listdir(directory))
        if os.path.isdir(path)
    )

    for subdir in subdirs:
        for bc_file in sorted(glob.glob(os.path.join(subdir, '*.bc'))):
            # Close the input before the (potentially slow) generator call.
            with open(bc_file, 'r') as file:
                file_content = file.read()

            # Generate the CFG .dot content
            dot_content = cfg_generator(file_content)

            # Swap only the final extension; str.replace would also rewrite any
            # '.bc' occurring earlier in the path (e.g. in a directory name).
            output_dot_file_path = os.path.splitext(bc_file)[0] + '_LLMs_cfg.dot'

            # Save the .dot content to the file
            with open(output_dot_file_path, 'w') as dot_file:
                dot_file.write(dot_content)

            print(f"CFG has been saved to {output_dot_file_path}")

# Call the function to read and process .bc files.
# Relies on `base_directory` having been assigned by an earlier cell.
read_and_process_bc_files(base_directory)

```dot
digraph CFG {
    node [shape = rectangle];

    entry [label = "entry", shape = circle];
    exit [label = "exit", shape = circle];

    entry -> %1
    %1 [label = "%1 = alloca i32, align 4"]
    %1 -> %2
    %2 [label = "%2 = alloca i32, align 4"]
    %2 -> %3
    %3 [label = "%3 = load %struct._IO_FILE*, %struct._IO_FILE** @stdin, align 8, !tbaa !3"]
    %3 -> %4
    %4 [label = "%4 = tail call %struct._IO_FILE* @freopen(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0), %struct._IO_FILE* %3)"]
    %4 -> %5
    %5 [label = "%5 = bitcast i32* %1 to i8*"]
    %5 -> %6
    %6 [label = "call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %5) #9"]
    %6 -> %7
    %7 [label = "%7 = bitcast i32* %2 to i8*"]
    %7 -> %8
    %8 [label = "call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %6) #9"]
    %8 -> %9
    %9 [label = "%7 = call nonnull align 8 dereferenceable(16) %"class.std:

In [17]:
import os
import glob
import openai

# Define the input (.bc files) and output (.dot files) directories.
# NOTE(review): absolute, machine-specific paths; they point into 'LLMs-in-IR', whereas
# the earlier example cells use 'LLMs-in-IR-1' — confirm which checkout is intended.
input_directory = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG'
output_directory = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG_LLMs'

# Ensure the output directory exists (no error if it is already there)
os.makedirs(output_directory, exist_ok=True)

def read_and_process_bc_files(input_dir, output_dir, cfg_generator=None):
    """Generate one CFG .dot file in `output_dir` per .bc file found under `input_dir`.

    `input_dir` is searched recursively. Each file's text is passed to
    `cfg_generator` and the result is written to
    `<output_dir>/<stem>_LLMs_cfg.dot`, where `<stem>` is the input file name
    without its final extension. Files are processed in sorted path order.

    Inputs sharing a file name in different subfolders map to the same output
    file; the one processed later overwrites the earlier one (an
    "Overwriting existing file" line is printed).

    Args:
        input_dir: Root folder searched recursively for `*.bc` files.
        output_dir: Folder receiving the .dot files; created if missing.
        cfg_generator: Callable taking the IR text and returning the .dot
            text. Defaults to `generate_cfg_llms`, which is expected to be
            defined elsewhere in the notebook.
    """
    if cfg_generator is None:
        cfg_generator = generate_cfg_llms  # looked up at call time

    # Do not depend on another cell having created the output folder.
    os.makedirs(output_dir, exist_ok=True)

    # Find all .bc files in the directory and its subdirectories, in a stable order
    bc_files = sorted(glob.glob(os.path.join(input_dir, '**', '*.bc'), recursive=True))

    for bc_file in bc_files:
        # Close the input before the (potentially slow) generator call.
        with open(bc_file, 'r') as file:
            file_content = file.read()

        # Generate the CFG .dot content
        dot_content = cfg_generator(file_content)

        # Swap only the final extension; str.replace would also rewrite a
        # '.bc' occurring earlier in the file name.
        stem = os.path.splitext(os.path.basename(bc_file))[0]
        output_dot_file_path = os.path.join(output_dir, stem + '_LLMs_cfg.dot')

        # Save the .dot content to the file, overwriting if it exists
        if os.path.exists(output_dot_file_path):
            print(f"Overwriting existing file: {output_dot_file_path}")
        else:
            print(f"Creating new file: {output_dot_file_path}")

        with open(output_dot_file_path, 'w') as dot_file:
            dot_file.write(dot_content)

        print(f"CFG has been saved to {output_dot_file_path}")

# Run the batch: one generator call and one .dot file per .bc file under input_directory.
read_and_process_bc_files(input_directory, output_directory)


To generate the Control Flow Graph (CFG) for the provided IR code, we need to analyze the basic blocks and their relationships. Here is the .dot file content representing the CFG:

```dot
digraph CFG {
    node [shape=rectangle];
    entry [label="entry"];
    block_1 [label="%1 = load %struct._IO_FILE*, %struct._IO_FILE** @stdin, align 8"];
    block_2 [label="%2 = tail call %struct._IO_FILE* @freopen(...)"];
    block_3 [label="%3 = tail call zeroext i1 @_ZNSt8ios_base15sync_with_stdioEb(...)"];
    block_4 [label="%4 = tail call nonnull align 8 dereferenceable(16) %"class.std::basic_istream"* @_ZNSi10_M_extractIxEERSiRT_(...)"];
    block_5 [label="%5 = tail call nonnull align 8 dereferenceable(16) %"class.std::basic_istream"* @_ZNSi10_M_extractIxEERSiRT_(...)"];
    block_6 [label="%6 = tail call nonnull align 8 dereferenceable(16) %"class.std::basic_istream"* @_ZNSi10_M_extractIxEERSiRT_(...)"];
    block_7 [label="%7 = load i64, i64* @x, align 8"];
    block_8 [label="%8 = icmp e

In [18]:
def generate_cfg_llms2(file_content, model="gpt-3.5-turbo", client=None):
    """Ask a chat model for the control-flow graph of LLVM IR in .dot format.

    The prompt embeds `file_content` and adds a hint about escaping quotes
    inside .dot labels. The reply is printed and returned.

    Args:
        file_content: Text of the LLVM IR; inserted into the prompt.
        model: Chat model name passed to the API.
        client: Object exposing `chat.completions.create(...)`; defaults to
            the module-level `openai` package.

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is
        returned as-is, so it may contain prose or Markdown fences around the
        .dot text.
    """
    prompt = f"""
        You are an expert in high-performance computation program area. I will give you an IR (Intermediate Representation) .bc code file, which is compiled from source code with LLVM. 
        You will follow the next instructions and output a file for me. 
        First, you have to draw the Control Flow Graph (CFG) of the IR code.
        Then, output the CFG in a .dot file format. 
        The input file is as follows:

    ```
    {file_content}
    ```
    Note: The IR code may contain strings with quotes. For example:
    block_54 [label="%51 = bitcast i8* %50 to \\"class.std::ctype\\"**"];
    label = \\"string\\";
    Be careful when handling quotes within quotes.

    Generate the CFG and provide the .dot file content.

    """
    if client is None:
        client = openai

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        max_tokens=4096,
        temperature=0.7,
    )
    # The message content can be None; treat that as an empty reply.
    analysis = (response.choices[0].message.content or "").strip()
    print(analysis)
    return analysis


In [19]:
import os
import glob
import openai

# Define the input (.bc files) and output (.dot files) directories.
# NOTE(review): absolute, machine-specific paths.
input_directory = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG'
output_directory = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG_LLMs'

# Ensure the output directory exists (no error if it is already there)
os.makedirs(output_directory, exist_ok=True)

def read_and_process_bc_files(input_dir, output_dir, cfg_generator=None):
    """Write a `<stem>_LLMs_cfg.dot` file into `output_dir` for each .bc file under `input_dir`.

    The search below `input_dir` is recursive and the matches are handled in
    sorted path order. For every match the file text is handed to
    `cfg_generator`; whatever string it returns is stored under the input's
    file name with the final extension replaced by `_LLMs_cfg.dot`.

    Two inputs with the same file name in different subfolders share one
    output path, so the later one replaces the earlier one (reported by an
    "Overwriting existing file" line).

    Args:
        input_dir: Root folder searched recursively for `*.bc` files.
        output_dir: Folder receiving the .dot files; created if missing.
        cfg_generator: Callable mapping IR text to .dot text. When omitted,
            `generate_cfg_llms2` is used.
    """
    if cfg_generator is None:
        cfg_generator = generate_cfg_llms2  # looked up at call time

    # Make the function usable without a prior set-up cell.
    os.makedirs(output_dir, exist_ok=True)

    pattern = os.path.join(input_dir, '**', '*.bc')
    for bc_file in sorted(glob.glob(pattern, recursive=True)):
        # Read first and release the handle before calling the generator.
        with open(bc_file, 'r') as file:
            file_content = file.read()

        dot_content = cfg_generator(file_content)

        # splitext touches only the trailing extension, unlike str.replace,
        # which would also rewrite a '.bc' inside the name.
        stem, _ = os.path.splitext(os.path.basename(bc_file))
        output_dot_file_path = os.path.join(output_dir, stem + '_LLMs_cfg.dot')

        if os.path.exists(output_dot_file_path):
            print(f"Overwriting existing file: {output_dot_file_path}")
        else:
            print(f"Creating new file: {output_dot_file_path}")

        with open(output_dot_file_path, 'w') as dot_file:
            dot_file.write(dot_content)

        print(f"CFG has been saved to {output_dot_file_path}")

# Run the batch: one generator call and one .dot file per .bc file under input_directory.
read_and_process_bc_files(input_directory, output_directory)


To generate the Control Flow Graph (CFG) from the provided LLVM IR code, we need to analyze the code and identify the basic blocks and the control flow between them. Here is the CFG in .dot file format:

```dot
digraph CFG {
    // Nodes
    entry [shape=ellipse,label="entry"]
    block_0 [shape=rectangle,label="block_0\n%1 = load %struct._IO_FILE*, %struct._IO_FILE** @stdin"]
    block_18 [shape=rectangle,label="block_18\n%19 = load i64, i64* getelementptr inbounds ([510031 x [3 x i64]], [510031 x [3 x i64]]* @dp, i64 0, i64 1, i64 0)"]
    block_21 [shape=rectangle,label="block_21\n%22 = phi i64 [ %20, %18 ], [ %33, %21 ]\n%23 = phi i64 [ %19, %18 ], [ %36, %21 ]\n%24 = phi i64 [ 2, %18 ], [ %37, %21 ]"]
    block_39 [shape=rectangle,label="block_39\n%40 = getelementptr inbounds [510031 x [3 x i64]], [510031 x [3 x i64]]* @dp, i64 0, i64 %11, i64 1"]
    block_54 [shape=rectangle,label="%51 = bitcast i8* %50 to \"class.std::ctype\"**"]
    block_55 [shape=rectangle,label="%56 = getel

In [26]:
import openai

def read_file_content(file_path):
    """Return the text stored at `file_path`.

    A missing file is not an error for callers: the literal string
    "File not found." is returned in that case instead of raising.
    """
    try:
        handle = open(file_path, 'r')
    except FileNotFoundError:
        return "File not found."
    with handle:
        return handle.read()


In [27]:
def generate_cfg_llms3(
    file_content,
    example_bc_path='/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG_LLMs/test.bc',
    example_dot_path='/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG_LLMs/test.dot',
    model="gpt-3.5-turbo",
    client=None,
):
    """Ask a chat model for the CFG of LLVM IR in .dot format, with one worked example.

    Besides `file_content`, the prompt carries an example IR file and its
    expected .dot output, both loaded through `read_file_content`. The reply
    is printed and returned.

    Args:
        file_content: Text of the LLVM IR to analyse; inserted into the prompt.
        example_bc_path: Path of the example IR file shown to the model.
        example_dot_path: Path of the example .dot file shown to the model.
        model: Chat model name passed to the API.
        client: Object exposing `chat.completions.create(...)`; defaults to
            the module-level `openai` package.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # Read the contents of the example BC and DOT files.
    # NOTE(review): read_file_content yields the text "File not found." for a missing
    # path, which would then be embedded in the prompt as the example — verify the
    # example files exist before a batch run.
    bc_content = read_file_content(example_bc_path)
    dot_content = read_file_content(example_dot_path)

    prompt = f"""
        You are an expert in high-performance computation program area. I will give you an IR (Intermediate Representation) .bc code file, which is compiled from source code with LLVM. 
        You will follow the next instructions and output a file for me. 
        First, you have to draw the Control Flow Graph (CFG) of the IR code.
        Then, output the CFG in a .dot file format. 
        The input file is as follows:

    ```
    {file_content}
    ```
    Note: The IR code may contain strings with quotes. For example:
    block_54 [label="%51 = bitcast i8* %50 to \\"class.std::ctype\\"**"];
    label = \\"string\\";
    Be careful when handling quotes within quotes.
    Here is an example of an IR code from 'test.bc' and its corresponding CFG in 'test.dot':

    Input IR code (test.bc):
    {bc_content}
    
    Expected output CFG (test.dot):
    {dot_content}

    Please process the input file and generate the CFG in the same format as the example above.
    Generate the CFG and provide the .dot file content.

    """
    if client is None:
        client = openai

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        max_tokens=4096,
        temperature=0.7,
    )
    # The message content can be None; treat that as an empty reply.
    analysis = (response.choices[0].message.content or "").strip()
    print(analysis)
    return analysis


In [28]:
import os
import glob
import openai

# Define the input (.bc files) and output (.dot files) directories.
# NOTE(review): absolute, machine-specific paths.
input_directory = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG'
output_directory = '/Users/jianfengzhu/Desktop/LLMs/LLMs_Evaluation_IR/LLMs-in-IR/processed_data/CFG_LLMs'

# Ensure the output directory exists (no error if it is already there)
os.makedirs(output_directory, exist_ok=True)

def read_and_process_bc_files(input_dir, output_dir, cfg_generator=None):
    """Produce a CFG .dot file in `output_dir` for every .bc file below `input_dir`.

    Every `*.bc` file found recursively under `input_dir` is read, its text is
    given to `cfg_generator`, and the returned string is saved as
    `<output_dir>/<stem>_LLMs_cfg.dot` (`<stem>` being the input file name
    minus its final extension). Processing follows sorted path order.

    Output names are derived from the file name alone, so same-named inputs
    from different subfolders end up in one output file, the last processed
    one winning (an "Overwriting existing file" line is printed).

    Args:
        input_dir: Root folder searched recursively for `*.bc` files.
        output_dir: Folder receiving the .dot files; created if missing.
        cfg_generator: Callable mapping IR text to .dot text. When omitted,
            `generate_cfg_llms3` is used.
    """
    generator = generate_cfg_llms3 if cfg_generator is None else cfg_generator

    # Create the destination here so the function does not rely on a set-up cell.
    os.makedirs(output_dir, exist_ok=True)

    # Recursive search; sorted so repeated runs handle files in the same order.
    bc_files = glob.glob(os.path.join(input_dir, '**', '*.bc'), recursive=True)
    bc_files.sort()

    for bc_file in bc_files:
        # The handle is released before the generator runs.
        with open(bc_file, 'r') as file:
            file_content = file.read()

        dot_content = generator(file_content)

        # Only the trailing extension is replaced; str.replace would also
        # rewrite a '.bc' that appears earlier in the file name.
        file_name = os.path.basename(bc_file)
        base_filename = os.path.splitext(file_name)[0] + '_LLMs_cfg.dot'
        output_dot_file_path = os.path.join(output_dir, base_filename)

        if os.path.exists(output_dot_file_path):
            print(f"Overwriting existing file: {output_dot_file_path}")
        else:
            print(f"Creating new file: {output_dot_file_path}")

        with open(output_dot_file_path, 'w') as dot_file:
            dot_file.write(dot_content)

        print(f"CFG has been saved to {output_dot_file_path}")

# Run the batch: one generator call and one .dot file per .bc file under input_directory.
read_and_process_bc_files(input_directory, output_directory)


```dot
digraph "CFG for 'main' function" {
	label="CFG for 'main' function";

	Node0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{%1:\l  %1 = load %struct._IO_FILE*, %struct._IO_FILE** @stdin, align 8, !tbaa !3\l  %2 = tail call %struct._IO_FILE* @freopen(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0), %struct._IO_FILE* %1)\l  %3 = tail call zeroext i1 @_ZNSt8ios_base15sync_with_stdioEb(i1 zeroext false)\l  %4 = tail call nonnull align 8 dereferenceable(16) %"class.std::basic_istream"* @_ZNSi10_M_extractIxEERSiRT_(%"class.std::basic_istream"* nonnull align 8 dereferenceable(16) @_ZSt3cin, i64* nonnull align 8 dereferenceable(8) @n)\l  %5 = tail call nonnull align 8 dereferenceable(16) %"class.std::basic_istream"* @_ZNSi10_M_extractIxEERSiRT_(%"class.std::basic_istream"* nonnull align 8 dereferenceable(16) %4, i64* nonnull align 8 dereferenceable(8) @k)\l  %6 =