In [2]:
# python ./preprocess.py --csv-dir ./input/Execution-Paths --output-dir ./tmpoutput/results --prompt ../prompts/prompt1-10.txt --model llama3.2 --num-ctx 25000

import os
import argparse
import json
import re
import csv
import pandas as pd
# import ollama
from tqdm import tqdm
import threading
from openai import OpenAI
import time

In [7]:

def process_csv_files(csv_directory):
    """Process CSV files and structure data into DataFrame."""
    data_dict = {}
    
    for filename in os.listdir(csv_directory):
        if filename.endswith('.csv'):
            service_name = os.path.splitext(filename)[0]
            filepath = os.path.join(csv_directory, filename)
            
            with open(filepath, 'r', newline='', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    try:
                        class_name = row['Class'].strip('"')
                        method = row['Method'].strip('"')
                        depth = row['Depth'].strip('"')
                        trace = row['Trace Instruction(s) ...'].strip('"')
                        java_code = row['Code Merged'].strip('"')
                        access_control_level = row['Access Control Level'].strip('"')
                        key = (service_name, class_name, method, access_control_level)
                        
                        depth_entry = {
                            'depth': int(depth),
                            'trace': trace,
                            'java_code': java_code
                        }
                        
                        if key not in data_dict:
                            data_dict[key] = []
                        data_dict[key].append(depth_entry)
                        
                    except KeyError as e:
                        print(f"Missing column in {filename}: {e}")
                        continue

    rows = []
    for key in data_dict:
        service, cls, method, acl = key
        depths = sorted(data_dict[key], key=lambda x: x['depth'])
        rows.append({
            'service_name': service,
            'class': cls,
            'method': method,
            'depths': depths,
            'access control level': acl
        })
    
    return pd.DataFrame(rows)

def get_java_code(row):
    """Extract Java code from max depth entries."""
    max_depth = max([x['depth'] for x in row["depths"]])
    return '\n'.join([f"This is path {i+1} for the API with depth {max_depth}:\n{code['java_code']}"
                     for i, code in enumerate([d for d in row["depths"] if d['depth'] == max_depth])])

def get_three_java_codes(row):
    """Extract 3 Java code snippets: one from max depth, one from max-1, one from max-2.
    
    If there are not enough depths, take more from the highest available.
    """
    from collections import defaultdict

    # Group by depth
    depth_dict = defaultdict(list)
    for entry in row["depths"]:
        depth_dict[entry['depth']].append(entry)

    # Sort available depths in descending order
    sorted_depths = sorted(depth_dict.keys(), reverse=True)
    
    selected_entries = []
    
    # Try to get 1 code from max, max-1, and max-2 depths
    for i in range(3):
        if i < len(sorted_depths):  # Check if that depth exists
            depth = sorted_depths[i]
            selected_entries.append(depth_dict[depth].pop(0))  # Get one entry from this depth
    
    # If we still need more codes, fill from highest available
    all_remaining_entries = sum(depth_dict.values(), [])  # Flatten remaining entries
    while len(selected_entries) < 3 and all_remaining_entries:
        selected_entries.append(all_remaining_entries.pop(0))

    # Format output with depth information
    return '\n\n'.join([
        f"This is path {i+1} for the API with depth {entry['depth']}:\n{entry['java_code']}"
        for i, entry in enumerate(selected_entries)
    ])

def try_extract_and_parse(pattern, input_string, remove_comments_first=False):
    """Extract using the given regex pattern and parse JSON."""
    json_blocks = re.findall(pattern, input_string, re.DOTALL)
    for block in reversed(json_blocks):
        block = block.strip()
        # First attempt without removing comments
        try:
            return json.loads(block)
        except json.JSONDecodeError:
            pass

        # If that fails and remove_comments_first is True, try after removing comments
        if remove_comments_first:
            cleaned_block = remove_comments(block)
            try:
                return json.loads(cleaned_block)
            except json.JSONDecodeError:
                continue
    return None

def try_extract_boxed_json(input_string, remove_comments_first=False):
    """Try to extract JSON from LaTeX-style boxed expressions of the form: $\boxed{ ... }$."""
    boxed_blocks = re.findall(r'\$\s*\\boxed\s*\{(.*?)\}\s*\$', input_string, re.DOTALL)
    for block in boxed_blocks:
        block = block.strip()

        # First attempt without removing comments
        try:
            return json.loads(block)
        except json.JSONDecodeError:
            pass

        # If that fails and remove_comments_first is True, try after removing comments
        if remove_comments_first:
            cleaned_block = remove_comments(block)
            try:
                return json.loads(cleaned_block)
            except json.JSONDecodeError:
                continue
    return None

def try_extract_curly_braces(input_string, remove_comments_first=False):
    """Final fallback: Look for the first substring that starts with '{' and ends with '}'."""
    match = re.search(r'(\{.*\})', input_string, re.DOTALL)
    if match:
        block = match.group(1).strip()

        # First attempt without removing comments
        try:
            return json.loads(block)
        except json.JSONDecodeError:
            pass

        # If that fails and remove_comments_first is True, try after removing comments
        if remove_comments_first:
            cleaned_block = remove_comments(block)
            try:
                return json.loads(cleaned_block)
            except json.JSONDecodeError:
                return None
    return None

def remove_comments(json_string):
    """Remove single-line comments (//) from the JSON string."""
    return re.sub(r'//.*', '', json_string)

def extract_json_from_string(input_string):
    """
    Extract JSON from a response string by trying, in order:
      1. Code blocks explicitly tagged as JSON (```json).
      2. Any code blocks delimited by triple backticks (```).
      3. The entire string (if valid JSON).
      4. LaTeX-style boxed JSON (e.g. $\boxed{ ... }$).
      5. The first substring that starts with '{' and ends with '}'.
    Tries first without removing comments, then retries with removing comments.
    """

    # 1. Try blocks tagged explicitly as JSON
    pattern_json = r"```json\s*\n(.*?)```"
    result = try_extract_and_parse(pattern_json, input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    result = try_extract_and_parse(pattern_json, input_string, remove_comments_first=True)
    if result is not None:
        return result

    # 2. Fallback: try any block delimited by triple backticks
    pattern_any = r"```\s*\n(.*?)```"
    result = try_extract_and_parse(pattern_any, input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    result = try_extract_and_parse(pattern_any, input_string, remove_comments_first=True)
    if result is not None:
        return result

    # 3. Try parsing the entire input string as JSON
    try:
        return json.loads(input_string.strip())
    except json.JSONDecodeError:
        pass

    # Retry with comment removal
    try:
        return json.loads(remove_comments(input_string).strip())
    except json.JSONDecodeError:
        pass

    # 4. Look for LaTeX-style boxed JSON (e.g. $\boxed{ ... }$)
    result = try_extract_boxed_json(input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    result = try_extract_boxed_json(input_string, remove_comments_first=True)
    if result is not None:
        return result

    # 5. Final fallback: search for a substring that starts with '{' and ends with '}'
    result = try_extract_curly_braces(input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    return try_extract_curly_braces(input_string, remove_comments_first=True)




In [None]:
import argparse
import os
import json
import threading
from tqdm import tqdm
from openai import OpenAI

BASE_URL    = "http://localhost:8000/v1"
API_KEY     = "token123"
MODEL_NAME  = "Qwen/Qwen2.5-Coder-32B-Instruct"
MAX_TOKENS  = 2500
TEMPERATURE = 0.3
TIMEOUT     = 3 * 24 * 3600  # 3 days

# Initialize client (will be re-assigned in main if you choose)
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

def run_vllm_prompt(user_prompt, system_prompt, idx, results):
    """
    Sends one chat completion to vLLM using the OpenAI client.
    Stores a dict with system_message, user_message, and response in results[idx].
    """
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_prompt}
        ],
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        stream=False,
        timeout=TIMEOUT
    )
    results[idx] = {
        "system_message": system_prompt,
        "user_message":   user_prompt,
        "response":       resp.choices[0].message.content
    }

def process_dataframe(df, output_folder, system_prompt, num_ctx, batch_size):
    # Prepare columns
    df['prompt1']      = df.apply(lambda row: get_three_java_codes(row), axis=1)
    df['res1']         = None
    df['json_answer']  = None

    # Process in batches
    for start in tqdm(range(0, len(df), batch_size), desc="Batches"):
        end = min(start + batch_size, len(df))
        batch_idxs   = list(range(start, end))
        batch_prompts = [df.at[i, 'prompt1'] for i in batch_idxs]

        # launch threads
        results = [None] * len(batch_prompts)
        threads = []
        for i, prompt in enumerate(batch_prompts):
            t = threading.Thread(
                target=run_vllm_prompt,
                args=(prompt, system_prompt, i, results)
            )
            t.start()
            threads.append(t)

        # wait
        for t in threads:
            t.join()

        # collect & write out
        for i, row_idx in enumerate(batch_idxs):
            res = results[i]
            df.at[row_idx, 'res1']        = res['response']
            parsed = extract_json_from_string(res['response'])
            df.at[row_idx, 'json_answer'] = json.dumps(parsed) if isinstance(parsed, (dict, list)) else str(parsed)

            # save files
            method_name  = df.at[row_idx, 'method'].split("(")[0]
            service_name = df.at[row_idx, 'service_name']
            folder = os.path.join(output_folder, service_name, method_name)
            os.makedirs(folder, exist_ok=True)
            for fname, content in [
                ('system_message.txt', res['system_message']),
                ('user_message.txt',   res['user_message']),
                ('response.txt',       res['response'])
            ]:
                with open(os.path.join(folder, fname), 'w') as f:
                    f.write(content)

def main():
    parser = argparse.ArgumentParser(description='Batch vLLM analysis of Java methods')
    # Define variables for testing in Jupyter Notebook
    args = {
        "csv_dir": "/u1/hfaheem/DLAndroidArtifact/my-paths-code/input/Execution-Paths", 
        "output_dir": "./output_results_vllm_Qwen2.5-Coder",  
        "prompt": "/u1/hfaheem/DLAndroidArtifact/prompts/prompt1-11.txt",  
        "num_ctx": 25000,  # Context window size (ignored by vLLM)
        "batch_size": 150  # Number of prompts per VLLM batch
    }

    # load system prompt
    with open(args["prompt"], 'r') as f:
        system_prompt = f.read().strip()

    # re-init client in case you want to override via env or args
    global client
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    # load your CSVs and flatten into one df
    df = process_csv_files(args["csv_dir"])
    print(f"Loaded {len(df)} rows to analyze.")
    
    df = df[:10]
    # run batched inference
    process_dataframe(
        df,
        output_folder=args["output_dir"],
        system_prompt=system_prompt,
        num_ctx=args["num_ctx"],
        batch_size=args["batch_size"]
    )

    # save final dataframe
    out_file = os.path.join(args["output_dir"], "android_services_methods.parquet")
    df.to_parquet(out_file)
    print(f"Saved results to {out_file}")




if __name__ == "__main__":
    main()


Loaded 1523 rows to analyze.


Batches: 100%|██████████| 1/1 [27:07<00:00, 1627.66s/it]


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [13]:
!pip install pyarrow


Collecting pyarrow
  Downloading pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl (42.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-19.0.1
