In [4]:
import json
import re


In [136]:
import re
import json

def try_extract_and_parse(pattern, input_string, remove_comments_first=False):
    """Extract using the given regex pattern and parse JSON."""
    json_blocks = re.findall(pattern, input_string, re.DOTALL)
    for block in reversed(json_blocks):
        block = block.strip()
        # First attempt without removing comments
        try:
            return json.loads(block)
        except json.JSONDecodeError:
            pass

        # If that fails and remove_comments_first is True, try after removing comments
        if remove_comments_first:
            cleaned_block = remove_comments(block)
            try:
                return json.loads(cleaned_block)
            except json.JSONDecodeError:
                continue
    return None

def try_extract_boxed_json(input_string, remove_comments_first=False):
    """Try to extract JSON from LaTeX-style boxed expressions of the form: $\boxed{ ... }$."""
    boxed_blocks = re.findall(r'\$\s*\\boxed\s*\{(.*?)\}\s*\$', input_string, re.DOTALL)
    for block in boxed_blocks:
        block = block.strip()

        # First attempt without removing comments
        try:
            return json.loads(block)
        except json.JSONDecodeError:
            pass

        # If that fails and remove_comments_first is True, try after removing comments
        if remove_comments_first:
            cleaned_block = remove_comments(block)
            try:
                return json.loads(cleaned_block)
            except json.JSONDecodeError:
                continue
    return None

def try_extract_curly_braces(input_string, remove_comments_first=False):
    """Final fallback: Look for the first substring that starts with '{' and ends with '}'."""
    match = re.search(r'(\{.*\})', input_string, re.DOTALL)
    if match:
        block = match.group(1).strip()

        # First attempt without removing comments
        try:
            return json.loads(block)
        except json.JSONDecodeError:
            pass

        # If that fails and remove_comments_first is True, try after removing comments
        if remove_comments_first:
            cleaned_block = remove_comments(block)
            try:
                return json.loads(cleaned_block)
            except json.JSONDecodeError:
                return None
    return None

def remove_comments(json_string):
    """Remove single-line comments (//) from the JSON string."""
    return re.sub(r'//.*', '', json_string)


def extract_json_from_string(input_string):
    """
    Extract JSON from a response string by trying, in order:
      1. Code blocks explicitly tagged as JSON (```json).
      2. Any code blocks delimited by triple backticks (```).
      3. The entire string (if valid JSON).
      4. LaTeX-style boxed JSON (e.g. $\boxed{ ... }$).
      5. The first substring that starts with '{' and ends with '}'.
    Tries first without removing comments, then retries with removing comments.
    """

    # 1. Try blocks tagged explicitly as JSON
    pattern_json = r"```json\s*\n(.*?)```"
    result = try_extract_and_parse(pattern_json, input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    result = try_extract_and_parse(pattern_json, input_string, remove_comments_first=True)
    if result is not None:
        return result

    # 2. Fallback: try any block delimited by triple backticks
    pattern_any = r"```\s*\n(.*?)```"
    result = try_extract_and_parse(pattern_any, input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    result = try_extract_and_parse(pattern_any, input_string, remove_comments_first=True)
    if result is not None:
        return result

    # 3. Try parsing the entire input string as JSON
    try:
        return json.loads(input_string.strip())
    except json.JSONDecodeError:
        pass

    # Retry with comment removal
    try:
        return json.loads(remove_comments(input_string).strip())
    except json.JSONDecodeError:
        pass

    # 4. Look for LaTeX-style boxed JSON (e.g. $\boxed{ ... }$)
    result = try_extract_boxed_json(input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    result = try_extract_boxed_json(input_string, remove_comments_first=True)
    if result is not None:
        return result

    # 5. Final fallback: search for a substring that starts with '{' and ends with '}'
    result = try_extract_curly_braces(input_string, remove_comments_first=False)
    if result is not None:
        return result

    # Retry with comment removal
    return try_extract_curly_braces(input_string, remove_comments_first=True)


In [None]:
teststr = """
To analyze the provided code paths for potential sinks, we need to apply the given rules and consider how data flows through each path.

### Path 1 Analysis
In Path 1, several operations are performed:
- The `mGuestRestrictions` field is cleared and updated with the contents of `p1`.
- An `AtomicFile` object is created based on `this.mUserListFile`, and a file output stream is started.
- XML serialization is initiated to write data into the file, including tags for "users", "guestRestrictions", and potentially "restrictions" if `mGuestRestrictions` is not empty.

The key instructions contributing to potential sinks in Path 1 include:
- `this.mGuestRestrictions.clear();`
- `this.mGuestRestrictions.putAll(p1);`
- The creation of the `AtomicFile` object and the subsequent XML serialization process, which involves writing data from `mGuestRestrictions`, `mNextSerialNumber`, `mUserVersion`, `mUserTypeVersion`, and `mDeviceOwnerUserId`.

### Path 2 Analysis
In Path 2:
- Similar to Path 1, `mGuestRestrictions` is updated with `p1`.
- An `AtomicFile` object is created for writing, and XML serialization is used to write data.
- The main difference here is the use of `UserRestrictionsUtils.writeRestrictions(v8, this.mGuestRestrictions, "restrictions");` instead of manually iterating over restrictions.

The key instructions in Path 2 include:
- `this.mGuestRestrictions.clear();`
- `this.mGuestRestrictions.putAll(p1);`
- The XML serialization process, including writing data from `mGuestRestrictions`, `mNextSerialNumber`, `mUserVersion`, `mUserTypeVersion`, and `mDeviceOwnerUserId`.

### Path 3 Analysis
In Path 3:
- `mGuestRestrictions` is cleared and updated with `p1`.
- The method `p0.writeUserListLP();` is called, which presumably writes the user list data.

The key instructions in Path 3 are straightforward:
- `this.mGuestRestrictions.clear();`
- `this.mGuestRestrictions.putAll(p1);`
- `p0.writeUserListLP();`

### Identifying Sinks
Based on the rules provided:
- **Method Invocations**: In all paths, the methods invoked (e.g., `clear()`, `putAll()`, XML serialization methods) are directly related to processing the input parameter `p1` or accessing/modifying class fields.
- **Field Access and Updates**: The field `mGuestRestrictions` is accessed and updated in all paths, with a naming similarity to the API name `setDefaultGuestRestrictions`.
- **Object Creation**: In Paths 1 and 2, objects are created (e.g., `AtomicFile`, `TypedXmlSerializer`) using data that flows from the API parameters or class fields.

### JSON Representation of Sinks
Given the analysis, here is a representation of the sinks in JSON format. Note that each path's relevant instructions are grouped together as they represent coherent sets of operations related to the API's functionality:

```json
{
  "Sinks": [
    [
      "this.mGuestRestrictions.clear();",
      "this.mGuestRestrictions.putAll(p1);",
      "AtomicFile v5 = new AtomicFile(this.mUserListFile);",
      "FileOutputStream v7 = v5.startWrite();",
      "TypedXmlSerializer v8 = Xml.resolveSerializer(v7);",
      "v8.startDocument(0, Boolean.TRUE);",
      "v8.setFeature(\"http://xmlpull.org/v1/doc/features.html#indent-output\", 1);",
      "...", // Other XML serialization instructions in Path 1
      "v5.finishWrite(v7);"
    ],
    [
      "this.mGuestRestrictions.clear();",
      "this.mGuestRestrictions.putAll(p1);",
      "AtomicFile v5 = new AtomicFile(this.mUserListFile);",
      "FileOutputStream v7 = v5.startWrite();",
      "TypedXmlSerializer v8 = Xml.resolveSerializer(v7);",
      "v8.startDocument(0, Boolean.TRUE);",
      "v8.setFeature(\"http://xmlpull.org/v1/doc/features.html#indent-output\", 1);",
      "...", // Other XML serialization instructions in Path 2
      "v5.finishWrite(v7);"
    ],
    [
      "this.mGuestRestrictions.clear();",
      "this.mGuestRestrictions.putAll(p1);",
      "p0.writeUserListLP();"
    ]
  ]
}
```

### Ranking Sinks by Sensitivity
The ranking from most sensitive to least sensitive can be subjective but generally follows the amount of data exposure or modification capability. Paths 1 and 2 involve detailed XML serialization, potentially exposing more internal state, making them more sensitive than Path 3, which involves a simpler write operation.

Thus, the order in the JSON representation reflects this ranking, with the most detailed and potentially sensitive operations first (Paths 1 and 2), followed by the less detailed operation in Path 3.
"""



teststr2 = """
```json
{
  "Sinks": [
      [
      "this.mGuestRestrictions.clear();",
      "this.mGuestRestrictions.putAll(p1);",
      "AtomicFile v5 = new AtomicFile(this.mUserListFile);",
      "FileOutputStream v7 = v5.startWrite();",
      "TypedXmlSerializer v8 = Xml.resolveSerializer(v7);",
      "v8.startDocument(0, Boolean.TRUE);",
      "v8.setFeature(\"http://xmlpull.org/v1/doc/features.html#indent-output\", 1);",
      "...",
      "v5.finishWrite(v7);"
    ],
    [
      "this.mGuestRestrictions.clear();",
      "this.mGuestRestrictions.putAll(p1);",
      "p0.writeUserListLP();"
    ]
  ]
}
```
"""

In [126]:
print(extract_json_from_string(teststr2))

{'Sinks': [['this.mGuestRestrictions.clear();', 'this.mGuestRestrictions.putAll(p1);', 'AtomicFile v5 = new AtomicFile(this.mUserListFile);', 'FileOutputStream v7 = v5.startWrite();', 'TypedXmlSerializer v8 = Xml.resolveSerializer(v7);', 'v8.startDocument(0, Boolean.TRUE);', '...', 'v5.finishWrite(v7);'], ['this.mGuestRestrictions.clear();', 'this.mGuestRestrictions.putAll(p1);', 'p0.writeUserListLP();']]}


In [137]:
import pandas as pd
df = pd.read_parquet("/u1/hfaheem/DLAndroidArtifact/my-paths-code/output8/android_services_methods.parquet")


df.columns

counter = 0
# traverse the rows of the dataframe
for index, row in df.iterrows():
    # print(row['res1'])
    # print(row['method_body'])
    # print(extract_json_from_string(row['method_body']))
    if extract_json_from_string(row['res1']) is None:
        print(row['res1'])
        counter += 1

print(counter)

To analyze the provided code paths for potential sinks, we need to apply the given rules and consider how data flows through each path.

### Path 1 Analysis
In Path 1, several operations are performed:
- The `mGuestRestrictions` field is cleared and updated with the contents of `p1`.
- An `AtomicFile` object is created based on `this.mUserListFile`, and a file output stream is started.
- XML serialization is initiated to write data into the file, including tags for "users", "guestRestrictions", and potentially "restrictions" if `mGuestRestrictions` is not empty.

The key instructions contributing to potential sinks in Path 1 include:
- `this.mGuestRestrictions.clear();`
- `this.mGuestRestrictions.putAll(p1);`
- The creation of the `AtomicFile` object and the subsequent XML serialization process, which involves writing data from `mGuestRestrictions`, `mNextSerialNumber`, `mUserVersion`, `mUserTypeVersion`, and `mDeviceOwnerUserId`.

### Path 2 Analysis
In Path 2:
- Similar to Path 1, `m