In [1]:
# Show the GPUs visible to this kernel (driver/CUDA version, memory, utilization)
# before any model is moved to "cuda" in the cells below.
!nvidia-smi


Fri Nov 15 20:14:29 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A40                     Off |   00000000:61:00.0 Off |                    0 |
|  0%   24C    P8             21W /  300W |       0MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from functools import lru_cache

import torch
import torch.nn.functional as F
from torch.nn.functional import cosine_similarity
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoModel, AutoTokenizer

# Smoke test: load the embedding checkpoint once and embed a single snippet.
checkpoint = "Salesforce/codet5p-110m-embedding"
device = "cuda"  # for GPU usage or "cpu" for CPU usage

# trust_remote_code=True executes the model code shipped with the checkpoint repo.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

inputs = tokenizer.encode("def print_hello_world():\tprint('Hello World!')", return_tensors="pt").to(device)

# Inference only: disable autograd so no graph is recorded for the forward pass
# (matches what get_code_embedding does).
with torch.no_grad():
    embedding = model(inputs)[0]

print(embedding.size())
print(f'Dimension of the embedding: {embedding.size()[0]}, with norm={embedding.norm().item()}')
# Expected output: Dimension of the embedding: 256, with norm=1.0
print(embedding)

torch.Size([256])
Dimension of the embedding: 256, with norm=1.0
tensor([ 0.0185,  0.0229, -0.0315, -0.0307, -0.1421, -0.0575, -0.0275,  0.0501,
         0.0203,  0.0337, -0.0067, -0.0075, -0.0222, -0.0107, -0.0250, -0.0657,
         0.1571, -0.0994, -0.0370,  0.0164, -0.0948,  0.0490, -0.0352,  0.0907,
        -0.0198,  0.0130, -0.0921,  0.0209,  0.0651,  0.0319,  0.0299, -0.0173,
        -0.0693, -0.0798, -0.0066, -0.0417,  0.1076,  0.0597, -0.0316,  0.0940,
        -0.0313,  0.0993,  0.0931, -0.0427,  0.0256,  0.0297, -0.0561, -0.0155,
        -0.0496, -0.0697, -0.1011,  0.1178,  0.0283, -0.0571, -0.0635, -0.0222,
         0.0710, -0.0617,  0.0423, -0.0057,  0.0620, -0.0262,  0.0441,  0.0425,
        -0.0413, -0.0245,  0.0043,  0.0185,  0.0060, -0.1727, -0.1152,  0.0655,
        -0.0235, -0.1465, -0.1359,  0.0022,  0.0177, -0.0176, -0.0361, -0.0750,
        -0.0464, -0.0846, -0.0088,  0.0136, -0.0221,  0.0591,  0.0876, -0.0903,
         0.0271, -0.1165, -0.0169, -0.0566,  0.1173, -0

In [4]:
@lru_cache(maxsize=None)
def _load_embedding_model(checkpoint, device):
    """
    Load the tokenizer and model for `checkpoint` and place the model on `device`.

    Results are memoized per (checkpoint, device) pair, so repeated calls reuse
    the already-loaded objects instead of re-reading the checkpoint each time.
    Call `_load_embedding_model.cache_clear()` to drop the cached models.

    Returns:
    - tuple: (tokenizer, model) as returned by the `from_pretrained` calls.
    """
    # trust_remote_code=True executes the model code shipped with the checkpoint repo.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)
    return tokenizer, model


def get_code_embedding(code_snippet, checkpoint="Salesforce/codet5p-110m-embedding", device="cuda"):
    """
    Generates embeddings for a given code snippet using a pre-trained model.

    The tokenizer and model are loaded once per (checkpoint, device) pair and
    reused on later calls; only tokenization and the forward pass run each time.

    Parameters:
    - code_snippet (str): The code for which embeddings are to be generated.
    - checkpoint (str): The model checkpoint to be used for embedding. Default is Salesforce/codet5p-110m-embedding.
    - device (str): Device to run the model on, either 'cuda' for GPU or 'cpu' for CPU. Default is 'cuda'.

    Returns:
    - torch.Tensor: Embedding tensor for the input code (first element of the
      model output; computed with autograd disabled).
    """
    tokenizer, model = _load_embedding_model(checkpoint, device)

    inputs = tokenizer.encode(code_snippet, return_tensors="pt").to(device)
    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        embedding = model(inputs)[0]

    return embedding


In [5]:
# Four small Python functions that all return a sum but differ in naming,
# padding statements, and (for snippet 4) in which variables are returned.
code_snippet1 = """
def add(x, y):
    return x + y
"""

code_snippet2 = """
def sum_values(a, b):
    return a + b
"""

code_snippet3 = """
def sum_values(x, y):
    a = 42
    b = 17
    c = a - b + 8
    d = c * 3 - 10
    return x + y
"""

code_snippet4 = """

def sum_values(x, y):
    a = 42
    b = 17
    c = a - b + 8
    d = c * 3 - 10
    return a + b
"""

# Embed each snippet in order, reporting the shape right after each embedding.
snippet_embeddings = []
for snippet in (code_snippet1, code_snippet2, code_snippet3, code_snippet4):
    vector = get_code_embedding(snippet, device="cuda")
    print(f"Embedding shape: {vector.size()}")
    snippet_embeddings.append(vector)

# Keep the individually named embeddings that later cells refer to.
embedding1, embedding2, embedding3, embedding4 = snippet_embeddings

Embedding shape: torch.Size([256])
Embedding shape: torch.Size([256])
Embedding shape: torch.Size([256])
Embedding shape: torch.Size([256])


In [6]:
# Cosine similarity between the embeddings of snippet 2 and snippet 3
# (both embeddings are 1-D, hence dim=0).
cosine_sim = F.cosine_similarity(embedding2, embedding3, dim=0)
similarity_value = cosine_sim.item()
print(f'Cosine similarity: {similarity_value}')

Cosine similarity: 0.8219677209854126


Jadx examples 

In [6]:
# Java methods (as decompiled text) used as comparison targets j1..j5.
j1 = """
public void uncacheShortcuts(String str, String str2, List<String> list, UserHandle userHandle, int i) {
            ensureStrictAccessShortcutsPermission(str);
            if (canAccessProfile(userHandle.getIdentifier(), "Cannot uncache shortcuts")) {
                this.mShortcutServiceInternal.uncacheShortcuts(getCallingUserId(), str, str2, list, userHandle.getIdentifier(), toShortcutsCacheFlags(i));
            }
        }
"""

j2 = """
@Override // android.content.pm.ILauncherApps
        public void cacheShortcuts(String str, String str2, List<String> list, UserHandle userHandle, int i) {
            ensureStrictAccessShortcutsPermission(str);
            if (canAccessProfile(userHandle.getIdentifier(), "Cannot cache shortcuts")) {
                this.mShortcutServiceInternal.cacheShortcuts(getCallingUserId(), str, str2, list, userHandle.getIdentifier(), toShortcutsCacheFlags(i));
            }
        }
"""


j3 = """
@Override // android.content.pm.ILauncherApps
        public int getShortcutIconResId(String str, String str2, String str3, int i) {
            ensureShortcutPermission(str);
            if (canAccessProfile(i, "Cannot access shortcuts")) {
                return this.mShortcutServiceInternal.getShortcutIconResId(getCallingUserId(), str, str2, str3, i);
            }
            return 0;
        }
"""


j4 = """
@Override // android.content.pm.ILauncherApps
        public void pinShortcuts(String str, String str2, List<String> list, UserHandle userHandle) {
            ensureShortcutPermission(str);
            if (canAccessProfile(userHandle.getIdentifier(), "Cannot pin shortcuts")) {
                this.mShortcutServiceInternal.pinShortcuts(getCallingUserId(), str, str2, list, userHandle.getIdentifier());
            }
        }
"""

j5 = """
@Override // android.content.pm.ILauncherApps
        public String getShortcutIconUri(String str, String str2, String str3, int i) {
            ensureShortcutPermission(str);
            if (canAccessProfile(i, "Cannot access shortcuts")) {
                AndroidFuture<String> androidFuture = new AndroidFuture<>();
                this.mShortcutServiceInternal.getShortcutIconUriAsync(getCallingUserId(), str, str2, str3, i, androidFuture);
                try {
                    return androidFuture.get();
                } catch (InterruptedException | ExecutionException e2) {
                    throw new RuntimeException(e2);
                }
            }
            return null;
        }
"""



# trace event taken from j2
trace_events = """
this.mShortcutServiceInternal.cacheShortcuts(getCallingUserId(), str, str2, list, userHandle.getIdentifier(), toShortcutsCacheFlags(i));
"""

# get embedding for the trace event
embedding_trace_event = get_code_embedding(trace_events, device="cuda")

# Embed every method once, in j1..j5 order.
jadx_embeddings = [
    get_code_embedding(method_source, device="cuda")
    for method_source in (j1, j2, j3, j4, j5)
]
# Keep the individually named embeddings that other cells may refer to.
embedding_j1, embedding_j2, embedding_j3, embedding_j4, embedding_j5 = jadx_embeddings

# NOTE(review): the name `list` shadows the Python builtin. It is retained only
# for backward compatibility with any cell that still indexes `list[...]`;
# prefer `jadx_embeddings` in new code.
list = jadx_embeddings

# Pairwise cosine similarity over every unordered pair (i < j) of methods.
# The loop bounds come from the data rather than a hard-coded count.
n_methods = len(jadx_embeddings)
for i in range(n_methods):
    for j in range(i + 1, n_methods):
        cosine_sim = F.cosine_similarity(jadx_embeddings[i], jadx_embeddings[j], dim=0)
        print(f'Cosine similarity between j{i+1} and j{j+1}: {cosine_sim.item()}')


Cosine similarity between j1 and j2: 0.8550946712493896
Cosine similarity between j1 and j3: 0.710127592086792
Cosine similarity between j1 and j4: 0.7475813627243042
Cosine similarity between j1 and j5: 0.7092236280441284
Cosine similarity between j2 and j3: 0.8119555115699768
Cosine similarity between j2 and j4: 0.8571170568466187
Cosine similarity between j2 and j5: 0.8091253042221069
Cosine similarity between j3 and j4: 0.8190112709999084
Cosine similarity between j3 and j5: 0.9439752101898193
Cosine similarity between j4 and j5: 0.8089645504951477


In [7]:

# Compare the trace-event embedding against each method embedding (j1..j5).
# The embeddings are gathered locally so this cell does not depend on a global
# named `list` (which would shadow the builtin) or on a hard-coded count.
jadx_method_embeddings = [embedding_j1, embedding_j2, embedding_j3, embedding_j4, embedding_j5]

for i, method_embedding in enumerate(jadx_method_embeddings):
    cosine_sim_trace_j = F.cosine_similarity(embedding_trace_event, method_embedding, dim=0)
    print(f'Cosine similarity between trace event and j{i+1}: {cosine_sim_trace_j.item()}')

Cosine similarity between trace event and j1: 0.804334819316864
Cosine similarity between trace event and j2: 0.8724206686019897
Cosine similarity between trace event and j3: 0.6814613342285156
Cosine similarity between trace event and j4: 0.728966474533081
Cosine similarity between trace event and j5: 0.6645785570144653
