In [1]:
import json

# Load your data (assuming it's in a JSONL format
lang = 'java'
DATA_DIR = 'dataset/'+lang+'/'
data = []
with open(DATA_DIR + "train.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))

# Extract code snippets and queries
snippets = [{"code": entry["code_tokens"], "language": lang} for entry in data]
queries = [entry["docstring_tokens"] for entry in data]
code = [entry["code"] for entry in data]

In [3]:
import json

# Load your data
data = []
with open(DATA_DIR + "train.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))

# Save tokens to a text file
with open("fasttext_training_data.txt", "w") as f:
    for entry in data:
        f.write(" ".join(entry["code_tokens"]) + "\n")
        f.write(" ".join(entry["docstring_tokens"]) + "\n")

In [4]:
import fasttext

# Train a FastText model
model = fasttext.train_unsupervised(
    input="fasttext_training_data.txt",  # Path to your training data
    model="skipgram",  # or "cbow"
    dim=100,  # Embedding dimension
    epoch=10,  # Number of epochs
    lr=0.05,  # Learning rate
    minCount=1  # Minimum word frequency
)

# Save the trained model
model.save_model("fasttext_model.bin")

Read 20M words
Number of words:  669044
Number of labels: 0
Progress: 100.0% words/sec/thread:  120828 lr:  0.000000 avg.loss:  0.846318 ETA:   0h 0m 0s 40.3% words/sec/thread:  121001 lr:  0.029864 avg.loss:  0.970018 ETA:   0h 1m40s


In [2]:
from codesearch.encoders import BasicEncoder
from functools import partial


from codesearch.text_preprocessing import preprocess_text
from codesearch.code_preprocessing import code_tokenization

# Custom preprocessing parameters
text_preprocessing_params = {"lemmatize": True, "remove_stop": True}
code_preprocessing_params = {"language": lang}

# Initialize the BasicEncoder with custom parameters
encoder = BasicEncoder(
    description_preprocessor=partial(preprocess_text, **text_preprocessing_params),
    code_preprocessor=partial(code_tokenization, **code_preprocessing_params)
)

In [3]:
from codesearch.unif.unif_embedder import UNIFEmbedder
from codesearch.unif.unif_modules import SimilarityModel
from fasttext import load_model

# Load the FastText model
ft_model = load_model("fasttext_model.bin")

# Initialize the similarity model
sim_model = SimilarityModel(ft_model)

# Initialize the encoder (you need to define or load this)
# encoder = BasicEncoder() # Replace with your encoder

# Initialize the UNIFEmbedder
unif_embedder = UNIFEmbedder(
    model=sim_model,
    encoder=encoder,
    ft_model=ft_model,
    batch_size=32,  # Adjust based on your hardware
    max_code_len=200,  # Adjust based on your data
    max_description_len=25  # Adjust based on your data
)

Initializing the weights with fast text matrix
Initializing the weights with fast text matrix


In [4]:
queries = [entry["docstring_tokens"] for entry in data]
queries

[['Expects', 'a', 'height', 'mat', 'as', 'input'],
 ['Pops',
  'the',
  'top',
  'event',
  'off',
  'the',
  'current',
  'event',
  'stack',
  '.',
  'This',
  'action',
  'has',
  'to',
  'be',
  'performed',
  'immediately',
  'after',
  'the',
  'event',
  'has',
  'been',
  'dispatched',
  'to',
  'all',
  'listeners',
  '.'],
 ['Executes',
  'the',
  'given',
  'transaction',
  'within',
  'the',
  'context',
  'of',
  'a',
  'write',
  'lock',
  '.'],
 ['Executes',
  'the',
  'given',
  'supplier',
  'within',
  'the',
  'context',
  'of',
  'a',
  'read',
  'lock',
  '.'],
 ['This',
  'should',
  'be',
  'called',
  'from',
  'a',
  'subclass',
  'constructor',
  'if',
  'offset',
  'or',
  'length',
  'are',
  'unknown',
  'at',
  'a',
  'time',
  'when',
  'SubIIMInputStream',
  'constructor',
  'is',
  'called',
  '.',
  'This',
  'method',
  'shouldn',
  't',
  'be',
  'called',
  'more',
  'than',
  'once',
  '.'],
 ['Bessel', 'function', 'of', 'order', '0', '.'],
 ['Bess

In [6]:
# Embed queries
query_embeddings = unif_embedder.embed_queries(queries)

Embedding queries: 100%|██████████| 5154/5154 [00:21<00:00, 241.46it/s]


In [7]:
import numpy as np
# Save query embeddings
np.save("query_embeddings.npy", query_embeddings)

In [8]:
# Embed code snippets
snippet_embeddings = unif_embedder.embed_snippets(snippets)

Embedding snippets: 100%|██████████| 5154/5154 [05:31<00:00, 15.55it/s]


In [9]:
# Save code embeddings
np.save("code_embeddings.npy", snippet_embeddings) 

In [10]:
unif_embedder.save("unif_embedder")

In [11]:
import numpy as np
import json

# Load embeddings
code_embeds = np.load("code_embeddings.npy")  # Shape: [num_code_snippets, embed_dim]
query_embeds = np.load("query_embeddings.npy")  # Shape: [num_queries, embed_dim]

# Load ground truth (query_id -> correct_code_id)
valid_data = []
lang = 'java'
DATA_DIR = 'dataset/'+lang+'/'
with open(DATA_DIR + "valid.jsonl", "r") as f:
    for line in f:
        valid_data.append(json.loads(line))

In [12]:
# Extract code snippets and queries
valid_snippets = [entry["code_tokens"] for entry in valid_data]
valid_queries = [entry["docstring_tokens"] for entry in valid_data]

In [13]:
import argparse
import logging
import os
import pickle
import random
import torch # type: ignore
import json
import numpy as np
from torch.nn import CrossEntropyLoss, MSELoss # type: ignore
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset # type: ignore

In [14]:
import json
import numpy as np

# Load validation data
with open(DATA_DIR + "valid.jsonl", "r") as f:
    valid_data = [json.loads(line) for line in f]

# Extract queries (docstrings) and codes
queries = [entry["docstring_tokens"] for entry in valid_data]
codes = [entry["code_tokens"] for entry in valid_data]

In [15]:
# Generate query embeddings
query_embeddings = unif_embedder.embed_queries(queries)

# Generate code embeddings
code_snippets = [{"code": code, "language": lang} for code in codes]
code_embeddings = unif_embedder.embed_snippets(code_snippets)

Embedding queries: 100%|██████████| 162/162 [00:00<00:00, 252.47it/s]
Embedding snippets: 100%|██████████| 162/162 [00:10<00:00, 15.79it/s]


In [16]:
query_embeddings = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)
code_embeddings = code_embeddings / np.linalg.norm(code_embeddings, axis=1, keepdims=True)

In [17]:
similarity_matrix = query_embeddings @ code_embeddings.T  # Shape: [num_queries, num_codes]

In [18]:
def evaluate_topk(similarity_matrix, top_k_values=[1, 5, 10]):
    results = {f"Success@{k}": 0 for k in top_k_values}
    reciprocal_ranks = []
    
    for query_idx in range(similarity_matrix.shape[0]):
        # Get scores and sorted indices for this query
        scores = similarity_matrix[query_idx]
        sorted_indices = np.argsort(-scores)  # Descending order
        
        # The correct code is at the same index as the query
        correct_code_idx = query_idx
        rank = np.where(sorted_indices == correct_code_idx)[0][0] + 1  # 1-based
        
        # Update metrics
        for k in top_k_values:
            if rank <= k:
                results[f"Success@{k}"] += 1
        
        reciprocal_ranks.append(1 / rank)
    
    # Normalize results
    num_queries = similarity_matrix.shape[0]
    for k in top_k_values:
        results[f"Success@{k}"] /= num_queries
    results["MRR"] = np.mean(reciprocal_ranks)
    
    return results

In [19]:
# Run evaluation
metrics_java = evaluate_topk(similarity_matrix)
print("Evaluation Results:")
for metric, value in metrics_java.items():
    print(f"{metric}: {value:.4f}")

Evaluation Results:
Success@1: 0.0868
Success@5: 0.2230
Success@10: 0.3008
MRR: 0.1590
