In [None]:
# Colab script to generate Python code with CodeT5-small and compute metrics
# IPython shell escape: installs the exact package versions listed on the line.
# NOTE(review): if this replaces a torch build already loaded in the running
# kernel, a runtime restart is presumably needed before the imports — confirm.
!pip install transformers==4.40.0 torch==2.0.1 nltk==3.8.1 python-Levenshtein==0.25.1 sentencepiece==0.2.0

import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu
import Levenshtein
import re
import py_compile
import unittest
import time
import numpy as np
import os

# Clear Hugging Face cache to avoid corrupted files
# (shell escape; removes the whole ~/.cache/huggingface/ directory)
!rm -rf ~/.cache/huggingface/

# Initialize CodeT5-small with RobertaTokenizer
model_name = "Salesforce/codet5-small"
try:
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
except Exception as e:
    # Report the failure, then re-raise so the cell stops instead of
    # continuing without a tokenizer/model.
    print(f"Error loading model/tokenizer: {e}")
    raise

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prompt and reference
prompt = "Write a Python function named add that takes two integers a and b and returns their sum."
reference_code = "def add(a, b):\n    return a + b"
# Whitespace-split reference; used below as the BLEU reference token list.
reference_tokens = reference_code.split()

# Normalize code
def normalize_code(code):
    """Return ``code`` trimmed, with every whitespace run collapsed to one space."""
    trimmed = code.strip()
    # Splitting on whitespace runs and re-joining with a single space is the
    # same as substituting each run with ' '.
    pieces = re.split(r'\s+', trimmed)
    return ' '.join(pieces)

# Generate completions
def generate_completions(prompt, num_completions=5, max_length=50):
    """Sample several completions for ``prompt`` and time the generation call.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text handed to the tokenizer.
        num_completions: Number of sequences requested from ``model.generate``.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        ``(completions, inference_time)``: the decoded strings (special tokens
        skipped) and the wall-clock time of the ``generate`` call divided by
        ``num_completions``.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    sampling_options = {
        "max_length": max_length,
        "num_return_sequences": num_completions,
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
        "temperature": 0.7,
    }

    # Only generate() is timed; tokenization and decoding are excluded.
    started = time.perf_counter()
    sequences = model.generate(encoded["input_ids"], **sampling_options)
    finished = time.perf_counter()

    texts = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in sequences]
    per_sample_seconds = (finished - started) / num_completions
    return texts, per_sample_seconds

# Syntax check
def check_python_syntax(code, filename):
    """Write ``code`` to ``filename`` and report whether it byte-compiles.

    Args:
        code: Python source text to check.
        filename: Path the source is written to before compiling.

    Returns:
        True when ``py_compile`` accepts the file, False when it raises
        ``PyCompileError``.
    """
    with open(filename, "w") as handle:
        handle.write(code)

    try:
        py_compile.compile(filename, doraise=True)
    except py_compile.PyCompileError:
        return False
    else:
        return True

# Functional equivalence check
def check_functional_equivalence(code, filename):
    """Check whether ``code`` defines an ``add`` that returns the expected sums.

    The source is written to ``filename`` (as before), then compiled and
    executed in a fresh namespace, and its ``add`` is called on the same three
    input pairs the previous unit tests used.

    The previous implementation generated tests that imported a module named
    ``sample_python`` even though the candidate is saved under the
    caller-supplied ``filename``, staged those tests through a hard-coded
    ``/content`` path opened outside the ``try`` block, and ``exec``'d them
    into this module's globals. Evaluating the candidate directly removes all
    three problems.

    Args:
        code: Candidate Python source.
        filename: Path the candidate source is written to.

    Returns:
        True if every case yields the expected value; False if the code fails
        to compile or run, does not define ``add``, or returns a wrong result.
    """
    with open(filename, "w") as f:
        f.write(code)

    # (arguments, expected result) pairs.
    cases = (((2, 3), 5), ((-1, -2), -3), ((0, 5), 5))

    # Fresh namespace so candidate definitions never leak into, or clobber,
    # this module's globals. The non-"__main__" name keeps script guards idle.
    namespace = {"__name__": "sample_python"}
    try:
        # NOTE: this executes model-generated code in-process with no
        # sandboxing; only run it in a disposable environment.
        exec(compile(code, filename, "exec"), namespace)
        add = namespace["add"]
        return all(add(*args) == expected for args, expected in cases)
    except Exception:
        # Candidate code may fail in arbitrary ways (SyntaxError, KeyError for
        # a missing ``add``, TypeError, ...); any failure means "not equivalent".
        return False

# Evaluate completions
# SmoothingFunction is imported here because the cell's import block only
# brings in sentence_bleu.
from nltk.translate.bleu_score import SmoothingFunction

completions, avg_inference_time = generate_completions(prompt, num_completions=5)
normalized_reference = normalize_code(reference_code)
# Normalize each completion once; the result is reused for BLEU, the edit
# distance and the pairwise diversity loop.
normalized_completions = [normalize_code(text) for text in completions]
# Without smoothing, BLEU-4 collapses to 0 (and NLTK emits a warning) whenever
# a short completion shares no 2-, 3- or 4-grams with the reference.
bleu_smoothing = SmoothingFunction().method1
pairwise_distances = []

for i, (completion, normalized_completion) in enumerate(zip(completions, normalized_completions)):
    filename = f"/content/sample_python_{i}.py"
    syntax_valid = check_python_syntax(completion, filename)
    # The functional check is only attempted for code that compiles.
    functional_valid = check_functional_equivalence(completion, filename) if syntax_valid else False

    # BLEU Score
    completion_tokens = normalized_completion.split()
    bleu_score = sentence_bleu(
        [reference_tokens],
        completion_tokens,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=bleu_smoothing,
    )

    # Edit Distance
    edit_distance = Levenshtein.distance(normalized_reference, normalized_completion)

    # Diversity (pairwise distances): compare only with later completions so
    # every unordered pair is counted exactly once.
    for other in normalized_completions[i + 1:]:
        pairwise_distances.append(Levenshtein.distance(normalized_completion, other))

    print(f"\nCompletion {i+1}:")
    print(f"Code:\n{completion}")
    print(f"Syntax Valid: {syntax_valid}")
    print(f"Functionally Equivalent: {functional_valid}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"Edit Distance: {edit_distance}")

# Metrics Summary
# With fewer than two completions there are no pairs, so report 0.
avg_distance = np.mean(pairwise_distances) if pairwise_distances else 0
print("\nMetrics Summary:")
print(f"Average Inference Time: {avg_inference_time:.4f} seconds")
print(f"Average Pairwise Edit Distance (Diversity): {avg_distance:.2f}")

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nltk==3.8.1
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting python-Levenshtein==0.25.1
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Downloadi

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]


Completion 1:
Code:
 def
Syntax Valid: False
Functionally Equivalent: False
BLEU Score: 0.0000
Edit Distance: 24

Completion 2:
Code:

Syntax Valid: True
Functionally Equivalent: False
BLEU Score: 0.0000
Edit Distance: 27

Completion 3:
Code:

Syntax Valid: True
Functionally Equivalent: False
BLEU Score: 0.0000
Edit Distance: 27

Completion 4:
Code:
 def
Syntax Valid: False
Functionally Equivalent: False
BLEU Score: 0.0000
Edit Distance: 24

Completion 5:
Code:
 def
Syntax Valid: False
Functionally Equivalent: False
BLEU Score: 0.0000
Edit Distance: 24

Metrics Summary:
Average Inference Time: 0.0265 seconds
Average Pairwise Edit Distance (Diversity): 1.80


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  In COLING 2004.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  In COLING 2004.
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  In COLING 2004.


**Edit Distance**

In [None]:
# Colab script to generate Python code with CodeT5-small and compute metrics
# IPython shell escape. Only transformers is pinned here; torch, nltk and
# python-Levenshtein are left to whatever version pip resolves.
!pip install transformers==4.40.0 torch nltk python-Levenshtein

import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu
import Levenshtein
import re
import py_compile
import unittest
import time
import numpy as np
import os

# Clear Hugging Face cache to avoid corrupted files
# (shell escape; removes the whole ~/.cache/huggingface/ directory)
!rm -rf ~/.cache/huggingface/

# Initialize CodeT5-small with AutoTokenizer
model_name = "Salesforce/codet5-small"
try:
    # use_fast=False is passed explicitly to AutoTokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
except Exception as e:
    # Report the failure, then re-raise so the cell stops instead of
    # continuing without a tokenizer/model.
    print(f"Error loading model/tokenizer: {e}")
    raise

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prompt and reference
prompt = "Write a Python function named add that takes two integers a and b and returns their sum."
reference_code = "def add(a, b):\n    return a + b"
# Whitespace-split reference; used below as the BLEU reference token list.
reference_tokens = reference_code.split()

# Normalize code
def normalize_code(code):
    """Strip ``code`` and squeeze each run of whitespace into a single space."""
    whitespace_run = re.compile(r'\s+')
    stripped = code.strip()
    return whitespace_run.sub(' ', stripped)

# Generate completions
def generate_completions(prompt, num_completions=5, max_length=50):
    """Generate sampled completions for ``prompt`` and measure generation time.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to encode and feed to the model.
        num_completions: How many sequences ``model.generate`` should return.
        max_length: Passed through as ``max_length`` to ``model.generate``.

    Returns:
        A tuple of the decoded completions (special tokens skipped) and the
        elapsed time of the ``generate`` call divided by ``num_completions``.
    """
    input_ids = tokenizer(prompt, return_tensors="pt", padding=True).to(device)["input_ids"]

    def _to_text(token_ids):
        # Decode one generated sequence, dropping special tokens.
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    # The timer brackets only the generate() call.
    tick = time.perf_counter()
    generated = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=num_completions,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )
    tock = time.perf_counter()

    return [_to_text(token_ids) for token_ids in generated], (tock - tick) / num_completions

# Syntax check
def check_python_syntax(code, filename):
    """Save ``code`` to ``filename`` and return whether ``py_compile`` accepts it.

    Args:
        code: Python source text to validate.
        filename: Destination path; the file is (over)written with ``code``.

    Returns:
        False if compilation raises ``PyCompileError``, True otherwise.
    """
    with open(filename, "w") as out:
        out.write(code)

    compiles = True
    try:
        py_compile.compile(filename, doraise=True)
    except py_compile.PyCompileError:
        compiles = False
    return compiles

# Functional equivalence check
def check_functional_equivalence(code, filename):
    """Return True when ``code`` provides an ``add`` that sums correctly.

    ``code`` is saved to ``filename`` (unchanged side effect), then compiled
    and run in an isolated namespace; the resulting ``add`` is evaluated on
    the three input pairs the former unit tests covered.

    This replaces a flow that wrote a unittest module to a hard-coded
    ``/content`` path (outside any ``try``), had that module import
    ``sample_python`` regardless of the ``filename`` actually used, and
    ``exec``'d it into this module's globals.

    Args:
        code: Candidate Python source.
        filename: Path the candidate source is written to.

    Returns:
        True only if all cases match; False when the code cannot be compiled
        or executed, lacks ``add``, or produces a wrong value.
    """
    with open(filename, "w") as f:
        f.write(code)

    # (arguments, expected result) pairs.
    expectations = (((2, 3), 5), ((-1, -2), -3), ((0, 5), 5))

    # Isolated namespace: nothing the candidate defines reaches our globals,
    # and the non-"__main__" name keeps any script guard from running.
    scope = {"__name__": "sample_python"}
    try:
        # NOTE: runs model-generated code in-process without sandboxing; use
        # only in a throwaway environment.
        exec(compile(code, filename, "exec"), scope)
        candidate = scope["add"]
        for args, expected in expectations:
            if candidate(*args) != expected:
                return False
        return True
    except Exception:
        # Arbitrary candidate failures (syntax errors, missing ``add``, bad
        # signatures, runtime errors) all count as "not equivalent".
        return False

# Evaluate completions
# SmoothingFunction is imported here because the cell's import block only
# brings in sentence_bleu.
from nltk.translate.bleu_score import SmoothingFunction

completions, avg_inference_time = generate_completions(prompt, num_completions=5)
normalized_reference = normalize_code(reference_code)
# Each completion is normalized a single time and reused everywhere below.
normalized_completions = [normalize_code(text) for text in completions]
# Smoothing prevents BLEU-4 from collapsing to 0 (with an NLTK warning) when a
# short completion has no higher-order n-gram overlap with the reference.
bleu_smoothing = SmoothingFunction().method1
pairwise_distances = []

for i, (completion, normalized_completion) in enumerate(zip(completions, normalized_completions)):
    filename = f"/content/sample_python_{i}.py"
    syntax_valid = check_python_syntax(completion, filename)
    # Skip the functional check for code that does not even compile.
    functional_valid = check_functional_equivalence(completion, filename) if syntax_valid else False

    # BLEU Score
    completion_tokens = normalized_completion.split()
    bleu_score = sentence_bleu(
        [reference_tokens],
        completion_tokens,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=bleu_smoothing,
    )

    # Edit Distance
    edit_distance = Levenshtein.distance(normalized_reference, normalized_completion)

    # Diversity (pairwise distances): pair this completion with each later
    # one so no unordered pair is counted twice.
    for other in normalized_completions[i + 1:]:
        pairwise_distances.append(Levenshtein.distance(normalized_completion, other))

    print(f"\nCompletion {i+1}:")
    print(f"Code:\n{completion}")
    print(f"Syntax Valid: {syntax_valid}")
    print(f"Functionally Equivalent: {functional_valid}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"Edit Distance: {edit_distance}")

# Sample Diversity
# No pairs exist with fewer than two completions, so fall back to 0.
avg_distance = np.mean(pairwise_distances) if pairwise_distances else 0
print("\nMetrics Summary:")
print(f"Average Inference Time: {avg_inference_time:.4f} seconds")
print(f"Average Pairwise Edit Distance (Diversity): {avg_distance:.2f}")



RuntimeError: Failed to import transformers.models.t5.modeling_t5 because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
No module named 'torch.sparse._triton_ops'

In [None]:
# Colab script to generate Python/Verilog code with CodeT5-base, check syntax, calculate Pass@k, and compute BLEU score

# Install dependencies with specific versions
# (IPython shell escape; --quiet suppresses most of pip's output)
!pip install transformers==4.40.0 torch==2.0.1 nltk==3.8.1 pyverilog==1.3.0 sentencepiece==0.2.0 --quiet

# Clear Hugging Face cache to avoid corrupted files
# (shell escape; removes the whole ~/.cache/huggingface/ directory)
!rm -rf ~/.cache/huggingface/

import os
import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration
import pyverilog.vparser.parser as pyverilog_parser
import numpy as np
import re
import unittest
import importlib.util
import io
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Initialize CodeT5-base with RobertaTokenizer
model_name = "Salesforce/codet5-base"
try:
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
except Exception as e:
    # Report the failure, then re-raise so the cell stops instead of
    # continuing without a tokenizer/model.
    print(f"Error loading model/tokenizer: {e}")
    raise

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example prompts
# Keyed by prompt type; the same keys index reference_codes below.
prompts = {
    "verilog": """
Write a Verilog module for a 2-input AND gate with inputs a, b and output y.
""",
    "python": """
Write a Python function named add that takes two integers a and b and returns their sum.
"""
}

# Reference codes for BLEU score
# One reference implementation per prompt type.
reference_codes = {
    "verilog": """
module and_gate(
    input wire a,
    input wire b,
    output wire y
);
assign y = a & b;
endmodule
""",
    "python": """
def add(a, b):
    return a + b
"""
}

# Code tokenizer for BLEU score (handles Python and Verilog)
def code_tokenizer(code):
    """Split source text into word tokens and single punctuation characters.

    Whitespace is normalized first; every run of word characters becomes one
    token and every other non-whitespace character becomes its own token.
    """
    collapsed = re.sub(r'\s+', ' ', code.strip())
    token_pattern = re.compile(r'\w+|[^\w\s]')
    return [match.group(0) for match in token_pattern.finditer(collapsed)]

# Function to generate multiple completions
def generate_completions(prompt, num_completions=10, max_length=100):
    """Return ``num_completions`` sampled, decoded completions for ``prompt``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to encode and feed to the model.
        num_completions: Number of sequences requested from ``model.generate``.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        List of decoded strings with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    sampling_options = {
        "max_length": max_length,
        "num_return_sequences": num_completions,
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
        "temperature": 0.7,
    }
    sequences = model.generate(encoded["input_ids"], **sampling_options)

    texts = []
    for sequence in sequences:
        text = tokenizer.decode(sequence, skip_special_tokens=True)
        texts.append(text)
    return texts

# Verilog syntax check
def check_verilog_syntax(code, filename):
    """Write ``code`` to ``filename`` and report whether pyverilog parses it.

    Args:
        code: Verilog source text.
        filename: Path the source is written to before parsing.

    Returns:
        True if ``pyverilog_parser.parse`` completes, False if it (or
        unpacking its result) raises any ``Exception``.
    """
    with open(filename, "w") as handle:
        handle.write(code)

    try:
        # The parse result itself is not needed; only success matters.
        _tree, _directives = pyverilog_parser.parse([filename])
    except Exception:
        return False
    else:
        return True

# Python syntax check
def check_python_syntax(code, filename):
    """Write ``code`` to ``filename`` and report whether it byte-compiles.

    Args:
        code: Python source text to check.
        filename: Path the source is written to before compiling.

    Returns:
        True when ``py_compile`` accepts the file, False when it raises
        ``PyCompileError``.
    """
    # Imported locally: this cell's import block does not import py_compile,
    # so the function otherwise only works if an earlier cell imported it.
    import py_compile

    with open(filename, "w") as f:
        f.write(code)
    try:
        py_compile.compile(filename, doraise=True)
    except py_compile.PyCompileError:
        return False
    return True

# Verilog functional correctness (AST-based check in Colab)
def check_verilog_functional(code):
    """Structurally check that ``code`` describes the expected 2-input AND gate.

    The source is written to ``temp_verilog_check.v`` in the current working
    directory, parsed with pyverilog, and the AST is searched for a module
    named ``and_gate`` with inputs ``a`` and ``b``, a single output ``y``, and
    an ``assign`` whose left side is ``y`` and right side is ``a & b``. No
    simulation is performed.

    Args:
        code: Verilog source text.

    Returns:
        True if such a module/assignment is found; False otherwise, including
        when any exception is raised while writing, parsing or walking the AST.
    """
    try:
        with open("temp_verilog_check.v", "w") as f:
            f.write(code)
        ast, _ = pyverilog_parser.parse(["temp_verilog_check.v"])
        # NOTE(review): below, `pyverilog_parser.ast` is an attribute lookup on
        # the imported parser module, not the local `ast` parse result above.
        # Confirm the module actually exposes `ast`; if it does not, the
        # AttributeError is swallowed by `except Exception` and this function
        # always returns False.
        for item in ast.description.items:
            if isinstance(item, pyverilog_parser.ast.ModuleDef):
                if item.name == "and_gate":
                    # NOTE(review): assumes every port object has a `.first`
                    # attribute — confirm for all port declaration styles.
                    inputs = [port.first.name for port in item.portlist.ports if isinstance(port.first, pyverilog_parser.ast.Input)]
                    outputs = [port.first.name for port in item.portlist.ports if isinstance(port.first, pyverilog_parser.ast.Output)]
                    # Input order is ignored (set comparison); exactly one output named y is required.
                    if set(inputs) == {"a", "b"} and outputs == ["y"]:
                        for decl in item.items:
                            if isinstance(decl, pyverilog_parser.ast.Assign):
                                # Only the operand order `a & b` is accepted, not `b & a`.
                                # NOTE(review): assumes Assign.left/right hold the
                                # Identifier/And nodes directly rather than wrapper
                                # nodes — verify against the pyverilog AST.
                                if isinstance(decl.right, pyverilog_parser.ast.And) and \
                                   isinstance(decl.right.left, pyverilog_parser.ast.Identifier) and decl.right.left.name == 'a' and \
                                   isinstance(decl.right.right, pyverilog_parser.ast.Identifier) and decl.right.right.name == 'b' and \
                                   isinstance(decl.left, pyverilog_parser.ast.Identifier) and decl.left.name == 'y':
                                    return True
        return False
    except Exception:
        return False
    finally:
        # Always remove the scratch file, whether or not parsing succeeded.
        if os.path.exists("temp_verilog_check.v"):
            os.remove("temp_verilog_check.v")

# Python functional correctness (unit tests)
def check_python_functional(code, filename):
    """Check whether ``code`` defines an ``add`` that returns the expected sums.

    The source is written to ``filename`` (unchanged side effect; the caller
    is responsible for removing it), then compiled and executed in a fresh
    namespace, and its ``add`` is called on the same three input pairs the
    former unit tests covered.

    The previous implementation rendered a unittest module with ``filename``
    interpolated unescaped into its source, wrote it to a hard-coded
    ``/content`` path outside any ``try`` block (so it raised wherever that
    directory is missing), and ``exec``'d it into this module's globals.
    Evaluating the candidate directly avoids the scratch file, the quoting
    hazard and the namespace pollution.

    Args:
        code: Candidate Python source.
        filename: Path the candidate source is written to.

    Returns:
        True if every case yields the expected value; False if the code fails
        to compile or run, does not define ``add``, or returns a wrong result.
    """
    with open(filename, "w") as f:
        f.write(code)

    # (arguments, expected result) pairs.
    cases = (((2, 3), 5), ((-1, -2), -3), ((0, 5), 5))

    # Fresh namespace so candidate definitions never leak into, or clobber,
    # this module's globals. The non-"__main__" name keeps script guards idle.
    namespace = {"__name__": "sample_python"}
    try:
        # NOTE: this executes model-generated code in-process with no
        # sandboxing; only run it in a disposable environment.
        exec(compile(code, filename, "exec"), namespace)
        add = namespace["add"]
        return all(add(*args) == expected for args, expected in cases)
    except Exception:
        # Candidate code may fail in arbitrary ways (SyntaxError, KeyError for
        # a missing ``add``, TypeError, ...); any failure means "not correct".
        return False

# Calculate Pass@k
def calculate_pass_k(correct, k, n):
    """Estimate pass@k from ``correct`` passing samples out of ``n``.

    Evaluates ``1 - prod_{i=0}^{k-1} (1 - correct / (n - i))``.

    Args:
        correct: Number of samples that passed.
        k: Number of draws considered.
        n: Total number of samples.

    Returns:
        The estimate, or ``0.0`` when fewer than ``k`` samples are available.
    """
    if n < k:
        return 0.0
    floor = 1e-9
    # Denominators n, n-1, ..., n-k+1, clamped away from zero.
    remaining = np.maximum(floor, n - np.arange(k))
    all_fail = np.prod(1.0 - correct / remaining)
    return 1.0 - all_fail

# Main evaluation function
def evaluate_code(prompt_type, prompt, num_completions=10):
    """Generate completions for ``prompt`` and score each one.

    Every completion is syntax-checked, functionally checked (only when the
    syntax check passes), and scored with smoothed BLEU-1 and BLEU-4 against
    ``reference_codes[prompt_type]``. The per-sample file written under
    ``/content`` is removed after scoring.

    Args:
        prompt_type: ``"verilog"`` selects the Verilog checkers; any other
            value selects the Python checkers. Also keys ``reference_codes``.
        prompt: Text passed to ``generate_completions``.
        num_completions: Number of completions to request.

    Returns:
        ``(results, pass_at_k)``: a list with one dict per completion
        (``completion``, ``syntax_valid``, ``functional_valid``, ``correct``,
        ``bleu1``, ``bleu4``) and a dict of ``Pass@k`` values for each k in
        1, 5, 10 that does not exceed ``num_completions``.
    """
    samples = generate_completions(prompt, num_completions)
    reference = [code_tokenizer(reference_codes[prompt_type])]
    is_verilog = prompt_type == "verilog"
    bleu_weights = (("bleu1", (1, 0, 0, 0)), ("bleu4", (0.25, 0.25, 0.25, 0.25)))

    results = []
    correct_count = 0
    for index, sample in enumerate(samples):
        path = f"/content/sample_{prompt_type}_{index}.{'v' if is_verilog else 'py'}"

        # Syntax check
        if is_verilog:
            syntax_ok = check_verilog_syntax(sample, path)
        else:
            syntax_ok = check_python_syntax(sample, path)

        # Functional check, skipped for code that failed the syntax check
        if not syntax_ok:
            functional_ok = False
        elif is_verilog:
            functional_ok = check_verilog_functional(sample)
        else:
            functional_ok = check_python_functional(sample, path)

        passed = syntax_ok and functional_ok
        if passed:
            correct_count += 1

        # BLEU scores
        hypothesis = code_tokenizer(sample)
        scores = {}
        for label, weights in bleu_weights:
            scores[label] = sentence_bleu(
                reference,
                hypothesis,
                weights=weights,
                smoothing_function=SmoothingFunction().method1,
            )

        results.append({
            "completion": sample,
            "syntax_valid": syntax_ok,
            "functional_valid": functional_ok,
            "correct": passed,
            **scores,
        })

        # Clean up the per-sample file
        if os.path.exists(path):
            os.remove(path)

    # Pass@k for every k that the sample count supports
    pass_at_k = {
        f"Pass@{k}": calculate_pass_k(min(correct_count, num_completions), k, num_completions)
        for k in (1, 5, 10)
        if k <= num_completions
    }

    return results, pass_at_k

# Run evaluation for Verilog
def _print_results(results):
    """Print each completion with its validity flags and BLEU scores."""
    for i, result in enumerate(results):
        print(f"\nCompletion {i+1}:")
        print(f"Code:\n{result['completion']}")
        print(f"Syntax Valid: {result['syntax_valid']}")
        print(f"Functional Valid: {result['functional_valid']}")
        print(f"Correct: {result['correct']}")
        print(f"BLEU-1 Score: {result['bleu1']:.4f}")
        print(f"BLEU-4 Score: {result['bleu4']:.4f}")


print("\nEvaluating Verilog Completions")
verilog_results, verilog_pass_k = evaluate_code("verilog", prompts["verilog"])
_print_results(verilog_results)
print("\nVerilog Pass@k Metrics:", verilog_pass_k)

# Run evaluation for Python
print("\nEvaluating Python Completions")
python_results, python_pass_k = evaluate_code("python", prompts["python"])
_print_results(python_results)
print("\nPython Pass@k Metrics:", python_pass_k)