## AI-powered Code Autocompletion


In [22]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from datasets import load_dataset
import json

In [23]:
def jsonl_generator(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)

In [24]:
train_path = "code_search_net/python/final/jsonl/train/python_train_0.jsonl"
test_path = "code_search_net/python/final/jsonl/test/python_test_0.jsonl"
valid_path = "code_search_net/python/final/jsonl/valid/python_valid_0.jsonl"

# Example usage
for obj in jsonl_generator(train_path):
    # process each JSON object
    pass

In [25]:
def print_code_examples(filepath, num_examples=5):
    count = 0
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                example = json.loads(line)
                print(f"\n--- Example {count+1} ---")
                print(f"Function name: {example.get('func_name', 'N/A')}")
                print(
                    f"Docstring: {example.get('docstring', 'N/A')[:100]}..."
                )  # Print first 100 chars
                print(f"Code:\n{example.get('code', 'N/A')}")
                count += 1
                if count >= num_examples:
                    break

In [26]:
# Print 3 examples from the training set
print("TRAINING EXAMPLES:")
print_code_examples(train_path, 3)

TRAINING EXAMPLES:

--- Example 1 ---
Function name: train
Docstring: Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: directory that ...
Code:
def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):
    """
    Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: directory that contains a sub-directory for each known person, with its name.

     (View in source code to see train_dir example tree structure)

     Structure:
        <train_dir>/
        ├── <person1>/
        │   ├── <somename1>.jpeg
        │   ├── <somename2>.jpeg
        │   ├── ...
        ├── <person2>/
        │   ├── <somename1>.jpeg
        │   └── <somename2>.jpeg
        └── ...

    :param model_save_path: (optional) path to save model on disk
    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified
    :param knn_algo: (optional) underl

In [27]:
print("\nVALIDATION EXAMPLES:")
print_code_examples(valid_path, 2)


VALIDATION EXAMPLES:

--- Example 1 ---
Function name: learn
Docstring: Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
  ...
Code:
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs
            ):
    """Train a deepq model.

    Parameters
    

In [28]:
def print_filtered_examples(filepath, libraries=["numpy"], num_examples=3):
    count = 0
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                example = json.loads(line)
                code = example.get("code", "").lower()

                # Check if any of the libraries are used in the code
                if any(lib in code for lib in libraries):
                    print(f"\n--- Example {count+1} using target libraries ---")
                    print(f"Function name: {example.get('func_name', 'N/A')}")
                    print(f"Code:\n{example.get('code', 'N/A')}")
                    count += 1
                    if count >= num_examples:
                        break


# Print examples that use our target libraries
print("\nEXAMPLES USING TARGET LIBRARIES:")
print_filtered_examples(train_path)


EXAMPLES USING TARGET LIBRARIES:

--- Example 1 using target libraries ---
Function name: _trim_css_to_bounds
Code:
def _trim_css_to_bounds(css, image_shape):
    """
    Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.

    :param css:  plain tuple representation of the rect in (top, right, bottom, left) order
    :param image_shape: numpy shape of the image array
    :return: a trimmed plain tuple representation of the rect in (top, right, bottom, left) order
    """
    return max(css[0], 0), min(css[1], image_shape[1]), min(css[2], image_shape[0]), max(css[3], 0)

--- Example 2 using target libraries ---
Function name: face_distance
Code:
def face_distance(face_encodings, face_to_compare):
    """
    Given a list of face encodings, compare them to a known face encoding and get a euclidean distance
    for each comparison face. The distance tells you how similar the faces are.

    :param faces: List of face encodings to compare
    :param f

In [29]:
def collect_training_data(
    filepath, max_examples=1000, libraries=["tensorflow", "numpy", "transformers"]
):
    examples = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                example = json.loads(line)
                code = example.get("code", "")

                # Optionally filter for target libraries
                if not libraries or any(lib in code.lower() for lib in libraries):
                    examples.append(
                        {
                            "function_name": example.get("func_name", ""),
                            "docstring": example.get("docstring", ""),
                            "code": code,
                            "language": example.get("language", "python"),
                        }
                    )

                if len(examples) >= max_examples:
                    break

    print(f"Collected {len(examples)} examples")
    return examples


# Collect training data
training_data = collect_training_data(train_path, max_examples=500)

Collected 500 examples


In [30]:
training_data

[{'function_name': '_trim_css_to_bounds',
  'docstring': 'Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.\n\n    :param css:  plain tuple representation of the rect in (top, right, bottom, left) order\n    :param image_shape: numpy shape of the image array\n    :return: a trimmed plain tuple representation of the rect in (top, right, bottom, left) order',
  'code': 'def _trim_css_to_bounds(css, image_shape):\n    """\n    Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.\n\n    :param css:  plain tuple representation of the rect in (top, right, bottom, left) order\n    :param image_shape: numpy shape of the image array\n    :return: a trimmed plain tuple representation of the rect in (top, right, bottom, left) order\n    """\n    return max(css[0], 0), min(css[1], image_shape[1]), min(css[2], image_shape[0]), max(css[3], 0)',
  'language': 'python'},
 {'function_name': 'face_distance',
  'docstring': "Given

In [31]:
training_data[1]

{'function_name': 'face_distance',
 'docstring': "Given a list of face encodings, compare them to a known face encoding and get a euclidean distance\n    for each comparison face. The distance tells you how similar the faces are.\n\n    :param faces: List of face encodings to compare\n    :param face_to_compare: A face encoding to compare against\n    :return: A numpy ndarray with the distance for each face in the same order as the 'faces' array",
 'code': 'def face_distance(face_encodings, face_to_compare):\n    """\n    Given a list of face encodings, compare them to a known face encoding and get a euclidean distance\n    for each comparison face. The distance tells you how similar the faces are.\n\n    :param faces: List of face encodings to compare\n    :param face_to_compare: A face encoding to compare against\n    :return: A numpy ndarray with the distance for each face in the same order as the \'faces\' array\n    """\n    if len(face_encodings) == 0:\n        return np.empty((0

In [32]:
import random
from transformers import AutoTokenizer

# Load a code-optimized tokenizer
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")


def prepare_training_examples(examples, tokenizer, max_length=512):
    """Convert raw code examples into training samples for code completion."""
    training_samples = []

    for example in examples:
        code = example["code"]

        # Skip empty or very short functions
        if len(code.strip()) < 20:
            continue

        # Tokenize the code
        tokenized = tokenizer(code, truncation=True, max_length=max_length)
        input_ids = tokenized["input_ids"]

        # Create training examples with different completion points
        # We'll randomly mask 10-50% of the tokens at the end
        seq_length = len(input_ids)
        if seq_length > 20:  # Ensure there's enough content to mask
            # Create a few different masking positions for each example
            for _ in range(3):
                # Decide how much to keep (50-90% of tokens)
                keep_percent = random.uniform(0.5, 0.9)
                keep_tokens = int(seq_length * keep_percent)

                # Create input/target pairs
                input_sample = input_ids[:keep_tokens]
                target_sample = input_ids[keep_tokens:]

                training_samples.append(
                    {
                        "input_ids": input_sample,
                        "labels": target_sample,
                        "original_code": code,
                        "function_name": example["function_name"],
                    }
                )

    print(
        f"Created {len(training_samples)} training samples from {len(examples)} examples"
    )
    return training_samples


# Prepare the training data
training_samples = prepare_training_examples(training_data, tokenizer)



Created 1500 training samples from 500 examples


In [34]:
from transformers import AutoModelForCausalLM
from peft import (
    get_peft_config,
    PeftModel,
    PeftConfig,
    get_peft_model,
    LoraConfig,
    TaskType,
)

# 1. Load a pre-trained code model (small enough for Colab)
model_name = "Salesforce/codegen-350M-mono"  # A relatively small code model
model = AutoModelForCausalLM.from_pretrained(model_name)

# 2. Configure LoRA adapter
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,  # Rank of update matrices
    lora_alpha=32,  # Parameter scaling factor
    lora_dropout=0.1,
)

# 3. Create PEFT model
model = get_peft_model(model, peft_config)
print(
    f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
)
print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")
print(
    f"Percentage of trainable parameters: {100 * sum(p.numel() for p in model.parameters() if p.requires_grad) / sum(p.numel() for p in model.parameters()):.2f}%"
)

Trainable parameters: 655360
Total parameters: 357367808
Percentage of trainable parameters: 0.18%


In [38]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

text = "def read_json(filename, print=False):"
input_ids = tokenizer(text, return_tensors="pt").input_ids

generated_ids = model.generate(input_ids, max_length=128)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


def read_json(filename, print=False):
    """
    Reads a JSON file and returns a list of dictionaries.
    """
    with open(filename, 'r') as f:
        return json.load(f)

def write_json(filename, data):
    """
    Writes a JSON file.
    """
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

def read_csv(filename, print=False):
    """
    Reads a CSV


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
import numpy as np


class CodeCompletionDataset(Dataset):
    def __init__(self, samples, tokenizer, max_length=512):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Get input and target
        input_ids = sample["input_ids"]
        labels = sample["labels"]

        # Combine input with a special token and the beginning of labels for training
        # We typically include the first token of labels in the input to help the model start
        combined_ids = input_ids + labels

        # Handle truncation if needed
        if len(combined_ids) > self.max_length:
            combined_ids = combined_ids[: self.max_length]

        # Create attention mask
        attention_mask = [1] * len(combined_ids)

        # Pad sequences if needed
        padding_length = self.max_length - len(combined_ids)
        if padding_length > 0:
            combined_ids = combined_ids + [self.tokenizer.pad_token_id] * padding_length
            attention_mask = attention_mask + [0] * padding_length

        # Set up labels (set to -100 for input portion to ignore in loss)
        labels = [-100] * len(input_ids) + combined_ids[len(input_ids) :]

        # Ensure all sequences have the right length
        if len(labels) > self.max_length:
            labels = labels[: self.max_length]
        elif len(labels) < self.max_length:
            labels = labels + [-100] * (self.max_length - len(labels))

        return {
            "input_ids": torch.tensor(combined_ids),
            "attention_mask": torch.tensor(attention_mask),
            "labels": torch.tensor(labels),
        }


# Create train and validation datasets
train_samples = training_samples[: int(0.9 * len(training_samples))]
val_samples = training_samples[int(0.9 * len(training_samples)) :]

train_dataset = CodeCompletionDataset(train_samples, tokenizer)
val_dataset = CodeCompletionDataset(val_samples, tokenizer)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./code-completion-model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # To simulate larger batch size
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.01,
    fp16=True,  # Enable mixed-precision training
    load_best_model_at_end=True,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned adapter
model.save_pretrained("./code-completion-adapter")

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).

In [None]:
def generate_completion(model, tokenizer, function_prefix, max_new_tokens=100):
    inputs = tokenizer(function_prefix, return_tensors="pt")

    # Generate completion
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.95,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the generated tokens
    completed_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Return only the newly generated part
    return completed_code[
        len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)) :
    ]


# Test with some examples
test_prefixes = [
    "def train_model(X_train, y_train):\n    # Create TensorFlow model\n    model = tf.keras",
    "def process_image(image_path):\n    # Load and preprocess image\n    import numpy as np\n    img = ",
    "def create_bert_classifier():\n    # Initialize a BERT model from HuggingFace\n    from transformers import ",
]

for prefix in test_prefixes:
    completion = generate_completion(model, tokenizer, prefix)
    print(f"\nPrefix:\n{prefix}")
    print(f"\nCompletion:\n{completion}")
    print("-" * 50)