In [16]:
import math
import random
from collections import defaultdict
import math
import random
from collections import defaultdict
import pandas as pd
import ast
import javalang
import json

def tokenize_java(code):
    """
    Split a Java source string into lexical tokens via javalang.

    Returns the token values (strings) in source order. Any exception
    raised while tokenizing is swallowed and reported as an empty list.
    """
    try:
        return [token.value for token in javalang.tokenizer.tokenize(code)]
    except Exception:
        return []

In [17]:
class NGramModel:
    """
    Count-based N-gram language model with add-one (Laplace) smoothing.

    Attributes:
        N: model order; each prediction conditions on the previous N-1 tokens.
        model: mapping prefix tuple -> {next_token: count}.
        prefix_totals: mapping prefix tuple -> number of continuations seen.
        vocab: set of tokens observed in the predicted (last) n-gram position.
        vocab_size: len(vocab), refreshed at the end of every train() call.
    """

    def __init__(self, N):
        """
        Create an empty model of order N.

        Raises:
            ValueError: if N is smaller than 1 (the sliding window would be
                meaningless).
        """
        if N < 1:
            raise ValueError(f"N must be a positive integer, got {N!r}")
        self.N = N
        self.model = defaultdict(lambda: defaultdict(int))  # counts
        self.prefix_totals = defaultdict(int)              # total counts per prefix
        self.vocab = set()                                 # store vocab once
        self.vocab_size = 0

    def _ngrams(self, tokens):
        """Yield (prefix_tuple, next_token) for every N-token window of tokens."""
        context = self.N - 1
        # Sequences shorter than N produce an empty range and are skipped.
        for end in range(context, len(tokens)):
            yield tuple(tokens[end - context:end]), tokens[end]

    def train(self, tokens_list):
        """
        Accumulate n-gram counts from a list of tokenized sequences.

        Counts are added to whatever is already stored, so train() may be
        called repeatedly. Sequences shorter than N contribute nothing.
        """
        for tokens in tokens_list:
            for prefix, next_token in self._ngrams(tokens):
                self.model[prefix][next_token] += 1
                self.prefix_totals[prefix] += 1
                self.vocab.add(next_token)
        self.vocab_size = len(self.vocab)

    def prob(self, prefix, next_token):
        """
        Laplace-smoothed probability of next_token following prefix.

        Computes (count + 1) / (prefix_total + vocab_size). The lookup is
        read-only: unseen prefixes are NOT inserted into the count tables.
        Returns 0.0 for a model that has seen no data at all.
        """
        prefix = tuple(prefix)
        # Use .get() rather than [] so the defaultdict does not create an
        # empty entry for a prefix that was never observed in training.
        followers = self.model.get(prefix)
        count_next = followers.get(next_token, 0) if followers else 0
        denominator = self.prefix_totals.get(prefix, 0) + self.vocab_size
        if denominator == 0:
            # Untrained model: nothing to normalise by.
            return 0.0
        return (count_next + 1) / denominator

    def perplexity(self, tokens_list):
        """
        Compute perplexity over a list of tokenized sequences.

        Returns exp of the negative mean log-probability over every N-gram
        window. Returns float("inf") when no window could be scored or when
        the model is untrained. Does not modify the model.
        """
        total_log_prob = 0.0
        total_tokens = 0

        for tokens in tokens_list:
            for prefix, target in self._ngrams(tokens):
                p = self.prob(prefix, target)
                if p <= 0.0:
                    # Only reachable for an untrained model (see prob()).
                    return float("inf")
                total_log_prob += math.log(p)
                total_tokens += 1

        if total_tokens == 0:
            return float("inf")
        return math.exp(-total_log_prob / total_tokens)

    def sample_next(self, prefix):
        """
        Sample a next token given a prefix, weighted by training counts.

        Returns None when the prefix has no recorded continuation.
        """
        followers = self.model.get(tuple(prefix))
        # An empty mapping is treated like a missing one: random.choices
        # raises IndexError on an empty population.
        if not followers:
            return None
        tokens = list(followers)
        counts = list(followers.values())
        return random.choices(tokens, weights=counts, k=1)[0]

In [18]:
def find_best_model(data_path, test_size=4000, n_values=range(3, 12, 2)):
    """
    Pick the n-gram order with the lowest perplexity on held-out data.

    Reads the CSV at data_path, tokenizes the "original_code" column with
    tokenize_java, and splits rows by the "dataset_split" column ("train" /
    "test"). The first test_size test rows are used to score each candidate
    order; the remaining test rows are returned for sampling.

    Args:
        data_path: path of the CSV file to load.
        test_size: number of leading test rows used for perplexity.
        n_values: iterable of candidate N values to evaluate.

    Returns:
        (best_N, train_tokens, sample_tokens), where best_N is None only if
        n_values is empty.
    """
    df = pd.read_csv(data_path)

    # tokenize_java already returns Python lists, so no literal_eval
    # round-trip is needed on its output.
    tokenized = df["original_code"].apply(tokenize_java)

    train_tokens = tokenized[df["dataset_split"] == "train"].tolist()
    held_out = tokenized[df["dataset_split"] == "test"].tolist()
    test_tokens = held_out[:test_size]
    # Start exactly where the evaluation slice ends so no row is dropped.
    sample_tokens = held_out[test_size:]

    best_N, best_ppl = None, float("inf")

    for N in n_values:
        model = NGramModel(N)
        model.train(train_tokens)
        ppl = model.perplexity(test_tokens)
        print(f"N={N}, Perplexity={ppl:.2f}")
        # Accept the first candidate unconditionally so best_N is a usable
        # order even when every perplexity is infinite (e.g. empty test set).
        if best_N is None or ppl < best_ppl:
            best_ppl, best_N = ppl, N

    print(f"Best model: N={best_N} with perplexity {best_ppl:.2f}")
    return best_N, train_tokens, sample_tokens

def sample_sequence(best_N, train_tokens, start_tokens, max_len=50, model=None):
    """
    Extend start_tokens by sampling from an n-gram model.

    Generation stops when the sequence reaches max_len tokens or the model
    has no continuation for the current prefix.

    Args:
        best_N: order of the n-gram model; the prefix is the last best_N-1 tokens.
        train_tokens: tokenized training sequences, used only when no model
            is supplied.
        start_tokens: seed tokens; not modified.
        max_len: maximum length of the generated token sequence.
        model: optional already-trained model exposing sample_next(prefix)
            and prob(prefix, token). When omitted, a new NGramModel(best_N)
            is trained on train_tokens (expensive if done per call).

    Returns:
        (tok_seq, result): tok_seq is the seed followed by the sampled
        tokens; result is the seed tokens followed by (token, probability)
        tuples, with the probability rounded to 5 decimals.
    """
    if model is None:
        model = NGramModel(best_N)
        model.train(train_tokens)

    context = best_N - 1
    tok_seq = list(start_tokens)
    result = list(start_tokens)
    while len(tok_seq) < max_len:
        # seq[-0:] would return the whole sequence, so a unigram model
        # (context == 0) needs an explicit empty prefix.
        prefix = tok_seq[-context:] if context > 0 else []
        next_tok = model.sample_next(prefix)
        if next_tok is None:
            break  # no continuation found
        # Score only real tokens; the probability of None is meaningless.
        probab = round(model.prob(prefix, next_tok), 5)
        result.append((next_tok, probab))
        tok_seq.append(next_tok)
    return tok_seq, result


def write_output(op_data, op_filename = "output.json"):
    """
    Serialize op_data as pretty-printed JSON (4-space indent) to op_filename.

    The file is opened in text write mode, so existing content is replaced.
    A confirmation line is printed once the data has been written.
    """
    with open(op_filename, "w") as out_file:
        json.dump(op_data, out_file, indent=4)
    print(f"Output successfully written to {op_filename}")

In [19]:
def main(data_path="methods.csv", output_path="output.json", max_len=30):
    """
    Run the full pipeline: select the best N, generate completions, save them.

    Args:
        data_path: CSV file passed to find_best_model.
        output_path: JSON file the results are written to.
        max_len: maximum token length of each generated sequence.

    For every held-out sample, the first best_N tokens are used as the
    prompt. The output maps the sample index to its prompt ("input"), the
    generated token sequence ("generated tokens") and the prompt followed by
    (token, probability) pairs ("token prob").
    """
    best_N, train_tokens, held_out = find_best_model(data_path)
    prompts = [tokens[:best_N] for tokens in held_out]

    result_dict = {}
    # NOTE: sample_sequence is handed the raw training tokens and fits an
    # NGramModel from them on each call, so this loop retrains per prompt.
    for i, seq in enumerate(prompts):
        generated, prob_seq = sample_sequence(best_N, train_tokens, seq, max_len=max_len)
        result_dict[i] = {"input": seq,
                          "generated tokens": generated,
                          "token prob": prob_seq}
    write_output(result_dict, output_path)
In [20]:
# Entry point: run the whole pipeline only when this code is executed as the
# main program (not when it is imported as a module).
if __name__ == "__main__":
    main()

N=3, Perplexity=2709.65
N=5, Perplexity=15135.37
N=7, Perplexity=30042.56
N=9, Perplexity=38290.17
N=11, Perplexity=42397.12
Best model: N=3 with perplexity 2709.65
Output successfully written to output.json
