In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import tensorflow as tf

In [25]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True, cache_dir="./.models").cuda()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [71]:
input_text = '''
def multiply(a, b):
    """Complete the function that takes two integers and returns 
    the product of their unit digits.
    Assume the input is always valid.
    Examples:
    multiply(148, 412) should return 16.
    multiply(19, 28) should return 72.
    multiply(2020, 1851) should return 0.
    multiply(14,-15) should return 20.
    """
    return abs(a * 10) * abs(b % 10)
'''
inputs = tokenizer(input_text, return_tensors="pt", padding='max_length', max_length=1000).to(model.device)

with torch.no_grad():
    last_hidden_state = model(**inputs, output_hidden_states=True).hidden_states[-1]
    print(last_hidden_state)

tensor([[[-0.1228,  0.1144, -0.0702,  ...,  0.1505, -0.1266, -0.0158],
         [-0.1228,  0.1144, -0.0702,  ...,  0.1505, -0.1266, -0.0158],
         [-0.1228,  0.1144, -0.0702,  ...,  0.1505, -0.1266, -0.0158],
         ...,
         [-0.0877, -0.1349,  0.2730,  ...,  0.1404, -0.2630,  0.2289],
         [-0.0689,  0.1239,  0.1794,  ...,  0.0543, -0.1602,  0.0352],
         [ 0.2016,  0.3785, -0.0902,  ..., -0.0068, -0.0309, -0.3164]]],
       device='cuda:0')


In [72]:
input_text = '''
def multiply(a, b):
    """Complete the function that takes two integers and returns 
    the product of their unit digits.
    Assume the input is always valid.
    Examples:
    multiply(148, 412) should return 16.
    multiply(19, 28) should return 72.
    multiply(2020, 1851) should return 0.
    multiply(14,-15) should return 20.
    """
    return abs(a % 10) * abs(b % 10)
'''

inputs = tokenizer(input_text, return_tensors="pt", padding='max_length', max_length=1000).to(model.device)
with torch.no_grad():
    last_hidden_state2 = model(**inputs, output_hidden_states=True).hidden_states[-1]
    print(last_hidden_state2)

tensor([[[-0.1226,  0.1177, -0.0766,  ...,  0.1515, -0.1237, -0.0240],
         [-0.1226,  0.1177, -0.0766,  ...,  0.1515, -0.1237, -0.0240],
         [-0.1226,  0.1177, -0.0766,  ...,  0.1515, -0.1237, -0.0240],
         ...,
         [-0.0678, -0.0914,  0.2131,  ...,  0.2443, -0.0695,  0.3230],
         [-0.1141,  0.1411,  0.1703,  ...,  0.2180, -0.0491,  0.1528],
         [ 0.2050,  0.3780, -0.0561,  ...,  0.0264, -0.0401, -0.3204]]],
       device='cuda:0')


In [73]:
import torch

print("Euclidean Distance:",tf.norm(last_hidden_state2.cpu()-last_hidden_state.cpu(),ord='euclidean'))

Euclidean Distance: tf.Tensor(32.332375, shape=(), dtype=float32)


In [91]:
def compute_dis(src1, src2):

    size1 = tokenizer(src1, return_tensors="pt").to(model.device)['input_ids'].shape[1]
    size2 = tokenizer(src2, return_tensors="pt").to(model.device)['input_ids'].shape[1]
    size = max(size1, size2)

    inputs1 = tokenizer(src1, return_tensors="pt", padding='max_length', max_length=size).to(model.device)
    inputs2 = tokenizer(src2, return_tensors="pt", padding='max_length', max_length=size).to(model.device)

    with torch.no_grad():
        last_hidden_state1 = model(**inputs1, output_hidden_states=True).hidden_states[-1]
        last_hidden_state2 = model(**inputs2, output_hidden_states=True).hidden_states[-1]

    return tf.norm(last_hidden_state2.cpu()-last_hidden_state1.cpu(),ord='euclidean')

In [99]:
import glob
import os
# All files and directories ending with .txt and that don't begin with a dot:
mutants = glob.glob("mutpy/mutants/human_eval/*/m*.py")

for m in mutants:
    name = m.split('/')[-1].split('.py')[0]
    ind = m.split('/')[-2].replace('m', '')
    original_file = m.replace(name, f'src{ind}')
    with open(original_file, 'r') as file:
        original_src = file.read()
    with open(m, 'r') as file:
        mutant_src = file.read()
    os.system(f'echo {name} >> mutant_dis.txt')
    os.system(f'echo {compute_dis(original_src, mutant_src)} >> mutant_dis.txt')

In [116]:
from statistics import median

survived = set()
killed = set()
survived_dis = []
killed_dis = []
with open('mutpy/mutant_status.txt', 'r') as f:
    for l in f.readlines():
        l = l.strip()
        if l.startswith('m'):
            cur_mut = l
        elif 'Survived' in l:
            survived.add(cur_mut)
        elif 'Killed' in l:
            killed.add(cur_mut)
        elif 'Timeout' in l:
            killed.add(cur_mut)
        else:
            raise Exception('unknown type')
with open('mutant_dis.txt', 'r') as f:
    for l in f.readlines():
        l = l.strip()
        if l.startswith('m'):
            cur_mut = l
        else:
            dis = float(l)
            if cur_mut in survived:
                survived_dis.append(dis)
            elif cur_mut in killed:
                killed_dis.append(dis)
            else:
                print(cur_mut)
                continue

print(sum(survived_dis) / len(survived))
print(sum(killed_dis) / len(killed))

print(median(survived_dis))
print(median(killed_dis))

253.35802700645044
223.70555181424334
245.00821685791016
227.03334045410156


In [117]:
mutants = glob.glob("mutpy/mutants/human_eval/*/m*.py")
selected_inds = set()

for m in mutants:
    name = m.split('/')[-1].split('.py')[0]
    ind = m.split('/')[-2].replace('m', '')
    if name in survived:
        selected_inds.add(ind)

for m in mutants:
    name = m.split('/')[-1].split('.py')[0]
    ind = m.split('/')[-2].replace('m', '')
    if ind not in selected_inds:
        continue
    original_file = m.replace(name, f'src{ind}')
    with open(original_file, 'r') as file:
        original_src = file.read()
    with open(m, 'r') as file:
        mutant_src = file.read()
    os.system(f'echo {name} >> selected_mutant_dis.txt')
    os.system(f'echo {compute_dis(original_src, mutant_src)} >> selected_mutant_dis.txt')

In [119]:
from statistics import median

survived = set()
killed = set()
survived_dis = []
killed_dis = []
with open('mutpy/mutant_status.txt', 'r') as f:
    for l in f.readlines():
        l = l.strip()
        if l.startswith('m'):
            cur_mut = l
        elif 'Survived' in l:
            survived.add(cur_mut)
        elif 'Killed' in l:
            killed.add(cur_mut)
        elif 'Timeout' in l:
            killed.add(cur_mut)
        else:
            raise Exception('unknown type')
with open('selected_mutant_dis.txt', 'r') as f:
    for l in f.readlines():
        l = l.strip()
        if l.startswith('m'):
            cur_mut = l
        else:
            dis = float(l)
            if cur_mut in survived:
                survived_dis.append(dis)
            elif cur_mut in killed:
                killed_dis.append(dis)
            else:
                print(cur_mut)
                continue

print(sum(survived_dis) / len(survived))
print(sum(killed_dis) / len(killed))

print(median(survived_dis))
print(median(killed_dis))

253.35802700645044
127.01595306396484
245.00821685791016
239.3861846923828
