<a href="https://colab.research.google.com/github/kae1dy/NLPCodeReview/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers

from datasets import Dataset
import pandas as pd
from transformers import AutoModel, AutoTokenizer

In [6]:
# preprocessing dataset

url = "https://raw.githubusercontent.com/CommentFinder/CommentFinder/master/dataset"

data_files = {
    "train": url + "/train.tsv",
    "test":  url + "/test.tsv",
}
train = pd.read_csv(data_files["train"], header=None, sep='\t', on_bad_lines='skip', skipinitialspace=True, names=['source', 'target'])
test = pd.read_csv(data_files["test"], header=None, sep='\t', on_bad_lines='skip', skipinitialspace=True, names=['source', 'target'])

train = Dataset.from_pandas(train[:50000])
test = Dataset.from_pandas(test[:10000])

train, test

(Dataset({
     features: ['source', 'target'],
     num_rows: 50000
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 10000
 }))

In [7]:
checkpoint = "Salesforce/codet5p-110m-embedding"
device = "cuda"

# import gc
# gc.collect()

tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

# code embedding
def get_embedding(source):
    input = tokenizer(source, padding=True, truncation=True, return_tensors="pt")
    input = {k: v.to(device) for k, v in input.items()}

    model_output = model(**input)
    return model_output

train = train.map(
    lambda x: {"embedding": get_embedding(x["source"]).detach().cpu().numpy()[0]} # batched=True???
)

test = test.map(
    lambda x: {"embedding": get_embedding(x["source"]).detach().cpu().numpy()[0]} # batched=True???
)
train, test

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

(Dataset({
     features: ['source', 'target', 'embedding'],
     num_rows: 50000
 }),
 Dataset({
     features: ['source', 'target', 'embedding'],
     num_rows: 10000
 }))

In [9]:
import statistics
import time
import numpy as np
from nltk.translate import bleu_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from functools import wraps

train['embedding'], test['embedding']

print(f'\nVector Length: {len(test["embedding"][0])}.')


def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - start_time
        print(f'\nTime cost ({func.__name__} {args}): {total_time:.4f} seconds.')
        return result
    return timeit_wrapper


@timeit
def predict_top_k(test, source, topk=10) -> list:
    prediction = []
    similarity = cosine_similarity(test['embedding'], source['embedding'])

    for index, similar in enumerate(tqdm(similarity)):

        index_nn = np.argpartition(similar, -topk)[-topk:]
        current_prediction = [source['target'][i] for i in index_nn]

        prediction.append(current_prediction)

    #   write the recommendation comments to the file named as "our_predictions_k.txt"
    with open('predictions_' + str(topk) + '.txt', 'w') as f:
        for data in prediction:
            for element in data:
                f.write(element + '\n')
    return prediction

# Compute the cosine distance and its computational time
prediction = predict_top_k(test, train)
chencherry = bleu_score.SmoothingFunction()

predictions = [line.strip() for line in open("predictions_10.txt")]

# Evaluate perfect prediction & BLEU score of our approach
for k in [1, 3, 5, 10]:

    print(f'\nk candidates: {k}.')
    count_perfect = 0
    BLEUscore = []

    for i, target in enumerate(tqdm(test['target'])):
        best_BLEU = 0
        for prediction in predictions[i * k:i * k + k]:
            current_BLEU = bleu_score.sentence_bleu([target], prediction, smoothing_function=chencherry.method1)
            best_BLEU = max(best_BLEU, current_BLEU)

            if " ".join(prediction.split()) == " ".join(target.split()):
                count_perfect += 1
                break
        BLEUscore.append(best_BLEU)

    print(f'\nPP    : %d/%d (%s%.2f)' % (count_perfect, len(test['target']), '%', (count_perfect * 100) / len(test['target'])))
    print(f'BLEU mean              : ', statistics.mean(BLEUscore))


k candidates: 1.


100%|██████████| 10000/10000 [00:07<00:00, 1282.10it/s]



PP    : 0/10000 (%0.00)
BLEU mean              :  0.07186670454582522

k candidates: 3.


100%|██████████| 10000/10000 [00:15<00:00, 626.81it/s]



PP    : 0/10000 (%0.00)
BLEU mean              :  0.12376671759056854

k candidates: 5.


100%|██████████| 10000/10000 [00:23<00:00, 425.50it/s]



PP    : 0/10000 (%0.00)
BLEU mean              :  0.14684517137416095

k candidates: 10.


100%|██████████| 10000/10000 [00:45<00:00, 221.54it/s]


PP    : 225/10000 (%2.25)
BLEU mean              :  0.21998532860945674



