<a href="https://colab.research.google.com/github/kae1dy/NLPCodeReview/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
# preprocessing dataset

url = "https://raw.githubusercontent.com/CommentFinder/CommentFinder/master/dataset"

# One TSV file per split, all hosted under the same base URL.
data_files = {split: url + "/" + split + ".tsv" for split in ("train", "test")}

# Both files are read the same way: no header row, tab-separated, two columns
# named 'source' and 'target'; rows pandas cannot parse are skipped.
read_options = dict(
    header=None,
    sep='\t',
    on_bad_lines='skip',
    skipinitialspace=True,
    names=['source', 'target'],
)
train_frame = pd.read_csv(data_files["train"], **read_options)
test_frame = pd.read_csv(data_files["test"], **read_options)

# Convert to Hugging Face datasets; only the first 5000 test rows are kept.
train = Dataset.from_pandas(train_frame)
test = Dataset.from_pandas(test_frame[:5000])

train, test

(Dataset({
     features: ['source', 'target'],
     num_rows: 134225
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 5000
 }))

In [3]:
checkpoint = "Salesforce/codet5p-110m-embedding"
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# `.to(device)` below does not fail on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

import gc
gc.collect()

# NOTE(security): trust_remote_code=True executes Python files downloaded from the
# model repository. Passing `revision=<reviewed commit>` to both calls would avoid
# silently picking up newer versions of that code.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

# code embedding
def get_embedding(source):
    """Tokenize ``source`` and return the model output for it.

    The input is tokenized with padding and truncation enabled, every
    resulting tensor is moved to ``device``, and the batch is passed through
    ``model``. The forward pass runs under ``torch.no_grad()`` because this
    is inference only: no autograd graph is recorded, so the result carries
    no gradient history (calling ``.detach()`` on it is still valid).

    Args:
        source: Value handed straight to ``tokenizer`` (the notebook passes
            one source string per call).

    Returns:
        Whatever ``model(**inputs)`` returns; callers in this notebook treat
        it as a tensor and take row 0 as the embedding.
    """
    encoded = tokenizer(source, padding=True, truncation=True, return_tensors="pt")
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        return model(**encoded)

def _embed_example(example):
    """Return the ``embedding`` entry for one dataset row, computed from its ``source`` text."""
    return {"embedding": get_embedding(example["source"]).detach().cpu().numpy()[0]}


def _add_embeddings(dataset):
    """Add an ``embedding`` column to ``dataset`` and drop its ``source`` column.

    A dataset that already has an ``embedding`` column is returned unchanged.
    This keeps the cell re-runnable: ``train``/``test`` are overwritten below,
    so on a second run the ``source`` column no longer exists and mapping
    again would fail.
    """
    if "embedding" in dataset.column_names:
        return dataset
    return dataset.map(_embed_example, remove_columns=["source"])


# Rows are embedded one at a time on purpose; throughput measured for this notebook:
# with batched 30.83 examples/s
# without 55.29 examples/s
train = _add_embeddings(train)
test = _add_embeddings(test)

train, test

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading (…)codet5p_embedding.py:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- configuration_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)codet5p_embedding.py:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- modeling_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/134225 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

(Dataset({
     features: ['target', 'embedding'],
     num_rows: 134225
 }),
 Dataset({
     features: ['target', 'embedding'],
     num_rows: 5000
 }))

In [4]:
import statistics
import time
import numpy as np
from nltk.translate import bleu_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from functools import wraps

# Report the embedding dimensionality. Only the first test row is read, rather than
# materializing the whole 'embedding' column just to look at its first element.
print(f'\nVector Length: {len(test[0]["embedding"])}.')


def timeit(func):
    """Decorator that prints the wall-clock duration of each call to ``func``.

    The wrapped function's return value is passed through unchanged, and its
    metadata (name, docstring) is preserved via ``functools.wraps``. The
    duration is measured with ``time.perf_counter`` and printed only when the
    call returns normally.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        began_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - began_at
        print(f'\nTime cost ({func.__name__}): {elapsed:.4f} seconds.')
        return outcome

    return timed_call


def predict_top_k(test, source, topk=10) -> list:
    """Return, for every test embedding, the targets of its most similar source rows.

    Similarity is the cosine similarity between each ``test['embedding']`` row
    and each ``source['embedding']`` row.

    Args:
        test: Mapping-like object with an ``'embedding'`` entry (one vector per query).
        source: Mapping-like object with ``'embedding'`` and ``'target'`` entries of
            equal length (the candidate pool).
        topk: Number of candidates to return per query. Values larger than the
            number of source rows are clamped to it; values <= 0 yield empty lists.

    Returns:
        A list with one entry per query; each entry is a list of up to ``topk``
        target strings ordered from most to least similar.

    Side effects:
        Writes every returned target, one per line, to ``predictions_<topk>.txt``
        in the current directory. The file is opened in write mode, so a later
        call with the same ``topk`` replaces the previous file.
    """
    # Read the target column once. Looking it up inside the loops would re-read
    # the whole column for every neighbour of every query.
    source_targets = source['target']
    similarity = cosine_similarity(test['embedding'], source['embedding'])

    # argpartition raises when asked for more elements than exist, and a value of 0
    # would make the `[-k:]` slice return everything, so clamp into [0, n].
    k = max(0, min(topk, len(source_targets)))

    prediction = []
    for similar in tqdm(similarity):
        if k == 0:
            prediction.append([])
            continue

        # argpartition selects the k largest scores in arbitrary order ...
        index_nn = np.argpartition(similar, -k)[-k:]
        # ... so rank just those k by descending similarity.
        index_nn = index_nn[np.argsort(-similar[index_nn], kind='stable')]

        prediction.append([source_targets[i] for i in index_nn])

    # Write the recommended comments to "predictions_<topk>.txt"; UTF-8 is set
    # explicitly so non-ASCII text does not depend on the platform default.
    with open('predictions_' + str(topk) + '.txt', 'w', encoding='utf-8') as f:
        for data in prediction:
            for element in data:
                f.write(element + '\n')
    return prediction


def batch_data(data, batch_size=1000):
    """Lazily yield consecutive slices of ``data`` of at most ``batch_size`` items.

    ``data`` only needs to support ``len()`` and slicing; each yielded batch is
    whatever ``data[start:stop]`` returns. The final batch may be shorter.
    """
    batch_starts = range(0, len(data), batch_size)
    yield from (data[begin:begin + batch_size] for begin in batch_starts)


chencherry = bleu_score.SmoothingFunction()
BLEUscore = []      # best BLEU among the k candidates, one value per test example
count_perfect = 0   # test examples with at least one whitespace-normalized exact match
k = 10              # number of candidates retrieved per test example


def _normalize_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    return " ".join(text.split())


for test_batch in batch_data(test):
    # Retrieve the k most similar training targets for every example in the batch.
    # NOTE(review): predict_top_k opens predictions_<k>.txt in write mode on every call,
    # so after this loop the file holds the predictions of the last batch only.
    prediction = predict_top_k(test_batch, train, k)

    # Evaluate perfect prediction & BLEU score of our approach
    for i, target in enumerate(tqdm(test_batch['target'])):
        # The normalized target does not depend on the candidate, so compute it once.
        normalized_target = _normalize_whitespace(target)
        best_BLEU = 0
        for pred in prediction[i]:
            # NOTE(review): target and pred are passed as plain strings, so the n-grams
            # are taken over characters, not words. Confirm that character-level BLEU is
            # intended before comparing with word-level BLEU reported elsewhere.
            current_BLEU = bleu_score.sentence_bleu([target], pred, smoothing_function=chencherry.method1)
            best_BLEU = max(best_BLEU, current_BLEU)

            if _normalize_whitespace(pred) == normalized_target:
                count_perfect += 1
                break
        BLEUscore.append(best_BLEU)


# len(test) gives the row count without materializing the whole 'target' column.
total_examples = len(test)
# Guard the summary statistics so an empty test set reports zeros instead of raising.
perfect_percent = (count_perfect * 100) / total_examples if total_examples else 0.0
mean_BLEU = statistics.mean(BLEUscore) if BLEUscore else 0.0

print(f'\nPP    : {count_perfect}/{total_examples} (%{perfect_percent:.2f})')
print('BLEU mean              : ', mean_BLEU)


Vector Length: 256.


100%|██████████| 1000/1000 [22:55<00:00,  1.38s/it]
100%|██████████| 1000/1000 [00:04<00:00, 210.77it/s]
100%|██████████| 1000/1000 [22:20<00:00,  1.34s/it]
100%|██████████| 1000/1000 [00:04<00:00, 230.43it/s]
100%|██████████| 1000/1000 [23:00<00:00,  1.38s/it]
100%|██████████| 1000/1000 [00:03<00:00, 256.46it/s]
100%|██████████| 1000/1000 [23:14<00:00,  1.39s/it]
100%|██████████| 1000/1000 [00:04<00:00, 218.78it/s]
100%|██████████| 1000/1000 [22:39<00:00,  1.36s/it]
100%|██████████| 1000/1000 [00:04<00:00, 246.67it/s]


PP    : 204/5000 (%4.08)
BLEU mean              :  0.24526899805535846



