<a href="https://colab.research.google.com/github/kae1dy/NLPCodeReview/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers
!pip install umap-learn
!pip install 'umap-learn[plot]'

from datasets import Dataset
import pandas as pd
from transformers import AutoModel, AutoTokenizer

Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.2
Collecting umap-learn
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)


In [2]:
# preprocessing dataset

url = "https://raw.githubusercontent.com/CommentFinder/CommentFinder/master/dataset"

data_files = {
    "train": url + "/train.tsv",
    "test":  url + "/test.tsv",
}
train = pd.read_csv(data_files["train"], header=None, sep='\t', on_bad_lines='skip', skipinitialspace=True, names=['source', 'target'])
test = pd.read_csv(data_files["test"], header=None, sep='\t', on_bad_lines='skip', skipinitialspace=True, names=['source', 'target'])

train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test[:5000])

train, test

(Dataset({
     features: ['source', 'target'],
     num_rows: 134225
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 5000
 }))

In [3]:
!git clone https://github.com/tree-sitter/tree-sitter-java
!pip install tree_sitter

Cloning into 'tree-sitter-java'...
remote: Enumerating objects: 2191, done.[K
remote: Counting objects: 100% (650/650), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 2191 (delta 586), reused 555 (delta 531), pack-reused 1541[K
Receiving objects: 100% (2191/2191), 15.50 MiB | 17.83 MiB/s, done.
Resolving deltas: 100% (1329/1329), done.
Collecting tree_sitter
  Downloading tree_sitter-0.20.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.3/484.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree_sitter
Successfully installed tree_sitter-0.20.2


In [4]:
import tree_sitter
from tree_sitter import Language, Parser
from collections import deque
import re
from functools import partial


LANG_PATH = "./tree-sitter-java"
TARGET_PATH = "./build/my-languages.so"

Language.build_library(
    TARGET_PATH,
    [LANG_PATH]
)

JAVA_LANGUAGE = Language(TARGET_PATH, 'java')
parser = Parser()
parser.set_language(JAVA_LANGUAGE)

# get tokens from AST-tree
def tokenize(root: tree_sitter.Node) -> list:
    list_nodes = deque([root])
    leaves = []

    def name_split(str: bytes):
        name_list = re.findall(b'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', str)
        name_list = re.split(b'_+', b'_'.join(name_list))
        return name_list

    while list_nodes:
        node = list_nodes.popleft()
        if node.has_error:
            continue

        if not node.children:
            if "literal" in node.type:
                continue
            if node.is_named:
                leaves += name_split(node.text)
            else:
                leaves.append(node.text)

        for children in node.children:
            if children:
                list_nodes.append(children)
    return leaves


def parse_java_code(source):
    tokens = []

    for line in source:
        tree = parser.parse(bytes("class Test {" + line + " }", "utf8"))
        tokens.append(tokenize(tree.root_node)[4:])
    return tokens

def byte_into_str(source):
    return [[s.decode("utf8") for s in xs] for xs in source]

train = train.map(lambda batch: {"tokenized": parse_java_code(batch["source"])}, batched=True, batch_size=1024)
test = test.map(lambda batch: {"tokenized": parse_java_code(batch["source"])}, batched=True, batch_size=1024)




Map:   0%|          | 0/134225 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [5]:
import torch

checkpoint = "Salesforce/codet5p-110m-embedding"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# import gc
# gc.collect()

tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, add_prefix_space=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)


# code embedding
def get_embedding(source):
    input = tokenizer(list(map(str, source)), padding=True, truncation=True, is_split_into_words=True, return_tensors="pt") # truncation=True
    input = {k: v.to(device) for k, v in input.items()}

    model_output = model(**input)
    return model_output

train = train.map(
    # lambda batch: get_embedding(batch["source"]), batched=True, batch_size=16
    lambda batch: {"embedding": get_embedding(batch["tokenized"]).detach().cpu().numpy()[0]}, remove_columns=["source"]
)
test = test.map(
    lambda batch: {"embedding": get_embedding(batch["tokenized"]).detach().cpu().numpy()[0]}, remove_columns=["source"]
)

# with batched ~35 examples/s
# without ~50 examples/s
# verdict: batched < none-batched (why???)

# detach ... выпилить
# понять есть ли для test'a n примеров построить 10 и определить похож не похож
# проверить батчи, **input - не факт, что работает как кажется, draw_umap(...)

train.save_to_disk('./train')
test.save_to_disk('./test')

train, test


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading (…)codet5p_embedding.py:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- configuration_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)codet5p_embedding.py:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- modeling_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/134225 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/134225 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

(Dataset({
     features: ['target', 'tokenized', 'embedding'],
     num_rows: 134225
 }),
 Dataset({
     features: ['target', 'tokenized', 'embedding'],
     num_rows: 5000
 }))

In [6]:
from datasets import load_from_disk

train = load_from_disk('./train')
test = load_from_disk('./test')

In [9]:
import time
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from functools import wraps

import numpy as np


# train['embedding'], test['embedding']
print(f'\nVector Length: {len(test["embedding"][0])}.')

def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - start_time
        print(f'\nTime cost ({func.__name__}): {total_time:.4f} seconds.')
        return result
    return timeit_wrapper


@timeit
def predict_top_k(test, source, topk=10) -> list:
    prediction = []
    similarity = cosine_similarity(test['embedding'], source['embedding'])

    for index, similar in enumerate(tqdm(similarity)):

        index_nn = np.argpartition(similar, -topk)[-topk:]
        current_prediction = [source['target'][i] for i in index_nn[::-1]]

        prediction += current_prediction

    # write the recommendation comments to the file named as "predictions_k.txt"
    with open('predictions_' + str(topk) + '.txt', 'a') as f:
        for data in prediction:
            f.write(data + '\n')
    return prediction


def batch_data(data, batch_size=1024):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

# Compute the cosine distance and its computational time
prediction = []

for test_batch in batch_data(test):
    prediction += predict_top_k(test_batch, train)


Vector Length: 256.


100%|██████████| 1024/1024 [23:16<00:00,  1.36s/it]



Time cost (predict_top_k): 1420.3077 seconds.


100%|██████████| 1024/1024 [23:10<00:00,  1.36s/it]



Time cost (predict_top_k): 1411.6642 seconds.


100%|██████████| 1024/1024 [23:46<00:00,  1.39s/it]



Time cost (predict_top_k): 1446.5837 seconds.


100%|██████████| 1024/1024 [23:45<00:00,  1.39s/it]



Time cost (predict_top_k): 1448.3581 seconds.


100%|██████████| 904/904 [21:14<00:00,  1.41s/it]


Time cost (predict_top_k): 1296.9309 seconds.





In [10]:
import numpy as np
from nltk.translate import bleu_score
from tqdm import tqdm
import statistics

# Evaluate perfect prediction & BLEU score of our approach
prediction = [line.strip() for line in open("./predictions_10.txt")]
chencherry = bleu_score.SmoothingFunction()
top_k = 10

print(len(prediction))

BLEU_score = { 1:  [],
               3:  [],
               5:  [],
               10: []
}
count_perfect = { 1:  0,
                  3:  0,
                  5:  0,
                  10: 0
}

for i, target in enumerate(tqdm(test['target'])):
    best_BLEU = 0
    flag_perfect = 0
    for counter, pred in enumerate(prediction[top_k * i: top_k * i + top_k], 1):

        current_BLEU = bleu_score.sentence_bleu([pred.split()], target.split(), smoothing_function=chencherry.method1)
        best_BLEU = max(best_BLEU, current_BLEU)

        if " ".join(pred.split()) == " ".join(target.split()):
            flag_perfect = 1

        if counter in BLEU_score.keys():
            BLEU_score[counter].append(best_BLEU)
            count_perfect[counter] += flag_perfect


for k in BLEU_score.keys():
    print(f'\nPP    : %d/%d (%s%.2f)' % (count_perfect[k], len(test['target']), '%', (count_perfect[k] * 100) / len(test['target'])))
    print(f'BLEU mean              : ', statistics.mean(BLEU_score[k]))


50000


100%|██████████| 5000/5000 [00:13<00:00, 363.31it/s]


PP    : 47/5000 (%0.94)
BLEU mean              :  0.014603909229728087

PP    : 119/5000 (%2.38)
BLEU mean              :  0.03262315077277205

PP    : 154/5000 (%3.08)
BLEU mean              :  0.042517487337334715

PP    : 182/5000 (%3.64)
BLEU mean              :  0.05045316912525354





In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import pandas as pd
import umap.umap_ as umap
import umap.plot
from datasets import load_from_disk

%matplotlib inline

train = load_from_disk('./train')
test = load_from_disk('./test')
# train, test

num_vector = 1000

def draw_UMAP(data, n_neighbors=10, min_dist=0.2, metric='cosine', n_components=2):
    fit = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric
    )
    u = fit.fit_transform(data);
    fig = plt.figure()
    if n_components == 2:
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], u[:,1])
    if n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(u[:,0], u[:,1], u[:,2], s=100)
    plt.title(f'n_neighbors={n_neighbors} min_dist={min_dist}, metric={metric}', fontsize=14)

# ---------- 3D-picture ----------
# 3d best:
# 10 0.1 cosine
# 15 0.0 cosine
# 15 0.99 correlation
# 25 0.0 cosine

# fit = umap.UMAP(
#         n_neighbors=25,
#         min_dist=0.0,
#         metric='cosine',
#         n_components=3
# )
# umap_train = fit.fit_transform(train.shuffle(seed=42)["embedding"][:num_vector])
# umap_test = fit.transform(test.shuffle(seed=42)["embedding"][:num_vector])


# fig = plt.figure(figsize=(12, 12))
# axs = [fig.add_subplot(121, projection='3d'), fig.add_subplot(122, projection='3d')]

# axs[0].set_title('train embeddings')
# axs[1].set_title('test embeddings')

# axs[0].scatter(umap_train[:,0], umap_train[:,1], umap_train[:,2],
#               c=np.sqrt(umap_train[:,0] ** 2 + umap_train[:,1] ** 2 + umap_train[:,2] ** 2),
#               cmap='winter'
# )
# axs[1].scatter(umap_test[:,0], umap_test[:,1], umap_test[:,2],
#               c=np.sqrt(umap_test[:,0] ** 2 + umap_test[:,1] ** 2 + umap_test[:,2] ** 2),
#               cmap='winter'
# )
# plt.savefig('./graph_apart_3d.png')

# ----------2D-picture-------------
# 2d best:
# 5 0.0 euclidean
# 10 0.0 manh
# 25 0.25 cosine

fit = umap.UMAP(
    n_neighbors=25,
    min_dist=0.0,
    metric='cosine',
)
umap_train = fit.fit_transform(train.shuffle(seed=42)["embedding"][:num_vector])
umap_test = fit.transform(test.shuffle(seed=42)["embedding"][:num_vector])

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))

axs[0].set_title('train embeddings')
axs[1].set_title('test embeddings')

axs[0].scatter(umap_train[:,0], umap_train[:,1], c=np.sqrt(umap_train[:,0] ** 2 + umap_train[:,1] ** 2), cmap='winter')
axs[1].scatter(umap_test[:,0], umap_test[:,1], c=np.sqrt(umap_test[:,0] ** 2 + umap_test[:,1] ** 2), cmap='winter')

plt.savefig('./graph_apart_2d.png')


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import pandas as pd
import umap.umap_ as umap
import umap.plot
from datasets import load_from_disk

%matplotlib inline

train = load_from_disk('./train')
test = load_from_disk('./test')
# train, test

num_vector = 1000

fit = umap.UMAP(
    n_neighbors=25,
    min_dist=0.0,
    metric='cosine',
)
umap_train = fit.fit_transform(train.shuffle(seed=42)["embedding"][:num_vector])
umap_test = fit.transform(test.shuffle(seed=42)["embedding"][:num_vector])

plt.scatter(umap_train[:,0], umap_train[:,1], c='red', alpha=0.3)
plt.scatter(umap_test[:,0], umap_test[:,1], c='blue', alpha=0.3)

plt.savefig('./graph_togeth.png')