Analysis Dataset

In [1]:
from datasets import load_dataset
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset and convert to dataframe
ds = load_dataset("code_search_net", "python", trust_remote_code=True)
df = ds['train'].to_pandas()

# Retrieve necessary data series.
df = df[['func_name', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens']]


In [3]:
# Number of data in each set
ds.num_rows

{'train': 412178, 'test': 22176, 'validation': 23107}

In [4]:
# Count the length of code token and docstring token
df['func_code_tokens_len'] = df['func_code_tokens'].apply(lambda x: len(x))
df['func_documentation_tokens_len'] = df['func_documentation_tokens'].apply(lambda x: len(x))

df['func_code_tokens_len'].describe(), df['func_documentation_tokens_len'].describe()

# There are chinese documentation...
df.loc[df['func_documentation_tokens_len'] == 1].head()

Unnamed: 0,func_name,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,func_code_tokens_len,func_documentation_tokens_len
10978,WechatSogouAPI.__hosting_wechat_img,"def __hosting_wechat_img(self, content_info, h...","[def, __hosting_wechat_img, (, self, ,, conten...",将微信明细中图片托管到云端，同时将html页面中的对应图片替换\n\n Par...,[将微信明细中图片托管到云端，同时将html页面中的对应图片替换],83,1
10984,WechatSogouAPI.get_article_content,"def get_article_content(self, url, del_qqmusic...","[def, get_article_content, (, self, ,, url, ,,...",获取文章原文，避免临时链接失效\n\n Parameters\n ...,[获取文章原文，避免临时链接失效],116,1
10985,WechatSogouAPI.get_sugg,"def get_sugg(self, keyword):\n """"""获取微信搜...","[def, get_sugg, (, self, ,, keyword, ), :, url...",获取微信搜狗搜索关键词联想\n\n Parameters\n -...,[获取微信搜狗搜索关键词联想],71,1
10986,unlock_sogou_callback_example,"def unlock_sogou_callback_example(url, req, re...","[def, unlock_sogou_callback_example, (, url, ,...",手动打码解锁\n\n Parameters\n ----------\n ...,[手动打码解锁],112,1
10987,unlock_weixin_callback_example,"def unlock_weixin_callback_example(url, req, r...","[def, unlock_weixin_callback_example, (, url, ...",手动打码解锁\n\n Parameters\n ----------\n ...,[手动打码解锁],97,1


In [5]:
def _filter_dataset(ds, min_doc_token: int, max_doc_token: int, min_code_token: int, max_code_token: int, language='python') -> bool:
    # Step 1: Only allow python
    if ds['language'] != language:
        return False
    
    # Step 2: Check if the coden token length if > min_code_token
    if len(ds['func_code_tokens']) < min_code_token or \
        len(ds['func_code_tokens']) > max_code_token:
        return False
    
    if len(ds['func_documentation_tokens']) < min_doc_token or \
        len(ds['func_documentation_tokens']) > max_doc_token:
        return False
    
    # Step 3: Check if the func documentation only include ascii code (exclude non-english).
    if ds['func_documentation_string'].isascii() == False:
        return False

    return True

def _filter_columns_from_dataset(datasets, columns_to_save: list):

    # Get all the columns names
    dataset_columns_to_remove = {
        dataset: columns for dataset, columns in datasets.column_names.items()
    }

    # Remove columns to save from all the column names
    for dataset in dataset_columns_to_remove:
        for column in columns_to_save:
            if column in dataset_columns_to_remove[dataset]:
                dataset_columns_to_remove[dataset].remove(column)

    # Remove all the columns except columns_to_save.
    for dataset in datasets:
        datasets[dataset] = datasets[dataset].remove_columns(dataset_columns_to_remove[dataset])
    
    return datasets

columns_to_save = [
        'func_code_string',
        'func_code_tokens',
        'func_documentation_string',
        'func_documentation_tokens'
    ]

filtered_ds = ds.filter(lambda example: _filter_dataset(example, 0, 64, 0, 64))
filtered_ds = _filter_columns_from_dataset(ds, columns_to_save)

In [10]:
import unicodedata

def _unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def _normalize_string(s):
    s = _unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

def preprocess_documentation_string_to_tokens(s):
    return _normalize_string(s).split()

filtered_ds = filtered_ds.map(
    lambda example: {
        'func_documentation_tokens': preprocess_documentation_string_to_tokens(example['func_documentation_string'])
    }
)

Map: 100%|██████████| 412178/412178 [00:37<00:00, 11085.98 examples/s]
Map: 100%|██████████| 22176/22176 [00:02<00:00, 10623.71 examples/s]
Map: 100%|██████████| 23107/23107 [00:02<00:00, 10287.69 examples/s]


In [11]:
filtered_pd = filtered_ds['train'].to_pandas()
filtered_pd.head()

Unnamed: 0,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens
0,"def train(train_dir, model_save_path=None, n_n...","[def, train, (, train_dir, ,, model_save_path,...",Trains a k-nearest neighbors classifier for fa...,"[trains, a, k, nearest, neighbors, classifier,..."
1,"def predict(X_img_path, knn_clf=None, model_pa...","[def, predict, (, X_img_path, ,, knn_clf, =, N...",Recognizes faces in given image using a traine...,"[recognizes, faces, in, given, image, using, a..."
2,"def show_prediction_labels_on_image(img_path, ...","[def, show_prediction_labels_on_image, (, img_...",Shows the face recognition results visually.\n...,"[shows, the, face, recognition, results, visua..."
3,"def _rect_to_css(rect):\n """"""\n Convert ...","[def, _rect_to_css, (, rect, ), :, return, rec...",Convert a dlib 'rect' object to a plain tuple ...,"[convert, a, dlib, rect, object, to, a, plain,..."
4,"def _trim_css_to_bounds(css, image_shape):\n ...","[def, _trim_css_to_bounds, (, css, ,, image_sh...","Make sure a tuple in (top, right, bottom, left...","[make, sure, a, tuple, in, top, right, bottom,..."


In [7]:
import sys
sys.path.append(r'C:\Code\Code_Semantic_Search')

from data.tokenizer import Tokenizer
code_tokenizer = Tokenizer(8192)
doc_tokenizer = Tokenizer(8192)

# Load tokens in tokenizer
code_tokenizer.load_datasets(filtered_ds, 'func_code_tokens')
doc_tokenizer.load_datasets(filtered_ds, 'func_documentation_tokens')
print(f'code tokens: {len(code_tokenizer)}')
print(f'doc tokens: {len(doc_tokenizer)}')


code tokens: 8192
doc tokens: 8192


In [8]:
# filter the dataset to only include allow code tokens and allow documentation tokens
# so that the dataset will not have any unknown token
def _filter_tokens(ds, allow_code_tokens, allow_doc_tokens) -> bool:
    for code_token, doc_token in zip(ds['func_code_tokens'], ds['func_documentation_tokens']):
        if code_token not in allow_code_tokens:
            return False

        if doc_token not in allow_doc_tokens:
            return False

    return True


filtered_tokens = filtered_ds.filter(lambda example: _filter_tokens(example, code_tokenizer.most_freq_tokens, doc_tokenizer.most_freq_tokens))


Filter: 100%|██████████| 412178/412178 [00:51<00:00, 8005.37 examples/s]
Filter: 100%|██████████| 22176/22176 [00:02<00:00, 8164.58 examples/s]
Filter: 100%|██████████| 23107/23107 [00:03<00:00, 7561.67 examples/s]


In [14]:
len(filtered_tokens['train']), len(filtered_tokens['test']), len(filtered_tokens['validation'])

(37671, 2049, 2401)