Analyze Dataset

In [33]:
from datasets import load_dataset
import pandas as pd
import re

In [34]:
# Load the "python" configuration of the code_search_net dataset.
ds = load_dataset("code_search_net", "python", trust_remote_code=True)

# Build the analysis frame from the train split in one chain:
#   1. keep only the columns used below,
#   2. keep only rows tagged as Python,
#   3. drop the 'language' column, which is constant after step 2.
df = (
    ds['train']
    .to_pandas()
    .loc[:, ['func_name', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'language']]
    .loc[lambda frame: frame['language'] == 'python']
    .drop(columns='language')
)

df.head()


Unnamed: 0,func_name,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens
0,train,"def train(train_dir, model_save_path=None, n_n...","[def, train, (, train_dir, ,, model_save_path,...",Trains a k-nearest neighbors classifier for fa...,"[Trains, a, k, -, nearest, neighbors, classifi..."
1,predict,"def predict(X_img_path, knn_clf=None, model_pa...","[def, predict, (, X_img_path, ,, knn_clf, =, N...",Recognizes faces in given image using a traine...,"[Recognizes, faces, in, given, image, using, a..."
2,show_prediction_labels_on_image,"def show_prediction_labels_on_image(img_path, ...","[def, show_prediction_labels_on_image, (, img_...",Shows the face recognition results visually.\n...,"[Shows, the, face, recognition, results, visua..."
3,_rect_to_css,"def _rect_to_css(rect):\n """"""\n Convert ...","[def, _rect_to_css, (, rect, ), :, return, rec...",Convert a dlib 'rect' object to a plain tuple ...,"[Convert, a, dlib, rect, object, to, a, plain,..."
4,_trim_css_to_bounds,"def _trim_css_to_bounds(css, image_shape):\n ...","[def, _trim_css_to_bounds, (, css, ,, image_sh...","Make sure a tuple in (top, right, bottom, left...","[Make, sure, a, tuple, in, (, top, right, bott..."


In [35]:
# Number of examples in each split (train / test / validation).
ds.num_rows

{'train': 412178, 'test': 22176, 'validation': 23107}

In [36]:
# Number of tokens per example, for the code and for the docstring.
df['func_code_tokens_len'] = df['func_code_tokens'].apply(len)
df['func_documentation_tokens_len'] = df['func_documentation_tokens'].apply(len)

# Only the last expression of a cell is rendered, so a bare expression here
# would be evaluated and thrown away. Display the summaries explicitly.
display(
    df['func_code_tokens_len'].describe(),
    df['func_documentation_tokens_len'].describe(),
)

# Single-token docstrings are worth a look: several of them are written in
# Chinese, where the whole sentence ends up as one token.
df.loc[df['func_documentation_tokens_len'] == 1].head()

Unnamed: 0,func_name,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,func_code_tokens_len,func_documentation_tokens_len
10978,WechatSogouAPI.__hosting_wechat_img,"def __hosting_wechat_img(self, content_info, h...","[def, __hosting_wechat_img, (, self, ,, conten...",将微信明细中图片托管到云端，同时将html页面中的对应图片替换\n\n Par...,[将微信明细中图片托管到云端，同时将html页面中的对应图片替换],83,1
10984,WechatSogouAPI.get_article_content,"def get_article_content(self, url, del_qqmusic...","[def, get_article_content, (, self, ,, url, ,,...",获取文章原文，避免临时链接失效\n\n Parameters\n ...,[获取文章原文，避免临时链接失效],116,1
10985,WechatSogouAPI.get_sugg,"def get_sugg(self, keyword):\n """"""获取微信搜...","[def, get_sugg, (, self, ,, keyword, ), :, url...",获取微信搜狗搜索关键词联想\n\n Parameters\n -...,[获取微信搜狗搜索关键词联想],71,1
10986,unlock_sogou_callback_example,"def unlock_sogou_callback_example(url, req, re...","[def, unlock_sogou_callback_example, (, url, ,...",手动打码解锁\n\n Parameters\n ----------\n ...,[手动打码解锁],112,1
10987,unlock_weixin_callback_example,"def unlock_weixin_callback_example(url, req, r...","[def, unlock_weixin_callback_example, (, url, ...",手动打码解锁\n\n Parameters\n ----------\n ...,[手动打码解锁],97,1


In [37]:
def is_english(s: str) -> bool:
    """Return True when ``s`` contains only ASCII characters.

    This is a character-set check, not language detection: ASCII-only text
    is used as a cheap stand-in for "written in English". An empty string
    counts as ASCII.
    """
    contains_only_ascii = s.isascii()
    return contains_only_ascii

# Keep only rows whose docstring is pure ASCII (see is_english).
english_df = df.loc[df['func_documentation_string'].apply(is_english)]

# Summary statistics, total row count, and the number of very short
# (< 5 tokens) docstrings. The short-docstring mask is built from english_df
# itself, so the mask and the frame being indexed share the same index.
(
    english_df['func_code_tokens_len'].describe(),
    english_df['func_documentation_tokens_len'].describe(),
    len(english_df),
    len(english_df.loc[english_df['func_documentation_tokens_len'] < 5]),
)

(count    406508.000000
 mean        117.024460
 std         169.413479
 min          19.000000
 25%          43.000000
 50%          72.000000
 75%         132.000000
 max       28410.000000
 Name: func_code_tokens_len, dtype: float64,
 count    406508.000000
 mean         16.466761
 std          23.899018
 min           1.000000
 25%           7.000000
 50%          10.000000
 75%          17.000000
 max        1971.000000
 Name: func_documentation_tokens_len, dtype: float64,
 406508,
 34518)

Filter Dataset

In [39]:
def _filter_dataset(ds, min_doc_token: int, max_doc_token: int, min_code_token: int, max_code_token: int, language='python') -> bool:
    # Step 1: Only allow python
    if ds['language'] != language:
        return False
    
    # Step 2: Check if the coden token length if > min_code_token
    if len(ds['func_code_tokens']) < min_code_token or \
        len(ds['func_code_tokens']) > max_code_token:
        return False
    
    if len(ds['func_documentation_tokens']) < min_doc_token or \
        len(ds['func_documentation_tokens']) > max_doc_token:
        return False
    
    # Step 3: Check if the func documentation only include ascii code (exclude non-english).
    if ds['func_documentation_string'].isascii() == False:
        return False

    return True

# Run the per-example filter over the dataset. The lambda parameter is named
# `example` so it stays distinct from the outer `ds`.
ds = ds.filter(
    lambda example: _filter_dataset(
        example,
        min_doc_token=0,
        max_doc_token=256,
        min_code_token=0,
        max_code_token=256,
    )
)


Filter: 100%|██████████| 412178/412178 [00:54<00:00, 7509.31 examples/s]
Filter: 100%|██████████| 22176/22176 [00:02<00:00, 7570.29 examples/s]
Filter: 100%|██████████| 23107/23107 [00:03<00:00, 7225.34 examples/s]


In [40]:
# Rebuild the frame from the train split and attach the per-example
# docstring token count in one step.
df = ds['train'].to_pandas().assign(
    func_documentation_tokens_len=lambda frame: frame['func_documentation_tokens'].map(len)
)
df['func_documentation_tokens_len'].describe()

count    371335.000000
mean         15.610705
std          17.268367
min           1.000000
25%           7.000000
50%          10.000000
75%          17.000000
max         256.000000
Name: func_documentation_tokens_len, dtype: float64

In [42]:
def _filter_column_from_dataset(datasets, column_to_save: list):

    dataset_columns_to_remove = {
        dataset: columns for dataset, columns in datasets.column_names.items()
    }

    for dataset in dataset_columns_to_remove:
        for column in column_to_save:
            if column in dataset_columns_to_remove[dataset]:
                dataset_columns_to_remove[dataset].remove(column)

    for dataset in datasets:
        datasets[dataset] = datasets[dataset].remove_columns(dataset_columns_to_remove[dataset])

# The two token columns are the only ones kept.
column_to_save = ['func_code_tokens', 'func_documentation_tokens']

_filter_column_from_dataset(ds, column_to_save)

# Show the resulting dataset plus a peek at the first training example.
(
    ds,
    ds['train'][0]['func_code_tokens'][:10],
    ds['train'][0]['func_documentation_tokens'][:30],
)


(DatasetDict({
     train: Dataset({
         features: ['func_code_tokens', 'func_documentation_tokens'],
         num_rows: 371335
     })
     test: Dataset({
         features: ['func_code_tokens', 'func_documentation_tokens'],
         num_rows: 20184
     })
     validation: Dataset({
         features: ['func_code_tokens', 'func_documentation_tokens'],
         num_rows: 20458
     })
 }),
 ['def',
  'predict',
  '(',
  'X_img_path',
  ',',
  'knn_clf',
  '=',
  'None',
  ',',
  'model_path'],
 ['Recognizes',
  'faces',
  'in',
  'given',
  'image',
  'using',
  'a',
  'trained',
  'KNN',
  'classifier'])

In [46]:
import sys
# NOTE(review): hard-coded, machine-specific absolute Windows path. This cell
# only works where the project is checked out at exactly this location.
sys.path.append(r'C:\Code\Code_Semantic_Search\data')

# Presumably a project-local module found through the sys.path entry added
# above — confirm.
from tokenizer import Tokenizer

def _tokenize(example, tokenizer: Tokenizer) -> dict:
    """Run both token columns of one example through ``tokenizer.to_idx``.

    Args:
        example: Mapping with the keys 'func_code_tokens' and
            'func_documentation_tokens'.
        tokenizer: Object whose ``to_idx`` is applied to each token list
            (presumably mapping tokens to integer ids — confirm against
            the Tokenizer implementation).

    Returns:
        A dict with the same two keys, each holding the ``to_idx`` result for
        the corresponding column.
    """
    return {
        'func_code_tokens': tokenizer.to_idx(example['func_code_tokens']),
        'func_documentation_tokens': tokenizer.to_idx(example['func_documentation_tokens']),
    }

# A single Tokenizer instance is captured by the mapping lambda, so the same
# object handles every example.
# NOTE(review): if Tokenizer.to_idx builds its vocabulary incrementally, the
# resulting ids depend on processing order — confirm.
tokenizer = Tokenizer()
ds = ds.map(lambda example: _tokenize(example, tokenizer))


Map: 100%|██████████| 371335/371335 [01:09<00:00, 5365.70 examples/s]
Map: 100%|██████████| 20184/20184 [00:03<00:00, 5401.78 examples/s]
Map: 100%|██████████| 20458/20458 [00:03<00:00, 5388.84 examples/s]


In [None]:
# Torch-formatted version of the dataset, bound to a new name.
ds_torch = ds.with_format("torch")
# Inspect the code tokens of the first training example in that format.
ds_torch['train'][0]['func_code_tokens']


tensor([ 4,  5,  6,  7,  8,  9, 10, 11,  8, 12, 10, 11,  8, 13, 10, 14, 15, 16,
        17, 18, 19, 20, 21, 20, 22,  6,  7, 15, 23, 19, 20, 21, 20, 24,  6,  7,
        15, 25, 26, 27, 25, 26, 16, 27, 18, 28, 29, 16, 30, 31,  6, 32, 20, 33,
         6,  7, 15, 15, 17,  9, 34, 11, 35, 12, 34, 11, 16, 30, 31,  6, 36, 15,
        37, 17,  9, 34, 11, 16, 38, 39,  6, 12,  8, 40, 15, 41, 42, 16,  9, 10,
        43, 20, 44,  6, 42, 15, 45, 46, 10, 47, 20, 48,  6,  7, 15, 49, 10, 47,
        20, 50,  6, 46, 15, 51, 17, 52,  6, 49, 15, 53, 54, 16, 55, 25, 27, 56,
        57, 10, 47, 20, 58,  6, 46,  8, 59, 10, 49, 15, 60, 61, 10,  9, 20, 62,
         6, 57,  8, 63, 10, 26, 15, 64, 10, 25, 61, 25, 54, 27, 25, 65, 27, 25,
        54, 27, 66, 13, 67, 65, 28, 68,  6, 52,  6, 49, 15, 15, 27, 69, 55, 25,
         6, 70,  8, 71, 15, 17, 72, 73,  6, 74,  8, 71, 15, 67, 70,  8, 71,  8,
        72, 28, 75,  6,  9, 20,  5,  6, 57, 15,  8, 49,  8, 64, 15, 27])