Dataset Analysis

In [27]:
from datasets import load_dataset
import pandas as pd
import re

In [28]:
# Load the Python configuration of CodeSearchNet; `ds` holds every split.
ds = load_dataset("code_search_net", "python", trust_remote_code=True)

# Build the analysis frame from the train split in one chain:
# select the columns used below, keep rows tagged as Python, then drop the
# 'language' column, which is constant after that filter.
df = (
    ds['train']
    .to_pandas()[[
        'func_name',
        'func_code_string',
        'func_code_tokens',
        'func_documentation_string',
        'func_documentation_tokens',
        'language',
    ]]
    .loc[lambda frame: frame['language'] == 'python']
    .drop(columns='language')
)

df.head()


Unnamed: 0,func_name,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens
0,train,"def train(train_dir, model_save_path=None, n_n...","[def, train, (, train_dir, ,, model_save_path,...",Trains a k-nearest neighbors classifier for fa...,"[Trains, a, k, -, nearest, neighbors, classifi..."
1,predict,"def predict(X_img_path, knn_clf=None, model_pa...","[def, predict, (, X_img_path, ,, knn_clf, =, N...",Recognizes faces in given image using a traine...,"[Recognizes, faces, in, given, image, using, a..."
2,show_prediction_labels_on_image,"def show_prediction_labels_on_image(img_path, ...","[def, show_prediction_labels_on_image, (, img_...",Shows the face recognition results visually.\n...,"[Shows, the, face, recognition, results, visua..."
3,_rect_to_css,"def _rect_to_css(rect):\n """"""\n Convert ...","[def, _rect_to_css, (, rect, ), :, return, rec...",Convert a dlib 'rect' object to a plain tuple ...,"[Convert, a, dlib, rect, object, to, a, plain,..."
4,_trim_css_to_bounds,"def _trim_css_to_bounds(css, image_shape):\n ...","[def, _trim_css_to_bounds, (, css, ,, image_sh...","Make sure a tuple in (top, right, bottom, left...","[Make, sure, a, tuple, in, (, top, right, bott..."


In [29]:
# Number of rows in each split of the dataset
{split_name: split.num_rows for split_name, split in ds.items()}

{'train': 412178, 'test': 22176, 'validation': 23107}

In [30]:
# Number of tokens in the code and in the docstring of every function.
df['func_code_tokens_len'] = df['func_code_tokens'].map(len)
df['func_documentation_tokens_len'] = df['func_documentation_tokens'].map(len)

# Only the final expression of a cell is rendered automatically, so the
# summary statistics are displayed explicitly instead of being discarded.
display(
    df['func_code_tokens_len'].describe(),
    df['func_documentation_tokens_len'].describe(),
)

# Docstrings that tokenize to a single token include Chinese documentation.
df.loc[df['func_documentation_tokens_len'] == 1].head()

Unnamed: 0,func_name,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,func_code_tokens_len,func_documentation_tokens_len
10978,WechatSogouAPI.__hosting_wechat_img,"def __hosting_wechat_img(self, content_info, h...","[def, __hosting_wechat_img, (, self, ,, conten...",将微信明细中图片托管到云端，同时将html页面中的对应图片替换\n\n Par...,[将微信明细中图片托管到云端，同时将html页面中的对应图片替换],83,1
10984,WechatSogouAPI.get_article_content,"def get_article_content(self, url, del_qqmusic...","[def, get_article_content, (, self, ,, url, ,,...",获取文章原文，避免临时链接失效\n\n Parameters\n ...,[获取文章原文，避免临时链接失效],116,1
10985,WechatSogouAPI.get_sugg,"def get_sugg(self, keyword):\n """"""获取微信搜...","[def, get_sugg, (, self, ,, keyword, ), :, url...",获取微信搜狗搜索关键词联想\n\n Parameters\n -...,[获取微信搜狗搜索关键词联想],71,1
10986,unlock_sogou_callback_example,"def unlock_sogou_callback_example(url, req, re...","[def, unlock_sogou_callback_example, (, url, ,...",手动打码解锁\n\n Parameters\n ----------\n ...,[手动打码解锁],112,1
10987,unlock_weixin_callback_example,"def unlock_weixin_callback_example(url, req, r...","[def, unlock_weixin_callback_example, (, url, ...",手动打码解锁\n\n Parameters\n ----------\n ...,[手动打码解锁],97,1


In [31]:
def is_english(s: str) -> bool:
    """Return True when `s` consists solely of ASCII characters.

    ASCII-only is used as a stand-in for "written in English": the check
    looks at the characters, not at the language itself.
    """
    return s.isascii()

# Keep only the rows whose docstring is pure ASCII.
english_df = df.loc[df['func_documentation_string'].apply(is_english)]

# Build the short-docstring mask from english_df itself. A mask taken from the
# unfiltered `df` only works through implicit index alignment and would raise
# as soon as the two indexes stop being alignable.
short_doc_mask = english_df['func_documentation_tokens_len'] < 5

# Summary: code-length stats, docstring-length stats, number of ASCII-docstring
# rows, and how many of those have fewer than 5 docstring tokens.
(
    english_df['func_code_tokens_len'].describe(),
    english_df['func_documentation_tokens_len'].describe(),
    len(english_df),
    len(english_df.loc[short_doc_mask]),
)

(count    406508.000000
 mean        117.024460
 std         169.413479
 min          19.000000
 25%          43.000000
 50%          72.000000
 75%         132.000000
 max       28410.000000
 Name: func_code_tokens_len, dtype: float64,
 count    406508.000000
 mean         16.466761
 std          23.899018
 min           1.000000
 25%           7.000000
 50%          10.000000
 75%          17.000000
 max        1971.000000
 Name: func_documentation_tokens_len, dtype: float64,
 406508,
 34518)

Filter Dataset

In [32]:
def filter_dataset(ds, min_doc_token: int, min_code_token: int=0, language='python') -> bool:
    """Decide whether a single dataset example should be kept.

    Despite its name, `ds` is one example (a mapping of column name to value),
    not a whole dataset. The example is kept only if every check passes, and
    the checks run in this order, stopping at the first failure:

    1. its 'language' equals `language`;
    2. it has at least `min_code_token` code tokens;
    3. it has at least `min_doc_token` docstring tokens;
    4. its docstring is ASCII-only (used to exclude non-English docstrings).

    Returns:
        True if the example passes all checks, False otherwise.
    """
    return (
        ds['language'] == language
        and len(ds['func_code_tokens']) >= min_code_token
        and len(ds['func_documentation_tokens']) >= min_doc_token
        and ds['func_documentation_string'].isascii()
    )

# `filter` returns a new DatasetDict, so keep it under its own name rather
# than discarding it after display; `ds` itself is not reassigned.
# min_doc_token=0 means the docstring-length check never rejects an example.
filtered_ds = ds.filter(lambda example: filter_dataset(example, 0))
filtered_ds


DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 406508
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22014
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22656
    })
})

In [42]:
def _filter_column_from_dataset(datasets, column_to_save: list):

    dataset_columns_to_remove = {
        dataset: columns for dataset, columns in datasets.column_names.items()
    }

    for dataset in dataset_columns_to_remove:
        for column in column_to_save:
            if column in dataset_columns_to_remove[dataset]:
                dataset_columns_to_remove[dataset].remove(column)

    for dataset in datasets:
        datasets[dataset] = datasets[dataset].remove_columns(dataset_columns_to_remove[dataset])

# Only the tokenized code and docstring columns are kept.
column_to_save = ['func_code_tokens', 'func_documentation_tokens']

# This replaces the splits inside `ds` in place. Afterwards `ds` no longer has
# the other columns (e.g. 'language', which filter_dataset reads), so cells
# that need them must reload the dataset first.
_filter_column_from_dataset(ds, column_to_save)
ds


DatasetDict({
    train: Dataset({
        features: ['func_code_tokens', 'func_documentation_tokens'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['func_code_tokens', 'func_documentation_tokens'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['func_code_tokens', 'func_documentation_tokens'],
        num_rows: 23107
    })
})