In [4]:
# !pip install langid
# !pip3 install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install --upgrade transformers

In [5]:
import os
import json
import pandas as pd
import langid
from tqdm.auto import tqdm

In [8]:
def is_english(line):
  lang, _ = langid.classify(line)
  return lang == 'en'

def clean_unnecessary_data(dataset_file):
  jsonl_array = []
  json_array = []
  keys_to_keep = ["code", "docstring"]

  Lines = dataset_file.readlines()

  count = 0
  for line in Lines:
    count += 1
    json_object = json.loads(line.strip())
    filtered_json_object = {key: json_object[key] for key in keys_to_keep if key in json_object}
    jsonl_array.append(filtered_json_object)
  
  df = pd.DataFrame(jsonl_array)
  for ind in tqdm(df.index):
    docstring_line = df['docstring'][ind].replace('#', ' ')
    docstring_line = docstring_line.replace('*', ' ')
    docstring_lines = docstring_line.split("\n")
    cleaned_docstring = ' '.join(line.strip() for line in docstring_lines 
                                 if ('@' not in line.strip() and '**' not in line.strip() and is_english(line)))

    altered_json_object = {"codeblock": df['code'][ind], "docstring": cleaned_docstring}
    
    if df['code'][ind].strip().startswith('function'):
      if cleaned_docstring.strip():
        json_array.append(altered_json_object)
  df = pd.DataFrame(json_array)
  return df

In [9]:
dataset_train_files = [
    open('dataset/javascript_train_0.jsonl'),
    open('dataset/javascript_train_1.jsonl'),
    open('dataset/javascript_train_2.jsonl'),
    open('dataset/javascript_train_3.jsonl'),
    open('dataset/javascript_train_4.jsonl')
]
dataset_valid_file = open('dataset/javascript_valid_0.jsonl')
dataset_test_file = open('dataset/javascript_test_0.jsonl')

In [10]:
df_train = pd.concat([
    clean_unnecessary_data(dataset_train_files[0]),
    clean_unnecessary_data(dataset_train_files[1]),
    clean_unnecessary_data(dataset_train_files[2]),
    clean_unnecessary_data(dataset_train_files[3]),
    clean_unnecessary_data(dataset_train_files[4])
])
df_valid = clean_unnecessary_data(dataset_valid_file)
df_test = clean_unnecessary_data(dataset_test_file)

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/3889 [00:00<?, ?it/s]

  0%|          | 0/8253 [00:00<?, ?it/s]

  0%|          | 0/6483 [00:00<?, ?it/s]

In [11]:
df_train.head()

Unnamed: 0,codeblock,docstring
0,function createTypeScriptLanguageService(optio...,"region Discovery, LanguageService & Setup"
1,function discoverAndReadFiles(options) {\n ...,Read imports and follow them until all files h...
2,"function getRealNodeSymbol(checker, node) {\n ...",Returns the node's symbol and the `import` nod...
3,"function shouldSkipAlias(node, declaration) {\...",Go to the original declaration for cases: (1)...
4,"function getTokenAtPosition(sourceFile, positi...",Get the token whose text contains the position


In [12]:
df_valid.head()

Unnamed: 0,codeblock,docstring
0,function getCounts(langs = []) {\n return {...,Compute the overall total counts of models and...
1,"function className(node, value){\n var klas...",access className property while respecting SVG...
2,function deserializeValue(value) {\n try {\...,"""true"" => true ""false"" => false ""null"" => nu..."
3,"function parseArguments(url, data, success, da...",handle optional data/success arguments
4,function once (fn) {\n let called = false\n ...,"in Webpack 2, require.ensure now also returns ..."


In [13]:
df_test.head()

Unnamed: 0,codeblock,docstring
0,function createInstance(defaultConfig) {\n va...,Create an instance of Axios
1,function CancelToken(executor) {\n if (typeof...,A `CancelToken` is an object that can be used ...
2,function isArrayBufferView(val) {\n var resul...,Determine if a value is a view on an ArrayBuffer
3,function isStandardBrowserEnv() {\n if (typeo...,Determine if we're running in a standard brows...
4,"function forEach(obj, fn) {\n // Don't bother...",Iterate over an Array or an Object invoking a ...


In [14]:
len(df_train), len(df_valid), len(df_test)

(109944, 7435, 5758)

In [15]:
df_train['code_tokens'] = df_train.codeblock.apply(lambda x: x.split())
df_train['docstring_tokens'] = df_train.docstring.apply(lambda x: x.split())
with open(f'cleaned_{os.path.basename(dataset_train_files[0].name)}','w') as f:
  for _, row in df_train.iterrows():
    f.write(json.dumps(row.to_dict()) + '\n')

df_valid['code_tokens'] = df_valid.codeblock.apply(lambda x: x.split())
df_valid['docstring_tokens'] = df_valid.docstring.apply(lambda x: x.split())
with open(f'cleaned_{os.path.basename(dataset_valid_file.name)}','w') as f:
  for _, row in df_valid.iterrows():
    f.write(json.dumps(row.to_dict()) + '\n')

df_test['code_tokens'] = df_test.codeblock.apply(lambda x: x.split())
df_test['docstring_tokens'] = df_test.docstring.apply(lambda x: x.split())
with open(f'cleaned_{os.path.basename(dataset_test_file.name)}','w') as f:
  for _, row in df_test.iterrows():
    f.write(json.dumps(row.to_dict()) + '\n')

In [None]:
lr = 1e-5
batch_size = 20
epochs = 40
beam_size = 10
source_length = 256
target_length = 128
weight_decay = 0.001
gradient_accumulation_steps = 4
model_type = 'roberta'
pretrained_model = 'microsoft/codebert-base'
output_dir = 'model/javascript'
train_file = f'cleaned_{os.path.basename(dataset_train_files[0].name)}'
dev_file = f'cleaned_{os.path.basename(dataset_valid_file.name)}'
test_file = f'cleaned_{os.path.basename(dataset_test_file.name)}'
load_model_path = 'force_save_pytorch_model.bin'

In [16]:
! python run.py \
    --do_train \
    --do_eval \
    --do_test \
    --load_model_path {load_model_path} \
    --learning_rate {lr} \
    --num_train_epochs {epochs} \
    --model_type {model_type} \
    --tokenizer_name {pretrained_model} \
    --model_name_or_path {pretrained_model} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --weight_decay {weight_decay} \
    --gradient_accumulation_steps {gradient_accumulation_steps} \
    --beam_size {beam_size} \
    --train_batch_size {batch_size} \
    --eval_batch_size {batch_size} \
    --do_lower_case \
    --train_filename {train_file} \
    --dev_filename {dev_file} \
    --test_filename {dev_file} \
    --force_save \
    --output_dir {output_dir}

04/08/2024 15:51:19 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='model/javascript', load_model_path='force_save_pytorch_model.bin', train_filename='cleaned_javascript_train_0.jsonl', dev_filename='cleaned_javascript_valid_0.jsonl', test_filename='cleaned_javascript_valid_0.jsonl', config_name='', tokenizer_name='microsoft/codebert-base', max_source_length=256, max_target_length=128, do_train=True, do_eval=True, do_test=True, force_save=True, do_lower_case=True, no_cuda=False, train_batch_size=20, eval_batch_size=20, gradient_accumulation_steps=1, learning_rate=1e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
config.json: 100%|█████████████████████████████| 498/498 [00:00<00:00, 89.9kB/s]
tokenizer_config.json: 100%|█████████████████| 25.0/25.0 [00:00<00:00, 4.60kB/s]
vocab.json: 100%|████