#all

In [None]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

Installing dependencies...
[K     |████████████████████████████████| 235kB 6.9MB/s 
[K     |████████████████████████████████| 2.2MB 7.3MB/s 
[K     |████████████████████████████████| 61kB 7.9MB/s 
[K     |████████████████████████████████| 1.2MB 26.0MB/s 
[K     |████████████████████████████████| 368kB 57.2MB/s 
[K     |████████████████████████████████| 3.4MB 55.1MB/s 
[K     |████████████████████████████████| 3.8MB 45.1MB/s 
[K     |████████████████████████████████| 870kB 47.5MB/s 
[K     |████████████████████████████████| 3.3MB 44.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
ON_CLOUD = True

# Defined unconditionally so that later cells, which read these names whether
# or not ON_CLOUD is set, do not fail with a NameError when running off-cloud.
TPU_ADDRESS = None
TPU_TOPOLOGY = "v3-8"

if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # RuntimeError (rather than a bare BaseException) keeps the failure
    # catchable by ordinary `except Exception` handlers; the original
    # resolver error is chained for debugging.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop records emitted on the TF logger from being passed up to the root
  # logger's handlers (presumably to avoid duplicated log lines).
  tf.get_logger().propagate = False
  # Let INFO-level records through on the root Python logger.
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch the TF logging verbosity for the body of a `with`.

  Args:
    level: verbosity handed to `tf.logging.set_verbosity` on entry.

  Yields:
    None. The verbosity that was active on entry is put back on exit, also
    when the body raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Restore even on error so one failed cell does not leave logging muted.
    tf.logging.set_verbosity(og_level)

Setting up GCS access...
Running on TPU: grpc://10.42.248.130:8470
Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
# Record which t5 release the unpinned install above resolved to.
print(t5.__version__)

0.9.0


### Register concode

In [None]:
def dumping_dataset_java(split, shuffle_files = False):
    """Load one split of the code-generation TSV data as a tf.data.Dataset.

    Args:
      split: "train" or "validation".
      shuffle_files: accepted to satisfy the dataset_fn call signature and
        ignored.

    Returns:
      A dataset with one {"input": ..., "target": ...} dict of strings per
      TSV line, taken from the first and second tab-separated columns.

    Raises:
      ValueError: if `split` is not one of the known split names.
    """
    del shuffle_files
    split_files = {
        'train': [
            'gs://cotext/data/codegeneration/code_generation_train.tsv',
        ],
        # Previously an empty file list, which made the registered
        # "validation" split silently empty.
        'validation': [
            'gs://cotext/data/codegeneration/code_generation_valid.tsv',
        ],
    }
    if split not in split_files:
      raise ValueError(
          'Unknown split %r; expected one of %s' % (split, sorted(split_files)))
    ds = tf.data.TextLineDataset(split_files[split])
    # Split each "<t1>\t<t2>" example into an (input, target) tuple.
    ds = ds.map(
        functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                          field_delim="\t", use_quote_delim=False),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Map each tuple to a {"input": ... "target": ...} dict.
    ds = ds.map(lambda *ex: dict(zip(["input", "target"], ex)))
    return ds

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into {"inputs", "targets"} features.

  The "input" string gets the task prefix "java: " prepended and is emitted
  under "inputs"; the "target" string is passed through unchanged under
  "targets".
  """
  task_prefix = "java: "

  def _prefix_example(example):
    prefixed_source = tf.strings.join([task_prefix, example["input"]])
    return {"inputs": prefixed_source, "targets": example["target"]}

  return ds.map(
      _prefix_example,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Preview a handful of raw (not yet preprocessed) examples. The split read
# here is "train", so the message says so instead of "validation".
print("A few raw train examples...")
for ex in tfds.as_numpy(dumping_dataset_java("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'check if details are parsed . concode_field_sep Container parent concode_elem_sep boolean isParsed concode_elem_sep long offset concode_elem_sep long contentStartPosition concode_elem_sep ByteBuffer deadBytes concode_elem_sep boolean isRead concode_elem_sep long memMapSize concode_elem_sep Logger LOG concode_elem_sep byte[] userType concode_elem_sep String type concode_elem_sep ByteBuffer content concode_elem_sep FileChannel fileChannel concode_field_sep Container getParent concode_elem_sep byte[] getUserType concode_elem_sep void readContent concode_elem_sep long getOffset concode_elem_sep long getContentSize concode_elem_sep void getContent concode_elem_sep void setDeadBytes concode_elem_sep void parse concode_elem_sep void getHeader concode_elem_sep long getSize concode_elem_sep void parseDetails concode_elem_sep String getType concode_elem_sep void _parseDetails concode_elem_sep String getPath concode_elem_sep boolean verify concode_elem

In [None]:
# Drop any existing "java" entry first (presumably so this cell can be
# re-run), then register the task.
t5.data.TaskRegistry.remove('java')
t5.data.TaskRegistry.add(
    "java",
    splits=["train", "validation"],
    # Callable that returns the raw tf.data.Dataset for a split.
    dataset_fn=dumping_dataset_java,
    # Preprocessing applied to the raw text examples.
    text_preprocessor=[ner_preprocessor],
    metric_fns=[
        t5.evaluation.metrics.accuracy,
        t5.evaluation.metrics.sequence_accuracy,
    ],
)

<t5.data.dataset_providers.FunctionTask at 0x7f7a757dbdd0>

## Mixtures

In [None]:
# Re-register the "all_mix" mixture; it holds the single "java" task with a
# default rate of 1.0.
t5.data.MixtureRegistry.remove("all_mix")
t5.data.MixtureRegistry.add("all_mix", ['java'], default_rate=1.0)

<t5.seqio.dataset_providers.Mixture at 0x7f7a75835710>

## Define Model

In [None]:
# Using pretrained_models from wiki + books
MODEL_SIZE = "base"
PRETRAINED_DIR = "gs://cotext/cc"

MODEL_DIR = "gs://t5_training/models/code/codegeneration_uni_v1/"
MODEL_DIR = os.path.join(MODEL_DIR, MODEL_SIZE)


# Set parallelism and batch size to fit on v2-8 TPU (if possible).
# Limit number of checkpoints to fit within 5GB (if possible).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)
# The models from our paper are based on the Mesh Tensorflow Transformer.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 256, "targets": 256},
    learning_rate_schedule=0.001,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)


## Finetune

In [None]:
# Number of finetuning steps requested from model.finetune.
FINETUNE_STEPS = 57_000

# Finetune on the "all_mix" mixture, starting from the checkpoint found in
# PRETRAINED_DIR.
model.finetune(
    pretrained_model_dir=PRETRAINED_DIR,
    mixture_or_task_name="all_mix",
    finetune_steps=FINETUNE_STEPS,
)

INFO:root:system_path_file_exists:gs://t5_training/models/code/code_uni_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/code_uni_v1/base/operative_config.gin
INFO:root:Skipping import of unknown module `t5.data.sentencepiece_vocabulary` (skip_unknown=True).


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_training/models/code/codegeneration_uni_v1/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.42.248.130:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.42.248.130:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.42.248.130:8470', '_evaluation_master': 'grpc://10.

## Predict

In [None]:
# Local folder that receives the downloaded validation data and predictions.
test_folder = 'valid3'
test_file = 'test'
output_dir = "codegeneration_uni_v1"
# One [directory name, language] pair per task to predict.
tasks = [['codegeneration', 'java']]

In [None]:
!mkdir {test_folder}
!gsutil cp gs://cotext/data/codegeneration/code_generation_valid.tsv {test_folder}/ 
with open(f'{test_folder}/code_generation_valid.tsv', 'r') as file:
  with open(f'{test_folder}/predict_input.tsv', 'w') as predict_input:
    for line in file:
      line = line.strip().split('\t')
      input = line[0].strip()
      predict_input.write(f'java: {input}\n')

Copying gs://t5_training/t5-data/code_data/codegeneration/code_generation_valid.tsv...
/ [0 files][    0.0 B/  1.8 MiB]                                                / [1 files][  1.8 MiB/  1.8 MiB]                                                
Operation completed over 1 objects/1.8 MiB.                                      


In [None]:
import tensorflow.compat.v1 as tf

def _checkpoint_step(path):
  """Return the integer step after the last "-" in `path`, or -1 if absent."""
  suffix = path.rsplit("-", 1)[-1]
  return int(suffix) if suffix.isdigit() else -1


for t in tasks:
  lang = t[1]
  predict_inputs_path = f'{test_folder}/predict_input.tsv'
  predict_outputs_path = f'{test_folder}/predict_output.tsv'

  # The input file already carries the "java: " prefix on every line.
  print(predict_inputs_path)
  print(predict_outputs_path)
  # Ignore any logging so that we only see the model's predictions.
  with tf_verbosity_level('ERROR'):
    model.batch_size = 8  # Smaller batch size used for prediction.
    model.predict(
        input_file=predict_inputs_path,
        output_file=predict_outputs_path,
        checkpoint_steps=-1,
        temperature=0,
    )

  # The output filename will have the checkpoint step appended, so we glob
  # and order by that step numerically; a plain string sort would rank
  # "...-999000" after "...-1000000".
  prediction_files = sorted(
      tf.io.gfile.glob(predict_outputs_path + "*"), key=_checkpoint_step)
  print("Predicted task : " + lang)
  print("\nPredictions using checkpoint %s:\n" % prediction_files[-1].split("-")[-1])

valid3/predict_input.tsv
valid3/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codegeneration_uni_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codegeneration_uni_v1/base/operative_config.gin


Predicted task : java

Predictions using checkpoint 1260000:



## Scoring

In [None]:
checkpoint = '1255000'

# Placeholder tokens found in the model output and the literal characters
# they are replaced with, applied in this order.
_placeholder_pairs = (
    ('SMALLER_TOKEN', '<'),
    ('GREATER_TOKEN', '>'),
    ('OPEN_SQUARE_TOKEN', '['),
    ('CLOSE_SQUARE_TOKEN', ']'),
    ('OPEN_CURLY_TOKEN', '{'),
    ('CLOSE_CURLY_TOKEN', '}'),
    ('EXPONENTIAL_TOKEN', '^'),
    ('SHARP_TOKEN', '#'),
    ('DOLLAR_TOKEN', '$'),
    ('UNK_TOKEN', '`'),
)

# NOTE(review): this reads from the hard-coded folder 'test1' rather than
# `test_folder`; confirm that is intended.
with open(f'test1/predict_output.tsv-{checkpoint}') as raw_predictions, \
     open('predictions_uni_55k.txt', 'w') as cleaned_predictions:
  for raw_line in raw_predictions:
    restored = raw_line.strip()
    for placeholder, literal in _placeholder_pairs:
      restored = restored.replace(placeholder, literal)
    cleaned_predictions.write(restored + '\n')

In [None]:
# Checkpoint step whose prediction file is scored below.
checkpoint = '1260000'
test_file = 'test'
output_dir = "codegeneration_uni_v1"
# One [directory name, language] pair per task to score.
tasks = [['codegeneration', 'java']]

In [None]:
# Download the CodeXGLUE text-to-code evaluator scripts (bleu.py, evaluator.py)
# and the answers.json / predictions.txt files that sit next to them in the
# evaluator folder.
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/text-to-code/evaluator/bleu.py
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/text-to-code/evaluator/evaluator.py
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/text-to-code/evaluator/answers.json
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/text-to-code/evaluator/predictions.txt

--2021-04-13 07:20:27--  https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/text-to-code/evaluator/bleu.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4877 (4.8K) [text/plain]
Saving to: ‘bleu.py’


2021-04-13 07:20:27 (59.1 MB/s) - ‘bleu.py’ saved [4877/4877]

--2021-04-13 07:20:27--  https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/text-to-code/evaluator/evaluator.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1390 (1.4K) [text/plain]
Saving to: ‘evaluator.py’


2021-04-13 07:20:27 (

In [None]:
# Run the downloaded evaluator on the downloaded answers.json / predictions.txt
# (presumably a smoke test of the scoring script before scoring our own output).
!python evaluator.py -a=answers.json -p=predictions.txt
# !mkdir output

INFO:__main__:BLEU: 16.68, EM: 17.0


In [None]:
import json
for task in tasks:
  lang = task[1]
  with open(f'{test_folder}/code_generation_valid.tsv', 'r') as test_file:
    with open(f'{test_folder}/answers.json', 'w') as out_test:
      for line in test_file:
        line = line.strip().split('\t')
        nl = line[0].strip()
        code = line[1].strip().replace('SMALLER_TOKEN', '<').replace('GREATER_TOKEN', '>').replace('OPEN_SQUARE_TOKEN', '[').replace('CLOSE_SQUARE_TOKEN', ']').replace('OPEN_CURLY_TOKEN', '{').replace('CLOSE_CURLY_TOKEN', '}').replace('EXPONENTIAL_TOKEN', '^').replace('SHARP_TOKEN', '#').replace('DOLLAR_TOKEN', '$').replace('UNK_TOKEN', '`')
        li = {"code": code, "nl": nl}
        out_test.write(json.dumps(li))
        out_test.write('\n')
  with open(f'{test_folder}/predict_output.tsv-{checkpoint}') as predict_output:
    with open(f'{test_folder}/predictions.txt', 'w') as predict_file:
      for line in predict_output:
        line = line.strip().replace('SMALLER_TOKEN', '<').replace('GREATER_TOKEN', '>').replace('OPEN_SQUARE_TOKEN', '[').replace('CLOSE_SQUARE_TOKEN', ']').replace('OPEN_CURLY_TOKEN', '{').replace('CLOSE_CURLY_TOKEN', '}').replace('EXPONENTIAL_TOKEN', '^').replace('SHARP_TOKEN', '#').replace('DOLLAR_TOKEN', '$').replace('UNK_TOKEN', '`')
        predict_file.write(line)
        predict_file.write('\n')
  print(f'language: {lang}')
  !python evaluator.py -a={test_folder}/answers.json -p={test_folder}/predictions.txt
  print('\n')
  print('\n')

language: java
INFO:__main__:BLEU: 33.43, EM: 17.25




