# Code summarization: fine-tuning T5 on all CodeSearchNet languages

## Set up

In [None]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

Installing dependencies...
[K     |████████████████████████████████| 235kB 5.1MB/s 
[K     |████████████████████████████████| 368kB 8.4MB/s 
[K     |████████████████████████████████| 1.2MB 10.4MB/s 
[K     |████████████████████████████████| 2.1MB 20.5MB/s 
[K     |████████████████████████████████| 3.4MB 16.6MB/s 
[K     |████████████████████████████████| 61kB 5.3MB/s 
[K     |████████████████████████████████| 3.9MB 46.2MB/s 
[K     |████████████████████████████████| 901kB 45.9MB/s 
[K     |████████████████████████████████| 3.3MB 32.8MB/s 
[?25h

In [None]:
# Set to False to skip the Colab/TPU/GCS specific setup below.
ON_CLOUD = True


if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  TPU_TOPOLOGY = "v2-8"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise a regular Exception subclass: BaseException is reserved for
    # interpreter-level exits and slips past `except Exception` handlers.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook uses the TF1-style API exposed by tf.compat.v1.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Let INFO-level records through on the root logger ...
  py_logging.getLogger().setLevel('INFO')
  # ... but stop TensorFlow's own logger from forwarding its records to the
  # root logger's handlers as well.
  tf.get_logger().propagate = False

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch TensorFlow's logging verbosity to `level`.

  The verbosity that was active on entry is restored on exit, including when
  the body of the `with` block raises.

  Args:
    level: verbosity value passed straight to `tf.logging.set_verbosity`.

  Yields:
    None.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without the finally, an exception inside the with-block would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)


Setting up GCS access...
Running on TPU: grpc://10.65.72.234:8470
Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
# Show which t5 release was actually installed, so the run can be reproduced.
print(t5.__version__)

0.9.0


## Register codesearchnet Tasks

### java


In [None]:
def dumping_dataset_java(split, shuffle_files = False):
  """Build the raw Java dataset for `split`.

  Only the 'train' split is backed by a file; any other split name yields a
  dataset built from an empty file list.

  Args:
    split: name of the requested split.
    shuffle_files: ignored (deleted immediately).

  Returns:
    A tf.data.Dataset of {"input": ..., "target": ...} dicts, one per line,
    where each line is parsed as two tab-separated string fields (quote
    characters are not treated specially).
  """
  del shuffle_files
  train_files = ['gs://cotext/data/codesearchnet/java/train_encode.tsv']
  lines = tf.data.TextLineDataset(train_files if split == 'train' else [])

  # Split each "<t1>\t<t2>" line into an (input, target) tuple.
  parse_tsv = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)
  columns = lines.map(
      parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two columns.
  return columns.map(lambda *fields: dict(zip(["input", "target"], fields)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into {"inputs", "targets"} features.

  Despite the name, nothing NER-specific happens here: "java: " is prepended
  to the input and the target is passed through unchanged.

  Other language sections of this notebook define a function with the same
  name; a registration cell picks up whichever definition ran last.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    The result of mapping the conversion over `ds`.
  """
  task_prefix = "java: "

  def _to_features(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["input"]]),
        "targets": example["target"],
    }

  return ds.map(
      _to_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the raw examples. The loop reads the "train" split (the only split
# dumping_dataset_java backs with a file), so the label says so.
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_java("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'public ImageSource apply ( ImageSource input )  OPEN_CURLY_TOKEN  final int  OPEN_SQUARE_TOKEN   CLOSE_SQUARE_TOKEN   OPEN_SQUARE_TOKEN   CLOSE_SQUARE_TOKEN  pixelMatrix = new int  OPEN_SQUARE_TOKEN  3  CLOSE_SQUARE_TOKEN   OPEN_SQUARE_TOKEN  3  CLOSE_SQUARE_TOKEN  ; int w = input . getWidth ( ) ; int h = input . getHeight ( ) ; int  OPEN_SQUARE_TOKEN   CLOSE_SQUARE_TOKEN   OPEN_SQUARE_TOKEN   CLOSE_SQUARE_TOKEN  output = new int  OPEN_SQUARE_TOKEN  h  CLOSE_SQUARE_TOKEN   OPEN_SQUARE_TOKEN  w  CLOSE_SQUARE_TOKEN  ; for ( int j = 1 ; j  SMALLER_TOKEN  h - 1 ; j ++ )  OPEN_CURLY_TOKEN  for ( int i = 1 ; i  SMALLER_TOKEN  w - 1 ; i ++ )  OPEN_CURLY_TOKEN  pixelMatrix  OPEN_SQUARE_TOKEN  0  CLOSE_SQUARE_TOKEN   OPEN_SQUARE_TOKEN  0  CLOSE_SQUARE_TOKEN  = input . getR ( i - 1 , j - 1 ) ; pixelMatrix  OPEN_SQUARE_TOKEN  0  CLOSE_SQUARE_TOKEN   OPEN_SQUARE_TOKEN  1  CLOSE_SQUARE_TOKEN  = input . getRGB ( i - 1 , j ) ; pixelMatrix  OPEN_SQUARE_TOKE

In [None]:
# Drop any earlier "java" registration before adding it again
# (presumably so this cell can be re-run without a name clash).
t5.data.TaskRegistry.remove("java")
t5.data.TaskRegistry.add(
    "java",
    dataset_fn=dumping_dataset_java,        # produces the raw tf.data.Dataset
    splits=["train", "validation"],
    text_preprocessor=[ner_preprocessor],   # the definition from the cell above
    metric_fns=[
        t5.evaluation.metrics.accuracy,
        t5.evaluation.metrics.sequence_accuracy,
    ],
)

<t5.data.dataset_providers.FunctionTask at 0x7f43be582790>

### php


In [None]:
def dumping_dataset_php(split, shuffle_files = False):
  """Build the raw PHP dataset for `split`.

  Only the 'train' split is backed by a file; any other split name yields a
  dataset built from an empty file list.

  Args:
    split: name of the requested split.
    shuffle_files: ignored (deleted immediately).

  Returns:
    A tf.data.Dataset of {"input": ..., "target": ...} dicts, one per line,
    where each line is parsed as two tab-separated string fields (quote
    characters are not treated specially).
  """
  del shuffle_files
  train_files = ['gs://cotext/data/codesearchnet/php/train_encode.tsv']
  lines = tf.data.TextLineDataset(train_files if split == 'train' else [])

  # Split each "<t1>\t<t2>" line into an (input, target) tuple.
  parse_tsv = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)
  columns = lines.map(
      parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two columns.
  return columns.map(lambda *fields: dict(zip(["input", "target"], fields)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into {"inputs", "targets"} features.

  Despite the name, nothing NER-specific happens here: "php: " is prepended
  to the input and the target is passed through unchanged.

  Other language sections of this notebook define a function with the same
  name; a registration cell picks up whichever definition ran last.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    The result of mapping the conversion over `ds`.
  """
  task_prefix = "php: "

  def _to_features(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["input"]]),
        "targets": example["target"],
    }

  return ds.map(
      _to_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the raw examples. The loop reads the "train" split (the only split
# dumping_dataset_php backs with a file), so the label says so.
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_php("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b"public function onChannelPreDelete ( ResourceControllerEvent  DOLLAR_TOKEN  event ) : void  OPEN_CURLY_TOKEN   DOLLAR_TOKEN  channel =  DOLLAR_TOKEN  event - GREATER_TOKEN  getSubject ( ) ; if ( !  DOLLAR_TOKEN  channel instanceof ChannelInterface )  OPEN_CURLY_TOKEN  throw new UnexpectedTypeException (  DOLLAR_TOKEN  channel , ChannelInterface :: class ) ;  CLOSE_CURLY_TOKEN   DOLLAR_TOKEN  results =  DOLLAR_TOKEN  this - GREATER_TOKEN  channelRepository - GREATER_TOKEN  findBy (  OPEN_SQUARE_TOKEN  'enabled' = GREATER_TOKEN  true  CLOSE_SQUARE_TOKEN  ) ; if ( !  DOLLAR_TOKEN  results || ( count (  DOLLAR_TOKEN  results ) === 1 && current (  DOLLAR_TOKEN  results ) ===  DOLLAR_TOKEN  channel ) )  OPEN_CURLY_TOKEN   DOLLAR_TOKEN  event - GREATER_TOKEN  stop ( 'sylius.channel.delete_error' ) ;  CLOSE_CURLY_TOKEN   CLOSE_CURLY_TOKEN ", 'target': b'Prevent channel deletion if no more channels enabled .'}
{'input': b'public function getTaxTotal 

In [None]:
# Drop any earlier "php" registration before adding it again
# (presumably so this cell can be re-run without a name clash).
t5.data.TaskRegistry.remove("php")
t5.data.TaskRegistry.add(
    "php",
    dataset_fn=dumping_dataset_php,         # produces the raw tf.data.Dataset
    splits=["train", "validation"],
    text_preprocessor=[ner_preprocessor],   # the definition from the cell above
    metric_fns=[
        t5.evaluation.metrics.accuracy,
        t5.evaluation.metrics.sequence_accuracy,
    ],
)

<t5.data.dataset_providers.FunctionTask at 0x7f43be592f90>

### js


In [None]:
def dumping_dataset_js(split, shuffle_files = False):
  """Build the raw JavaScript dataset for `split`.

  Only the 'train' split is backed by a file; any other split name yields a
  dataset built from an empty file list.

  Args:
    split: name of the requested split.
    shuffle_files: ignored (deleted immediately).

  Returns:
    A tf.data.Dataset of {"input": ..., "target": ...} dicts, one per line,
    where each line is parsed as two tab-separated string fields (quote
    characters are not treated specially).
  """
  del shuffle_files
  train_files = ['gs://cotext/data/codesearchnet/javascript/train_encode.tsv']
  lines = tf.data.TextLineDataset(train_files if split == 'train' else [])

  # Split each "<t1>\t<t2>" line into an (input, target) tuple.
  parse_tsv = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)
  columns = lines.map(
      parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two columns.
  return columns.map(lambda *fields: dict(zip(["input", "target"], fields)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into {"inputs", "targets"} features.

  Despite the name, nothing NER-specific happens here. Both fields have a
  fixed set of punctuation characters replaced by placeholder words (for
  example "<" becomes "SMALLER_TOKEN"), and "javascript: " is prepended to
  the input.

  Other language sections of this notebook define a function with the same
  name; a registration cell picks up whichever definition ran last.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    The result of mapping the conversion over `ds`.
  """
  # (regex, placeholder) pairs, applied in this order. Raw strings keep the
  # backslashes without relying on Python passing unknown escape sequences
  # such as "\<" through unchanged (which triggers an invalid-escape warning).
  replacements = (
      (r"\<", "SMALLER_TOKEN"),
      (r"\>", "GREATER_TOKEN"),
      (r"\[", "OPEN_SQUARE_TOKEN"),
      (r"\]", "CLOSE_SQUARE_TOKEN"),
      (r"\{", "OPEN_CURLY_TOKEN"),
      (r"\}", "CLOSE_CURLY_TOKEN"),
      (r"\^", "EXPONENTIAL_TOKEN"),
      (r"\#", "SHARP_TOKEN"),
      (r"\$", "DOLLAR_TOKEN"),
      (r"\`", "UNK_TOKEN"),
  )

  def normalize_text(text):
    """Apply every placeholder replacement to `text`."""
    for pattern, placeholder in replacements:
      text = tf.strings.regex_replace(text, pattern, placeholder)
    return text

  def to_inputs_and_targets(ex):
    """Map {"input": ..., "target": ...} -> {"inputs": ..., "targets": ...}."""
    return {
        "inputs": tf.strings.join(
            ["javascript: ", normalize_text(ex["input"])]),
        "targets": normalize_text(ex["target"]),
    }

  return ds.map(to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the raw examples. The loop reads the "train" split (the only split
# dumping_dataset_js backs with a file), so the label says so.
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_js("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'function ( state , action )  OPEN_CURLY_TOKEN  return _ . defaults (  OPEN_CURLY_TOKEN  isValidating : action . isValidating , lastAction : IS_VALIDATING  CLOSE_CURLY_TOKEN  , state )  CLOSE_CURLY_TOKEN ', 'target': b'Update is validating result'}
{'input': b'function addWidgetForFilter ( view , filter , editModeHint )  OPEN_CURLY_TOKEN  var gridster = view . _widgetsGridster ; var row = filter . row || 1 ; var col = filter . col || 1 ; var sizeX = filter . size_x || 3 ; var sizeY = filter . size_y || 3 ; var el = gridster . add_widget ( \' SMALLER_TOKEN div class="widgetOuterFrame" GREATER_TOKEN  SMALLER_TOKEN /div GREATER_TOKEN \' , sizeX , sizeY , col , row ) ; var frameView = new WidgetFrameView (  OPEN_CURLY_TOKEN  model : filter  CLOSE_CURLY_TOKEN  ) ; view . renderSubview ( frameView , el  OPEN_SQUARE_TOKEN  0  CLOSE_SQUARE_TOKEN  ) ; frameView . renderContent ( ) ; frameView . gridsterHook = el  OPEN_SQUARE_TOKEN  0  CLOSE_SQUARE_TOK

In [None]:
# Drop any earlier "js" registration before adding it again
# (presumably so this cell can be re-run without a name clash).
t5.data.TaskRegistry.remove("js")
t5.data.TaskRegistry.add(
    "js",
    dataset_fn=dumping_dataset_js,          # produces the raw tf.data.Dataset
    splits=["train", "validation"],
    text_preprocessor=[ner_preprocessor],   # the definition from the cell above
    metric_fns=[
        t5.evaluation.metrics.accuracy,
        t5.evaluation.metrics.sequence_accuracy,
    ],
)

<t5.data.dataset_providers.FunctionTask at 0x7f438b8df110>

In [None]:
# Fetch the registered "js" task (a Task object, despite the variable name)
# and materialize its preprocessed "train" split with inputs and targets
# limited to 128 tokens each.
dumping_dataset = t5.data.TaskRegistry.get("js")
ds = dumping_dataset.get_dataset(split="train", sequence_length={"inputs": 128, "targets": 128})
# The examples come from the "train" split, so label them accordingly.
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(20)):
  print(ex)

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


A few preprocessed validation examples...
{'inputs_pretokenized': b"javascript: function normalize ( obj , caseType = 'camel' )  OPEN_CURLY_TOKEN  let ret = obj ; const method = methods  OPEN_SQUARE_TOKEN  caseType  CLOSE_SQUARE_TOKEN  ; if ( Array . isArray ( obj ) )  OPEN_CURLY_TOKEN  ret =  OPEN_SQUARE_TOKEN   CLOSE_SQUARE_TOKEN  ; let i = 0 ; while ( i  SMALLER_TOKEN  obj . length )  OPEN_CURLY_TOKEN  ret . push ( normalize ( obj  OPEN_SQUARE_TOKEN  i  CLOSE_SQUARE_TOKEN  , caseType ) ) ; ++ i ;  CLOSE_CURLY_TOKEN   CLOSE_CURLY_TOKEN  else if ( isPlainObject ( obj ) )  OPEN_CURLY_TOKEN  ret =  OPEN_CURLY_TOKEN   CLOSE_CURLY_TOKEN  ; for ( const k in obj )  OPEN_CURLY_TOKEN  ret  OPEN_SQUARE_TOKEN  method ( k )  CLOSE_SQUARE_TOKEN  = normalize ( obj  OPEN_SQUARE_TOKEN  k  CLOSE_SQUARE_TOKEN  , caseType ) ;  CLOSE_CURLY_TOKEN   CLOSE_CURLY_TOKEN  return ret ;  CLOSE_CURLY_TOKEN ", 'inputs': array([    3, 27578, 11815,    10,  1681,  1389,  1737,    41,     3,
          32,   115,   3

### ruby


In [None]:
def dumping_dataset_ruby(split, shuffle_files = False):
  """Build the raw Ruby dataset for `split`.

  Only the 'train' split is backed by a file; any other split name yields a
  dataset built from an empty file list.

  Args:
    split: name of the requested split.
    shuffle_files: ignored (deleted immediately).

  Returns:
    A tf.data.Dataset of {"input": ..., "target": ...} dicts, one per line,
    where each line is parsed as two tab-separated string fields (quote
    characters are not treated specially).
  """
  del shuffle_files
  train_files = ['gs://cotext/data/codesearchnet/ruby/train_encode.tsv']
  lines = tf.data.TextLineDataset(train_files if split == 'train' else [])

  # Split each "<t1>\t<t2>" line into an (input, target) tuple.
  parse_tsv = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)
  columns = lines.map(
      parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two columns.
  return columns.map(lambda *fields: dict(zip(["input", "target"], fields)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into {"inputs", "targets"} features.

  Despite the name, nothing NER-specific happens here: "ruby: " is prepended
  to the input and the target is passed through unchanged.

  Other language sections of this notebook define a function with the same
  name; a registration cell picks up whichever definition ran last.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    The result of mapping the conversion over `ds`.
  """
  task_prefix = "ruby: "

  def _to_features(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["input"]]),
        "targets": example["target"],
    }

  return ds.map(
      _to_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the raw examples. The loop reads the "train" split (the only split
# dumping_dataset_ruby backs with a file), so the label says so.
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_ruby("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'def render_body ( context , options ) if options . key? ( :partial )  OPEN_SQUARE_TOKEN  render_partial ( context , options )  CLOSE_SQUARE_TOKEN  else StreamingTemplateRenderer . new ( @lookup_context ) . render ( context , options ) end end', 'target': b'Render but returns a valid Rack body . If fibers are defined we return a streaming body that renders the template piece by piece .'}
{'input': b'def attribute_missing ( match , * args , & block ) __send__ ( match . target , match . attr_name , * args , & block ) end', 'target': b'+ attribute_missing + is like + method_missing + but for attributes . When + method_missing + is called we check to see if there is a matching attribute method . If so we tell + attribute_missing + to dispatch the attribute . This method can be overloaded to customize the behavior .'}
{'input': b'def matched_attribute_method ( method_name ) matches = self . class . send ( :attribute_method_matchers_matching , meth

In [None]:
# Drop any earlier "ruby" registration before adding it again
# (presumably so this cell can be re-run without a name clash).
t5.data.TaskRegistry.remove("ruby")
t5.data.TaskRegistry.add(
    "ruby",
    dataset_fn=dumping_dataset_ruby,        # produces the raw tf.data.Dataset
    splits=["train", "validation"],
    text_preprocessor=[ner_preprocessor],   # the definition from the cell above
    metric_fns=[
        t5.evaluation.metrics.accuracy,
        t5.evaluation.metrics.sequence_accuracy,
    ],
)

<t5.data.dataset_providers.FunctionTask at 0x7f438846a850>

### go


In [None]:
def dumping_dataset_go(split, shuffle_files = False):
  """Build the raw Go dataset for `split`.

  Only the 'train' split is backed by a file; any other split name yields a
  dataset built from an empty file list.

  Args:
    split: name of the requested split.
    shuffle_files: ignored (deleted immediately).

  Returns:
    A tf.data.Dataset of {"input": ..., "target": ...} dicts, one per line,
    where each line is parsed as two tab-separated string fields (quote
    characters are not treated specially).
  """
  del shuffle_files
  train_files = ['gs://cotext/data/codesearchnet/go/train_encode.tsv']
  lines = tf.data.TextLineDataset(train_files if split == 'train' else [])

  # Split each "<t1>\t<t2>" line into an (input, target) tuple.
  parse_tsv = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)
  columns = lines.map(
      parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two columns.
  return columns.map(lambda *fields: dict(zip(["input", "target"], fields)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into {"inputs", "targets"} features.

  Despite the name, nothing NER-specific happens here: "go: " is prepended
  to the input and the target is passed through unchanged.

  Other language sections of this notebook define a function with the same
  name; a registration cell picks up whichever definition ran last.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    The result of mapping the conversion over `ds`.
  """
  task_prefix = "go: "

  def _to_features(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["input"]]),
        "targets": example["target"],
    }

  return ds.map(
      _to_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the raw examples. The loop reads the "train" split (the only split
# dumping_dataset_go backs with a file), so the label says so.
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_go("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'func getAllDepTypes ( )  OPEN_SQUARE_TOKEN   CLOSE_SQUARE_TOKEN  string  OPEN_CURLY_TOKEN  depTypes := make (  OPEN_SQUARE_TOKEN   CLOSE_SQUARE_TOKEN  string , 0 , len ( cmds ) )  NEW_LINE  for depType := range cmds  OPEN_CURLY_TOKEN  depTypes = append ( depTypes , depType )  NEW_LINE   CLOSE_CURLY_TOKEN   NEW_LINE  sort . Strings ( depTypes )  NEW_LINE  return depTypes  NEW_LINE   CLOSE_CURLY_TOKEN ', 'target': b'getAllDepTypes returns a sorted list of names of all dep type commands .'}
{'input': b'func getIoProgressReader ( label string , res * http . Response ) io . Reader  OPEN_CURLY_TOKEN  prefix := "Downloading " + label  NEW_LINE  fmtBytesSize := 18  NEW_LINE  barSize := int64 ( 80 - len ( prefix ) - fmtBytesSize )  NEW_LINE  bar := ioprogress . DrawTextFormatBarForW ( barSize , os . Stderr )  NEW_LINE  fmtfunc := func ( progress , total int64 ) string  OPEN_CURLY_TOKEN  if total == - 1  OPEN_CURLY_TOKEN  return fmt . Sprintf ( "%s: %

In [None]:
# Drop any earlier "go" registration before adding it again
# (presumably so this cell can be re-run without a name clash).
t5.data.TaskRegistry.remove("go")
t5.data.TaskRegistry.add(
    "go",
    dataset_fn=dumping_dataset_go,          # produces the raw tf.data.Dataset
    splits=["train", "validation"],
    text_preprocessor=[ner_preprocessor],   # the definition from the cell above
    metric_fns=[
        t5.evaluation.metrics.accuracy,
        t5.evaluation.metrics.sequence_accuracy,
    ],
)

<t5.data.dataset_providers.FunctionTask at 0x7f43882ddd50>

### python


In [None]:
def dumping_dataset_python(split, shuffle_files = False):
  """Build the raw Python dataset for `split`.

  Only the 'train' split is backed by a file; any other split name yields a
  dataset built from an empty file list.

  Args:
    split: name of the requested split.
    shuffle_files: ignored (deleted immediately).

  Returns:
    A tf.data.Dataset of {"input": ..., "target": ...} dicts, one per line,
    where each line is parsed as two tab-separated string fields (quote
    characters are not treated specially).
  """
  del shuffle_files
  train_files = ['gs://cotext/data/codesearchnet/python/train_encode.tsv']
  lines = tf.data.TextLineDataset(train_files if split == 'train' else [])

  # Split each "<t1>\t<t2>" line into an (input, target) tuple.
  parse_tsv = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)
  columns = lines.map(
      parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two columns.
  return columns.map(lambda *fields: dict(zip(["input", "target"], fields)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into {"inputs", "targets"} features.

  Despite the name, nothing NER-specific happens here: "python: " is prepended
  to the input and the target is passed through unchanged.

  Other language sections of this notebook define a function with the same
  name; a registration cell picks up whichever definition ran last.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    The result of mapping the conversion over `ds`.
  """
  task_prefix = "python: "

  def _to_features(example):
    return {
        "inputs": tf.strings.join([task_prefix, example["input"]]),
        "targets": example["target"],
    }

  return ds.map(
      _to_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the raw examples. The loop reads the "train" split (the only split
# dumping_dataset_python backs with a file), so the label says so.
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_python("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'def split_phylogeny ( p , level = "s" ) : level = level + "__" result = p . split ( level ) return result  OPEN_SQUARE_TOKEN  0  CLOSE_SQUARE_TOKEN  + level + result  OPEN_SQUARE_TOKEN  1  CLOSE_SQUARE_TOKEN  . split ( ";" )  OPEN_SQUARE_TOKEN  0  CLOSE_SQUARE_TOKEN ', 'target': b'Return either the full or truncated version of a QIIME - formatted taxonomy string .'}
{'input': b'def ensure_dir ( d ) : if not os . path . exists ( d ) : try : os . makedirs ( d ) except OSError as oe : if os . errno == errno . ENOENT : msg = twdd ( ) return msg . format ( d ) else : msg = twdd ( ) return msg . format ( d , oe . strerror )', 'target': b'Check to make sure the supplied directory path does not exist if so create it . The method catches OSError exceptions and returns a descriptive message instead of re - raising the error .'}
{'input': b'def file_handle ( fnh , mode = "rU" ) : handle = None if isinstance ( fnh , file ) : if fnh . closed : raise Valu

In [None]:
# Drop any earlier "python" registration before adding it again
# (presumably so this cell can be re-run without a name clash).
t5.data.TaskRegistry.remove("python")
t5.data.TaskRegistry.add(
    "python",
    dataset_fn=dumping_dataset_python,      # produces the raw tf.data.Dataset
    splits=["train", "validation"],
    text_preprocessor=[ner_preprocessor],   # the definition from the cell above
    metric_fns=[
        t5.evaluation.metrics.accuracy,
        t5.evaluation.metrics.sequence_accuracy,
    ],
)

<t5.data.dataset_providers.FunctionTask at 0x7f438860c3d0>

## Mixtures

In [None]:
# Combine the six per-language tasks into one mixture, each with rate 1.0.
# Any earlier "all_mix" registration is dropped first.
t5.data.MixtureRegistry.remove("all_mix")
t5.data.MixtureRegistry.add(
    "all_mix",
    ['go', 'ruby', 'js', 'php', 'java', 'python'],
    default_rate=1.0,
)

<t5.seqio.dataset_providers.Mixture at 0x7f43882c8650>

## Define Model

In [None]:
# Using pretrained_models from wiki + books
MODEL_SIZE = "base"

PRETRAINED_DIR = "gs://cotext/cc/"

MODEL_DIR = "gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1"
MODEL_DIR = os.path.join(MODEL_DIR, MODEL_SIZE)


# Set parallelism and batch size to fit on v2-8 TPU (if possible).
# Limit number of checkpoints to fit within 5GB (if possible).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)
# The models from our paper are based on the Mesh Tensorflow Transformer.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    learning_rate_schedule=0.001,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)


## Finetune

In [None]:
# Number of fine-tuning steps to run on top of the pretrained checkpoint.
FINETUNE_STEPS = 45_000

# Fine-tune on the mixture of all six language tasks.
model.finetune(
    mixture_or_task_name="all_mix",
    pretrained_model_dir=PRETRAINED_DIR,
    finetune_steps=FINETUNE_STEPS,
)

## Predict

In [None]:
# One [dataset, language] pair per language to run prediction for.
tasks = [
    ['codesearchnet', language]
    for language in ('python', 'java', 'javascript', 'go', 'php', 'ruby')
]
output_dir = "codesummarization_code_all_codesearchnet_v1"
# Base name (without ".tsv") of the per-language test file.
test_file = 'test'

In [None]:
for task in tasks:
  lang = task[1]
  !mkdir {lang}
  !gsutil cp gs://cotext/data/{task[0]}/{lang}/{test_file}.tsv {lang}/
  with open(f'{lang}/{test_file}.tsv', 'r') as file:
    with open(f'{lang}/predict_input.tsv', 'w') as predict_input:
      with open(f'{lang}/actual_output.tsv', 'w') as actual_output:
        for line in file:
          line = line.strip().split('\t')
          input = line[0].strip()
          actual = line[1].strip()

          predict_input.write(f'{lang}: {input}\n')
          actual_output.write(f'{actual}\n')

Copying gs://t5_training/t5-data/code_data/codesearchnet/python/test.tsv...
/ [1 files][  7.9 MiB/  7.9 MiB]                                                
Operation completed over 1 objects/7.9 MiB.                                      
Copying gs://t5_training/t5-data/code_data/codesearchnet/java/test.tsv...
/ [1 files][  5.8 MiB/  5.8 MiB]                                                
Operation completed over 1 objects/5.8 MiB.                                      
Copying gs://t5_training/t5-data/code_data/codesearchnet/javascript/test.tsv...
/ [1 files][  1.8 MiB/  1.8 MiB]                                                
Operation completed over 1 objects/1.8 MiB.                                      
Copying gs://t5_training/t5-data/code_data/codesearchnet/go/test.tsv...
/ [1 files][  3.9 MiB/  3.9 MiB]                                                
Operation completed over 1 objects/3.9 MiB.                                      
Copying gs://t5_training/t5-data/code_data/cod

In [None]:
import tensorflow.compat.v1 as tf


def _checkpoint_step(path):
  """Return the numeric suffix after the last "-" in `path`, or -1 if none."""
  suffix = path.rsplit("-", 1)[-1]
  return int(suffix) if suffix.isdigit() else -1


# Batch size used for prediction.
# NOTE(review): the previous comment called 8 the minimum for the *small*
# model with parallelism 1, but MODEL_SIZE is "base" here - confirm.
model.batch_size = 8

for t in tasks:
  lang = t[1]
  predict_inputs_path = f'{lang}/predict_input.tsv'
  predict_outputs_path = f'{lang}/predict_output.tsv'

  print(predict_inputs_path)
  print(predict_outputs_path)
  # Only show ERROR-level TF logs while predicting.
  with tf_verbosity_level('ERROR'):
    model.predict(
        input_file=predict_inputs_path,
        output_file=predict_outputs_path,
        checkpoint_steps=-1,
        temperature=0,
    )

  # The output filename has the checkpoint step appended, so glob for it and
  # pick the highest step. The comparison is numeric: a plain string sort
  # would rank "...-999000" after "...-1245000".
  prediction_files = tf.io.gfile.glob(predict_outputs_path + "*")
  latest_prediction = max(prediction_files, key=_checkpoint_step)
  print("Predicted task : " + lang)
  print("\nPredictions using checkpoint %s:\n" % latest_prediction.split("-")[-1])

python/predict_input.tsv
python/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin


Predicted task : python

Predictions using checkpoint 1245000:

java/predict_input.tsv
java/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin
INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin


Predicted task : java

Predictions using checkpoint 1245000:

javascript/predict_input.tsv
javascript/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin


Predicted task : javascript

Predictions using checkpoint 1245000:

go/predict_input.tsv
go/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin


Predicted task : go

Predictions using checkpoint 1245000:

php/predict_input.tsv
php/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_code_all_codesearchnet_v1/base/operative_config.gin


Predicted task : php

Predictions using checkpoint 1245000:

ruby/predict_input.tsv
ruby/predict_output.tsv




Predicted task : ruby

Predictions using checkpoint 1245000:



## Scoring

In [None]:
# Same [dataset, language] pairs as in the Predict section (presumably
# repeated so that scoring can be run on its own).
tasks = [
    ['codesearchnet', language]
    for language in 'python java javascript go php ruby'.split()
]
output_dir = "codesummarization_code_all_codesearchnet_v1"
test_file = 'test'
# Step suffix of the prediction files to score.
checkpoint = '1245000'

In [None]:
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/evaluator.py
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/predictions.txt
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/reference.txt

--2021-04-23 05:32:33--  https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/evaluator.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6916 (6.8K) [text/plain]
Saving to: ‘evaluator.py’


2021-04-23 05:32:33 (70.0 MB/s) - ‘evaluator.py’ saved [6916/6916]

--2021-04-23 05:32:33--  https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/predictions.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 196 [text/plain]
Saving to: ‘predictions.txt’


2021-04-

In [None]:
!python evaluator.py reference.txt < predictions.txt
!mkdir output

Total: 5
9.554726113590661


In [None]:
for task in tasks:
  lang = task[1]
  with open(f'{lang}/predict_output.tsv-{checkpoint}') as predict_output:
    with open(f'{lang}/actual_output.tsv') as actual_output:
      with open(f'output/{lang}_reference.txt', 'w') as reference:
        with open(f'output/{lang}_predictions.txt', 'w') as predictions:
          for idx, (line1, line2) in enumerate(zip(predict_output, actual_output)):
            line1 = line1.replace('⁇', '')
            reference.write(f'{idx}\t{line2}')
            predictions.write(f'{idx}\t{line1}')
          print(f'language: {lang}')
          !python evaluator.py output/{lang}_reference.txt < output/{lang}_predictions.txt
          print('\n')
        

language: python
Total: 14884
19.35622797473649


language: java
Total: 10834
18.749421575274635


language: javascript
Total: 3180
14.75286010413368


language: go
Total: 8076
18.952009265859775


language: php
Total: 13951
22.968755076406474


language: ruby
Total: 1198
13.232945480742675


