# All

## Set up

In [None]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

Installing dependencies...
[K     |████████████████████████████████| 235kB 6.7MB/s 
[K     |████████████████████████████████| 368kB 10.7MB/s 
[K     |████████████████████████████████| 2.1MB 23.0MB/s 
[K     |████████████████████████████████| 1.2MB 46.3MB/s 
[K     |████████████████████████████████| 3.9MB 47.4MB/s 
[K     |████████████████████████████████| 3.4MB 40.6MB/s 
[K     |████████████████████████████████| 61kB 5.3MB/s 
[K     |████████████████████████████████| 901kB 41.1MB/s 
[K     |████████████████████████████████| 3.3MB 42.0MB/s 
[?25h

In [None]:
ON_CLOUD = True

# The model-definition cell reads TPU_ADDRESS and TPU_TOPOLOGY unconditionally,
# so give both a value even when ON_CLOUD is False (avoids a NameError there).
TPU_ADDRESS = None
TPU_TOPOLOGY = "v3-8"

if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise an ordinary Exception subclass (not bare BaseException) and keep
    # the resolver's original error attached as the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop TensorFlow's logger from handing its records on to ancestor loggers
  # (presumably to avoid duplicated lines in the Colab output).
  tf.get_logger().propagate = False
  # Set the root Python logger's threshold to INFO.
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch TensorFlow's logging verbosity inside a `with` block.

  Args:
    level: verbosity value handed to `tf.logging.set_verbosity` on entry.

  Yields:
    None. The verbosity that was active on entry is restored on exit,
    including when the body of the `with` block raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without try/finally an exception in the `with` body would skip the
    # restore and leave the temporary verbosity in place for later cells.
    tf.logging.set_verbosity(og_level)


Setting up GCS access...
Running on TPU: grpc://10.88.248.130:8470
Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
# print(mesh_tensorflow.__version__)

In [None]:
print(t5.__version__)

0.9.0


In [None]:
# import gin
# import subprocess
# gin.parse_config_file(
#         'gs://t5-data/pretrained_models/base/operative_config.gin'
#     )


## Register codesearchnet Tasks

### java


In [None]:
def dumping_dataset_java(split, shuffle_files = False):
    """Build the raw CodeSearchNet Java dataset for one split.

    Only 'train' is backed by a file (train.tsv on GCS); any other split name
    produces a dataset built from an empty file list.

    Args:
      split: split name; 'train' selects the training TSV.
      shuffle_files: ignored (deleted immediately).

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per TSV line.
    """
    del shuffle_files
    tsv_files = []
    if split == 'train':
        tsv_files.append(
            'gs://t5_training/t5-data/code_data/codesearchnet/java/train.tsv')
    lines = tf.data.TextLineDataset(tsv_files)
    # Split each tab-separated "<input> <target>" line into an (input, target)
    # tuple; quote characters are not treated as field delimiters.
    split_columns = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    columns = lines.map(
        split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Name the two columns so each element is an {"input", "target"} dict.
    return columns.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Convert raw Java examples into {"inputs", "targets"} text pairs.

  Each element's "input" string gets the literal prefix "java: " and is stored
  under "inputs"; its "target" string is copied unchanged to "targets".
  Despite the name, no named-entity processing happens here.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "java: "

  def to_inputs_and_targets(ex):
    """Prefix the source text; leave the target text untouched."""
    prefixed_source = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed_source, "targets": ex["target"]}

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

# Preview a handful of raw examples. They are read from the "train" split, so
# the label says "training" rather than "validation".
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_java("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'public ImageSource apply ( ImageSource input ) { final int [ ] [ ] pixelMatrix = new int [ 3 ] [ 3 ] ; int w = input . getWidth ( ) ; int h = input . getHeight ( ) ; int [ ] [ ] output = new int [ h ] [ w ] ; for ( int j = 1 ; j < h - 1 ; j ++ ) { for ( int i = 1 ; i < w - 1 ; i ++ ) { pixelMatrix [ 0 ] [ 0 ] = input . getR ( i - 1 , j - 1 ) ; pixelMatrix [ 0 ] [ 1 ] = input . getRGB ( i - 1 , j ) ; pixelMatrix [ 0 ] [ 2 ] = input . getRGB ( i - 1 , j + 1 ) ; pixelMatrix [ 1 ] [ 0 ] = input . getRGB ( i , j - 1 ) ; pixelMatrix [ 1 ] [ 2 ] = input . getRGB ( i , j + 1 ) ; pixelMatrix [ 2 ] [ 0 ] = input . getRGB ( i + 1 , j - 1 ) ; pixelMatrix [ 2 ] [ 1 ] = input . getRGB ( i + 1 , j ) ; pixelMatrix [ 2 ] [ 2 ] = input . getRGB ( i + 1 , j + 1 ) ; int edge = ( int ) convolution ( pixelMatrix ) ; int rgb = ( edge << 16 | edge << 8 | edge ) ; output [ j ] [ i ] = rgb ; } } MatrixSource source = new MatrixSource ( output ) ; return source ; }', 

In [None]:
# Remove any existing "java" registration before adding it again (presumably
# so this cell can be re-run in the same kernel).
t5.data.TaskRegistry.remove('java')
t5.data.TaskRegistry.add(
    "java",
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=dumping_dataset_java,
    # "validation" is declared, but dumping_dataset_java builds every
    # non-train split from an empty file list.
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    # NOTE: `ner_preprocessor` is redefined in every language section; the
    # object passed here is whichever definition ran last (the "java: " one
    # when the notebook is executed top to bottom).
    text_preprocessor=[ner_preprocessor],
    # NOTE(review): no postprocess_fn is passed here, so nothing in this call
    # lowercases targets before the metrics are computed.
    # Evaluation metrics: accuracy and sequence accuracy.
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab)),

    metric_fns=[t5.evaluation.metrics.accuracy, 
               t5.evaluation.metrics.sequence_accuracy, 
                ],
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab))
)

<t5.data.dataset_providers.FunctionTask at 0x7fcfec96d890>

### php


In [None]:
def dumping_dataset_php(split, shuffle_files = False):
    """Build the raw CodeSearchNet PHP dataset for one split.

    Only 'train' is backed by a file (train.tsv on GCS); any other split name
    produces a dataset built from an empty file list.

    Args:
      split: split name; 'train' selects the training TSV.
      shuffle_files: ignored (deleted immediately).

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per TSV line.
    """
    del shuffle_files
    tsv_files = []
    if split == 'train':
        tsv_files.append(
            'gs://t5_training/t5-data/code_data/codesearchnet/php/train.tsv')
    lines = tf.data.TextLineDataset(tsv_files)
    # Split each tab-separated "<input> <target>" line into an (input, target)
    # tuple; quote characters are not treated as field delimiters.
    split_columns = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    columns = lines.map(
        split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Name the two columns so each element is an {"input", "target"} dict.
    return columns.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Convert raw PHP examples into {"inputs", "targets"} text pairs.

  Each element's "input" string gets the literal prefix "php: " and is stored
  under "inputs"; its "target" string is copied unchanged to "targets".
  Despite the name, no named-entity processing happens here.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "php: "

  def to_inputs_and_targets(ex):
    """Prefix the source text; leave the target text untouched."""
    prefixed_source = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed_source, "targets": ex["target"]}

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

# Preview a handful of raw examples. They are read from the "train" split, so
# the label says "training" rather than "validation".
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_php("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b"public function onChannelPreDelete ( ResourceControllerEvent $ event ) : void { $ channel = $ event -> getSubject ( ) ; if ( ! $ channel instanceof ChannelInterface ) { throw new UnexpectedTypeException ( $ channel , ChannelInterface :: class ) ; } $ results = $ this -> channelRepository -> findBy ( [ 'enabled' => true ] ) ; if ( ! $ results || ( count ( $ results ) === 1 && current ( $ results ) === $ channel ) ) { $ event -> stop ( 'sylius.channel.delete_error' ) ; } }", 'target': b'Prevent channel deletion if no more channels enabled .'}
{'input': b'public function getTaxTotal ( ) : int { $ taxTotal = 0 ; foreach ( $ this -> getAdjustments ( AdjustmentInterface :: TAX_ADJUSTMENT ) as $ taxAdjustment ) { $ taxTotal += $ taxAdjustment -> getAmount ( ) ; } foreach ( $ this -> units as $ unit ) { $ taxTotal += $ unit -> getTaxTotal ( ) ; } return $ taxTotal ; }', 'target': b'Returns sum of neutral and non neutral tax adjustments on order item

In [None]:
# Remove any existing "php" registration before adding it again (presumably
# so this cell can be re-run in the same kernel).
t5.data.TaskRegistry.remove('php')
t5.data.TaskRegistry.add(
    "php",
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=dumping_dataset_php,
    # "validation" is declared, but dumping_dataset_php builds every
    # non-train split from an empty file list.
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    # NOTE: `ner_preprocessor` is redefined in every language section; the
    # object passed here is whichever definition ran last (the "php: " one
    # when the notebook is executed top to bottom).
    text_preprocessor=[ner_preprocessor],
    # NOTE(review): no postprocess_fn is passed here, so nothing in this call
    # lowercases targets before the metrics are computed.
    # Evaluation metrics: accuracy and sequence accuracy.
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab)),

    metric_fns=[t5.evaluation.metrics.accuracy, 
               t5.evaluation.metrics.sequence_accuracy, 
                ],
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab))
)

<t5.data.dataset_providers.FunctionTask at 0x7fd0afd54590>

### js


In [None]:
def dumping_dataset_js(split, shuffle_files = False):
    """Build the raw CodeSearchNet JavaScript dataset for one split.

    Only 'train' is backed by a file (train.tsv on GCS); any other split name
    produces a dataset built from an empty file list.

    Args:
      split: split name; 'train' selects the training TSV.
      shuffle_files: ignored (deleted immediately).

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per TSV line.
    """
    del shuffle_files
    tsv_files = []
    if split == 'train':
        tsv_files.append(
            'gs://t5_training/t5-data/code_data/codesearchnet/javascript/train.tsv')
    lines = tf.data.TextLineDataset(tsv_files)
    # Split each tab-separated "<input> <target>" line into an (input, target)
    # tuple; quote characters are not treated as field delimiters.
    split_columns = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    columns = lines.map(
        split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Name the two columns so each element is an {"input", "target"} dict.
    return columns.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Convert raw JavaScript examples into {"inputs", "targets"} text pairs.

  Each element's "input" string gets the literal prefix "javascript: " and is
  stored under "inputs"; its "target" string is copied unchanged to "targets".
  Despite the name, no named-entity processing happens here.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "javascript: "

  def to_inputs_and_targets(ex):
    """Prefix the source text; leave the target text untouched."""
    prefixed_source = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed_source, "targets": ex["target"]}

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

# Preview a handful of raw examples. They are read from the "train" split, so
# the label says "training" rather than "validation".
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_js("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'function ( state , action ) { return _ . defaults ( { isValidating : action . isValidating , lastAction : IS_VALIDATING } , state ) }', 'target': b'Update is validating result'}
{'input': b'function addWidgetForFilter ( view , filter , editModeHint ) { var gridster = view . _widgetsGridster ; var row = filter . row || 1 ; var col = filter . col || 1 ; var sizeX = filter . size_x || 3 ; var sizeY = filter . size_y || 3 ; var el = gridster . add_widget ( \'<div class="widgetOuterFrame"></div>\' , sizeX , sizeY , col , row ) ; var frameView = new WidgetFrameView ( { model : filter } ) ; view . renderSubview ( frameView , el [ 0 ] ) ; frameView . renderContent ( ) ; frameView . gridsterHook = el [ 0 ] ; $ ( el [ 0 ] ) . data ( \'spotWidgetFrameView\' , frameView ) ; var chartView = frameView . widget ; chartView . model . updateConfiguration ( ) ; if ( chartView . model . isConfigured ) { if ( ! filter . isInitialized ) { filter . initDataFilter

In [None]:
# Remove any existing "js" registration before adding it again (presumably
# so this cell can be re-run in the same kernel).
t5.data.TaskRegistry.remove('js')
t5.data.TaskRegistry.add(
    "js",
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=dumping_dataset_js,
    # "validation" is declared, but dumping_dataset_js builds every
    # non-train split from an empty file list.
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    # NOTE: `ner_preprocessor` is redefined in every language section; the
    # object passed here is whichever definition ran last (the "javascript: "
    # one when the notebook is executed top to bottom).
    text_preprocessor=[ner_preprocessor],
    # NOTE(review): no postprocess_fn is passed here, so nothing in this call
    # lowercases targets before the metrics are computed.
    # Evaluation metrics: accuracy and sequence accuracy.
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab)),

    metric_fns=[t5.evaluation.metrics.accuracy, 
               t5.evaluation.metrics.sequence_accuracy, 
                ],
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab))
)

<t5.data.dataset_providers.FunctionTask at 0x7fcfb6aa26d0>

### ruby


In [None]:
def dumping_dataset_ruby(split, shuffle_files = False):
    """Build the raw CodeSearchNet Ruby dataset for one split.

    Only 'train' is backed by a file (train.tsv on GCS); any other split name
    produces a dataset built from an empty file list.

    Args:
      split: split name; 'train' selects the training TSV.
      shuffle_files: ignored (deleted immediately).

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per TSV line.
    """
    del shuffle_files
    tsv_files = []
    if split == 'train':
        tsv_files.append(
            'gs://t5_training/t5-data/code_data/codesearchnet/ruby/train.tsv')
    lines = tf.data.TextLineDataset(tsv_files)
    # Split each tab-separated "<input> <target>" line into an (input, target)
    # tuple; quote characters are not treated as field delimiters.
    split_columns = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    columns = lines.map(
        split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Name the two columns so each element is an {"input", "target"} dict.
    return columns.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Convert raw Ruby examples into {"inputs", "targets"} text pairs.

  Each element's "input" string gets the literal prefix "ruby: " and is stored
  under "inputs"; its "target" string is copied unchanged to "targets".
  Despite the name, no named-entity processing happens here.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "ruby: "

  def to_inputs_and_targets(ex):
    """Prefix the source text; leave the target text untouched."""
    prefixed_source = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed_source, "targets": ex["target"]}

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

# Preview a handful of raw examples. They are read from the "train" split, so
# the label says "training" rather than "validation".
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_ruby("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'def render_body ( context , options ) if options . key? ( :partial ) [ render_partial ( context , options ) ] else StreamingTemplateRenderer . new ( @lookup_context ) . render ( context , options ) end end', 'target': b'Render but returns a valid Rack body . If fibers are defined we return a streaming body that renders the template piece by piece .'}
{'input': b'def attribute_missing ( match , * args , & block ) __send__ ( match . target , match . attr_name , * args , & block ) end', 'target': b'+ attribute_missing + is like + method_missing + but for attributes . When + method_missing + is called we check to see if there is a matching attribute method . If so we tell + attribute_missing + to dispatch the attribute . This method can be overloaded to customize the behavior .'}
{'input': b'def matched_attribute_method ( method_name ) matches = self . class . send ( :attribute_method_matchers_matching , method_name ) matches . detect { | match 

In [None]:
# Remove any existing "ruby" registration before adding it again (presumably
# so this cell can be re-run in the same kernel).
t5.data.TaskRegistry.remove('ruby')
t5.data.TaskRegistry.add(
    "ruby",
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=dumping_dataset_ruby,
    # "validation" is declared, but dumping_dataset_ruby builds every
    # non-train split from an empty file list.
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    # NOTE: `ner_preprocessor` is redefined in every language section; the
    # object passed here is whichever definition ran last (the "ruby: " one
    # when the notebook is executed top to bottom).
    text_preprocessor=[ner_preprocessor],
    # NOTE(review): no postprocess_fn is passed here, so nothing in this call
    # lowercases targets before the metrics are computed.
    # Evaluation metrics: accuracy and sequence accuracy.
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab)),

    metric_fns=[t5.evaluation.metrics.accuracy, 
               t5.evaluation.metrics.sequence_accuracy, 
                ],
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab))
)

<t5.data.dataset_providers.FunctionTask at 0x7fcfb6ad1210>

### go


In [None]:
def dumping_dataset_go(split, shuffle_files = False):
    """Build the raw CodeSearchNet Go dataset for one split.

    Only 'train' is backed by a file (train.tsv on GCS); any other split name
    produces a dataset built from an empty file list.

    Args:
      split: split name; 'train' selects the training TSV.
      shuffle_files: ignored (deleted immediately).

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per TSV line.
    """
    del shuffle_files
    tsv_files = []
    if split == 'train':
        tsv_files.append(
            'gs://t5_training/t5-data/code_data/codesearchnet/go/train.tsv')
    lines = tf.data.TextLineDataset(tsv_files)
    # Split each tab-separated "<input> <target>" line into an (input, target)
    # tuple; quote characters are not treated as field delimiters.
    split_columns = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    columns = lines.map(
        split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Name the two columns so each element is an {"input", "target"} dict.
    return columns.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Convert raw Go examples into {"inputs", "targets"} text pairs.

  Each element's "input" string gets the literal prefix "go: " and is stored
  under "inputs"; its "target" string is copied unchanged to "targets".
  Despite the name, no named-entity processing happens here.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "go: "

  def to_inputs_and_targets(ex):
    """Prefix the source text; leave the target text untouched."""
    prefixed_source = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed_source, "targets": ex["target"]}

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

# Preview a handful of raw examples. They are read from the "train" split, so
# the label says "training" rather than "validation".
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_go("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'func getAllDepTypes ( ) [ ] string { depTypes := make ( [ ] string , 0 , len ( cmds ) ) \\n for depType := range cmds { depTypes = append ( depTypes , depType ) \\n } \\n sort . Strings ( depTypes ) \\n return depTypes \\n }', 'target': b'getAllDepTypes returns a sorted list of names of all dep type commands .'}
{'input': b'func getIoProgressReader ( label string , res * http . Response ) io . Reader { prefix := "Downloading " + label \\n fmtBytesSize := 18 \\n barSize := int64 ( 80 - len ( prefix ) - fmtBytesSize ) \\n bar := ioprogress . DrawTextFormatBarForW ( barSize , os . Stderr ) \\n fmtfunc := func ( progress , total int64 ) string { if total == - 1 { return fmt . Sprintf ( "%s: %v of an unknown total size" , prefix , ioprogress . ByteUnitStr ( progress ) , ) \\n } \\n return fmt . Sprintf ( "%s: %s %s" , prefix , bar ( progress , total ) , ioprogress . DrawTextFormatBytes ( progress , total ) , ) \\n } \\n return & ioprogress . Read

In [None]:
# Remove any existing "go" registration before adding it again (presumably
# so this cell can be re-run in the same kernel).
t5.data.TaskRegistry.remove('go')
t5.data.TaskRegistry.add(
    "go",
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=dumping_dataset_go,
    # "validation" is declared, but dumping_dataset_go builds every
    # non-train split from an empty file list.
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    # NOTE: `ner_preprocessor` is redefined in every language section; the
    # object passed here is whichever definition ran last (the "go: " one
    # when the notebook is executed top to bottom).
    text_preprocessor=[ner_preprocessor],
    # NOTE(review): no postprocess_fn is passed here, so nothing in this call
    # lowercases targets before the metrics are computed.
    # Evaluation metrics: accuracy and sequence accuracy.
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab)),

    metric_fns=[t5.evaluation.metrics.accuracy, 
               t5.evaluation.metrics.sequence_accuracy, 
                ],
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab))
)

<t5.data.dataset_providers.FunctionTask at 0x7fcfb6b0f610>

### python


In [None]:
def dumping_dataset_python(split, shuffle_files = False):
    """Build the raw CodeSearchNet Python dataset for one split.

    Only 'train' is backed by a file (train.tsv on GCS); any other split name
    produces a dataset built from an empty file list.

    Args:
      split: split name; 'train' selects the training TSV.
      shuffle_files: ignored (deleted immediately).

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per TSV line.
    """
    del shuffle_files
    tsv_files = []
    if split == 'train':
        tsv_files.append(
            'gs://t5_training/t5-data/code_data/codesearchnet/python/train.tsv')
    lines = tf.data.TextLineDataset(tsv_files)
    # Split each tab-separated "<input> <target>" line into an (input, target)
    # tuple; quote characters are not treated as field delimiters.
    split_columns = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    columns = lines.map(
        split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Name the two columns so each element is an {"input", "target"} dict.
    return columns.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Convert raw Python examples into {"inputs", "targets"} text pairs.

  Each element's "input" string gets the literal prefix "python: " and is
  stored under "inputs"; its "target" string is copied unchanged to "targets".
  Despite the name, no named-entity processing happens here.

  Args:
    ds: dataset whose elements are dicts with "input" and "target" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "python: "

  def to_inputs_and_targets(ex):
    """Prefix the source text; leave the target text untouched."""
    prefixed_source = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed_source, "targets": ex["target"]}

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

# Preview a handful of raw examples. They are read from the "train" split, so
# the label says "training" rather than "validation".
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset_python("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'def split_phylogeny ( p , level = "s" ) : level = level + "__" result = p . split ( level ) return result [ 0 ] + level + result [ 1 ] . split ( ";" ) [ 0 ]', 'target': b'Return either the full or truncated version of a QIIME - formatted taxonomy string .'}
{'input': b'def ensure_dir ( d ) : if not os . path . exists ( d ) : try : os . makedirs ( d ) except OSError as oe : if os . errno == errno . ENOENT : msg = twdd ( ) return msg . format ( d ) else : msg = twdd ( ) return msg . format ( d , oe . strerror )', 'target': b'Check to make sure the supplied directory path does not exist if so create it . The method catches OSError exceptions and returns a descriptive message instead of re - raising the error .'}
{'input': b'def file_handle ( fnh , mode = "rU" ) : handle = None if isinstance ( fnh , file ) : if fnh . closed : raise ValueError ( "Input file is closed." ) handle = fnh elif isinstance ( fnh , str ) : handle = open ( fnh , mode ) re

In [None]:
# Remove any existing "python" registration before adding it again (presumably
# so this cell can be re-run in the same kernel).
t5.data.TaskRegistry.remove('python')
t5.data.TaskRegistry.add(
    "python",
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=dumping_dataset_python,
    # "validation" is declared, but dumping_dataset_python builds every
    # non-train split from an empty file list.
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    # NOTE: `ner_preprocessor` is redefined in every language section; the
    # object passed here is whichever definition ran last (the "python: " one
    # when the notebook is executed top to bottom).
    text_preprocessor=[ner_preprocessor],
    # NOTE(review): no postprocess_fn is passed here, so nothing in this call
    # lowercases targets before the metrics are computed.
    # Evaluation metrics: accuracy and sequence accuracy.
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab)),

    metric_fns=[t5.evaluation.metrics.accuracy, 
               t5.evaluation.metrics.sequence_accuracy, 
                ],
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab))
)

<t5.data.dataset_providers.FunctionTask at 0x7fcfb6a7c210>

## Mixtures

In [None]:
# Remove any existing "all_mix" registration before adding it again
# (presumably so this cell can be re-run in the same kernel).
t5.data.MixtureRegistry.remove("all_mix")
t5.data.MixtureRegistry.add(
    "all_mix",
    # Names of the six per-language tasks registered in the sections above.
    [
     'go',
     'ruby',
     'js',
     'php',
     'java',
     'python',
     ],
     # Tasks are listed by bare name, so they all share this one rate.
     default_rate=1.0
)

<t5.seqio.dataset_providers.Mixture at 0x7fcfb6a8d250>

## Define Model

In [None]:
# !gsutil -m rm -r {MODEL_DIR}

In [None]:
!gsutil -m cp gs://t5_training/models/code/codesummarization_uni_v1/base/* gs://t5_training/models/code/codesummarization_uni_v1_1/base/

Copying gs://t5_training/models/code/codesummarization_uni_v1/base/checkpoint...
/ [0/57 files][    0.0 B/  6.0 GiB]   0% Done                                   Copying gs://t5_training/models/code/codesummarization_uni_v1/base/model.ckpt-1196000.data-00001-of-00002...
/ [0/57 files][    0.0 B/  6.0 GiB]   0% Done                                   Copying gs://t5_training/models/code/codesummarization_uni_v1/base/graph.pbtxt...
/ [0/57 files][    0.0 B/  6.0 GiB]   0% Done                                   Copying gs://t5_training/models/code/codesummarization_uni_v1/base/model.ckpt-1203000.index...
/ [0/57 files][    0.0 B/  6.0 GiB]   0% Done                                   Copying gs://t5_training/models/code/codesummarization_uni_v1/base/model.ckpt-1196000.data-00000-of-00002...
/ [0/57 files][    0.0 B/  6.0 GiB]   0% Done                                   Copying gs://t5_training/models/code/codesummarization_uni_v1/base/model.ckpt-1196000.index...
Copying gs://t5_training

In [None]:
# Model size plus the two checkpoint locations used below:
#   PRETRAINED_DIR -> passed to model.finetune() as pretrained_model_dir.
#   MODEL_DIR      -> passed to MtfModel as model_dir.
MODEL_SIZE = "base"
# BASE_PRETRAINED_DIR = "gs://t5-data/pretrained_models"
BASE_PRETRAINED_DIR = "gs://t5_training/models/code/code_uni_v1"
# BASE_PRETRAINED_DIR = "gs://t5_training/models/bio/pmc_v1"
# BASE_PRETRAINED_DIR = "gs://t5_training/models/bio/pubmed_v2"
# BASE_PRETRAINED_DIR = "gs://t5_training/models/export_models/bio/pmc_v4_1200k"
PRETRAINED_DIR = os.path.join(BASE_PRETRAINED_DIR, MODEL_SIZE)
# MODEL_DIR = "gs://t5_training/models/bio/re_v2"
MODEL_DIR = "gs://t5_training/models/code/codesummarization_uni_v1_1"
# Append the model size as the final path component.
MODEL_DIR = os.path.join(MODEL_DIR, MODEL_SIZE)


# Per-size (model_parallelism, train_batch_size, keep_checkpoint_max).
# Set parallelism and batch size to fit on v2-8 TPU (if possible).
# Limit number of checkpoints to fit within 5GB (if possible).
# NOTE(review): TPU_TOPOLOGY is set to "v3-8" in the setup cell, not v2-8 —
# confirm these values are still the intended ones.
# NOTE(review): the saved finetune log reports '_keep_checkpoint_max': 5, not
# the 8 selected here for "base" — confirm the setting is honoured.
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

# Make sure the output directory exists before the model is constructed.
tf.io.gfile.makedirs(MODEL_DIR)
# The models from our paper are based on the Mesh Tensorflow Transformer.
# TPU_ADDRESS and TPU_TOPOLOGY come from the setup cell.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    # Same length limit (512) for inputs and targets.
    sequence_length={"inputs": 512, "targets": 512},
    # A single constant value rather than a schedule function.
    learning_rate_schedule=0.001,
    save_checkpoints_steps=1000,
    # The per-size checkpoint cap is only applied when ON_CLOUD is True.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)


## Finetune

In [None]:
# Step count handed to model.finetune() below.
FINETUNE_STEPS = 45000

# Fine-tune on the "all_mix" mixture (all six language tasks), starting from
# the checkpoint under PRETRAINED_DIR.
model.finetune(
    mixture_or_task_name="all_mix",
    pretrained_model_dir=PRETRAINED_DIR,
    finetune_steps=FINETUNE_STEPS
)

INFO:root:system_path_file_exists:gs://t5_training/models/code/code_uni_v1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/code_uni_v1/base/operative_config.gin
INFO:root:Skipping import of unknown module `t5.data.sentencepiece_vocabulary` (skip_unknown=True).


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_training/models/code/codesummarization_uni_v1_1/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.88.248.130:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.88.248.130:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.88.248.130:8470', '_evaluation_master': 'grpc:

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('vocab', 'model'), ('ensemble', 'ensemble'), ('batch', 'ba

## Predict

In [None]:
# Each entry is [dataset folder, language folder]; the prediction loop uses
# them to build the GCS path of the test TSV and the name of the local
# working directory.
tasks = [
    ['codesearchnet', language]
    for language in ('python', 'java', 'javascript', 'go', 'php', 'ruby')
]
output_dir = "codesummarization_uni_v1"
# Base name (without ".tsv") of the file to download for each language.
test_file = 'test'

In [None]:
for task in tasks:
  lang = task[1]
  !mkdir {lang}
  !gsutil cp gs://t5_training/t5-data/code_data/{task[0]}/{lang}/{test_file}.tsv {lang}/
  with open(f'{lang}/{test_file}.tsv', 'r') as file:
    with open(f'{lang}/predict_input.tsv', 'w') as predict_input:
      with open(f'{lang}/actual_output.tsv', 'w') as actual_output:
        for line in file:
          line = line.strip().split('\t')
          input = line[0].strip()
          actual = line[1].strip()

          predict_input.write(f'{lang}: {input}\n')
          actual_output.write(f'{actual}\n')
# for task in tasks:
#   lang = task[1]
#   !gsutil cp {lang}/actual_output.tsv gs://t5_training/t5-data/code_data/{task[0]}/{lang}/
#   !gsutil cp {lang}/predict_input.tsv gs://t5_training/t5-data/code_data/{task[0]}/{lang}/

mkdir: cannot create directory ‘python’: File exists
Copying gs://t5_training/t5-data/code_data/codesearchnet/python/test.tsv...
/ [1 files][  7.9 MiB/  7.9 MiB]                                                
Operation completed over 1 objects/7.9 MiB.                                      
mkdir: cannot create directory ‘java’: File exists
Copying gs://t5_training/t5-data/code_data/codesearchnet/java/test.tsv...
/ [1 files][  5.8 MiB/  5.8 MiB]                                                
Operation completed over 1 objects/5.8 MiB.                                      
mkdir: cannot create directory ‘javascript’: File exists
Copying gs://t5_training/t5-data/code_data/codesearchnet/javascript/test.tsv...
/ [1 files][  1.8 MiB/  1.8 MiB]                                                
Operation completed over 1 objects/1.8 MiB.                                      
mkdir: cannot create directory ‘go’: File exists
Copying gs://t5_training/t5-data/code_data/codesearchnet/go/test.tsv...

In [None]:
import tensorflow.compat.v1 as tf
# question_1 = "Emerin is a nuclear membrane protein which is missing or defective in Emery-Dreifuss muscular dystrophy (EDMD). It is one member of a family of lamina-associated proteins which includes LAP1, LAP2 and lamin B receptor (LBR). A panel of 16 monoclonal antibodies (mAbs) has been mapped to six specific sites throughout the emerin molecule using phage-displayed peptide libraries and has been used to localize emerin in human and rabbit heart. Several mAbs against different emerin epitopes did not recognize intercalated discs in the heart, though they recognized cardiomyocyte nuclei strongly, both at the rim and in intranuclear spots or channels. A polyclonal rabbit antiserum against emerin did recognize both nuclear membrane and intercalated discs but, after affinity purification against a pure-emerin band on a western blot, it stained only the nuclear membrane. These results would not be expected if immunostaining at intercalated discs were due to a product of the emerin gene and, therefore, cast some doubt upon the hypothesis that cardiac defects in EDMD are caused by absence of emerin from intercalated discs. Although emerin was abundant in the membranes of cardiomyocyte nuclei, it was absent from many non-myocyte cells in the heart. This distribution of emerin was similar to that of lamin A, a candidate gene for an autosomal form of EDMD. In contrast, lamin B1 was absent from cardiomyocyte nuclei, showing that lamin B1 is not essential for localization of emerin to the nuclear lamina. Lamin B1 is also almost completely absent from skeletal muscle nuclei. In EDMD, the additional absence of lamin B1 from heart and skeletal muscle nuclei which already lack emerin may offer an alternative explanation of why these tissues are particularly affected.." 
# question_2 = "Molecular analysis of the APC gene in 205 families: extended genotype-phenotype correlations in FAP and evidence for the role of APC amino acid changes in colorectal cancer predisposition." 
# question_3 = "Who are the 4 members of The Beatles?" 
# question_4 = "How many teeth do humans have?"

# questions = [question_2]


def _checkpoint_step(path):
  """Return the integer after the last '-' in `path`, or -1 if it is not numeric."""
  suffix = path.rsplit("-", 1)[-1]
  return int(suffix) if suffix.isdigit() else -1


# Run inference once per task, reading <lang>/predict_input.tsv and writing
# <lang>/predict_output.tsv (the checkpoint step is appended to the file name).
for t in tasks:
  lang = t[1]
  input_file = f'{lang}/predict_input.tsv'
  output_file = f'{lang}/predict_output.tsv'

  # Local files are used here. GCS alternative (disabled):
  # predict_inputs_path = os.path.join('gs://t5_training/t5-data/code_data', t[0], input_file)
  # predict_outputs_path = os.path.join('gs://t5_training/t5-data/code_data', t[0], output_dir , MODEL_SIZE, output_file)

  predict_inputs_path = input_file
  predict_outputs_path = output_file

  print(predict_inputs_path)
  print(predict_outputs_path)
  # Silence TF logging below ERROR while decoding.
  with tf_verbosity_level('ERROR'):
    model.batch_size = 8  # Min size for small model on v2-8 with parallelism 1.
    model.predict(
        input_file=predict_inputs_path,
        output_file=predict_outputs_path,
        checkpoint_steps=-1,
        # Select the most probable output token at each step.
        # vocabulary=t5.data.SentencePieceVocabulary(vocab)
        temperature=0,
    )

  # The output filename has the checkpoint step appended, so glob for it.
  # Order by the numeric step: a plain string sort would rank e.g. "-999000"
  # after "-1245000" and report the wrong file as the latest.
  prediction_files = sorted(
      tf.io.gfile.glob(predict_outputs_path + "*"), key=_checkpoint_step)
  if not prediction_files:
    raise FileNotFoundError(
        "No prediction file matching %s* was written" % predict_outputs_path)
  print("Predicted task : " + lang)
  print("\nPredictions using checkpoint %s:\n" % prediction_files[-1].split("-")[-1])
  # Printing question/answer pairs (disabled; needs a `questions` list):
  # with tf.io.gfile.GFile(prediction_files[-1]) as f:
  #   for q, a in zip(questions, f):
  #     if q:
  #       print("Q: " + q)
  #       print("A: " + a)
  #       print()

INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin


python/predict_input.tsv
python/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin


Predicted task : python

Predictions using checkpoint 1245000:

java/predict_input.tsv
java/predict_output.tsv
Predicted task : java

Predictions using checkpoint 1245000:

javascript/predict_input.tsv
javascript/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin
INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin


Predicted task : javascript

Predictions using checkpoint 1245000:

go/predict_input.tsv
go/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin


Predicted task : go

Predictions using checkpoint 1245000:

php/predict_input.tsv
php/predict_output.tsv
Predicted task : php

Predictions using checkpoint 1245000:

ruby/predict_input.tsv
ruby/predict_output.tsv


INFO:root:system_path_file_exists:gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin
ERROR:root:Path not found: gs://t5_training/models/code/codesummarization_uni_v1_1/base/operative_config.gin


Predicted task : ruby

Predictions using checkpoint 1245000:



## Scoring

In [None]:
# Scoring configuration. Every task is a [dataset, language] pair.
tasks = [
    ['codesearchnet', language]
    for language in ('python', 'java', 'javascript', 'go', 'php', 'ruby')
]
output_dir, test_file = "codesummarization_uni_v1", 'test'
# Step suffix of the <lang>/predict_output.tsv-<step> files to score.
# NOTE(review): the prediction run recorded above reports checkpoint 1245000;
# confirm 1234000 is really the checkpoint that should be scored.
checkpoint = '1234000'

In [None]:
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/evaluator.py
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/predictions.txt
!wget https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/reference.txt

--2021-04-23 00:07:44--  https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/evaluator.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6916 (6.8K) [text/plain]
Saving to: ‘evaluator.py.1’


2021-04-23 00:07:44 (49.2 MB/s) - ‘evaluator.py.1’ saved [6916/6916]

--2021-04-23 00:07:45--  https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Code-Text/code-to-text/evaluator/predictions.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 196 [text/plain]
Saving to: ‘predictions.txt.1’


20

In [None]:
!python evaluator.py reference.txt < predictions.txt
!mkdir output

Total: 5
9.554726113590661
mkdir: cannot create directory ‘output’: File exists


In [None]:
for task in tasks:
  lang = task[1]
  with open(f'{lang}/predict_output.tsv-{checkpoint}') as predict_output:
    with open(f'{lang}/actual_output.tsv') as actual_output:
      with open(f'output/{lang}_reference.txt', 'w') as reference:
        with open(f'output/{lang}_predictions.txt', 'w') as predictions:
          for idx, (line1, line2) in enumerate(zip(predict_output, actual_output)):
            line1 = line1.replace('⁇', '')
            reference.write(f'{idx}\t{line2}')
            predictions.write(f'{idx}\t{line1}')
          print(f'language: {lang}')
          !python evaluator.py output/{lang}_reference.txt < output/{lang}_predictions.txt
          print('\n')
        

language: python
Total: 14779
19.731685546321724


language: java
Total: 10861
19.079527564567012


language: javascript
Total: 3200
14.965278143124621


language: go
Total: 7989
18.899657295993357


language: php
Total: 13820
24.603038462225744


language: ruby
Total: 1199
14.037486232881045




In [None]:
# Bundle the output/ directory (per-language reference and prediction files)
# into output.zip; -r recurses into the directory.
!zip -r output.zip output

updating: output/ (stored 0%)
updating: output/ruby_predictions.txt (deflated 65%)
updating: output/python_reference.txt (deflated 68%)
updating: output/php_predictions.txt (deflated 70%)
updating: output/java_predictions.txt (deflated 72%)
updating: output/php_reference.txt (deflated 69%)
updating: output/go_reference.txt (deflated 70%)
updating: output/python_predictions.txt (deflated 71%)
updating: output/javascript_predictions.txt (deflated 68%)
updating: output/go_predictions.txt (deflated 73%)
updating: output/java_reference.txt (deflated 69%)
updating: output/javascript_reference.txt (deflated 66%)
updating: output/ruby_reference.txt (deflated 65%)


In [None]:
language: python
Total: 14882
71.75255231835018


language: java
Total: 10899
16.06278592746895


language: javascript
Total: 3254
7.793938400469148


language: go
Total: 8009
22.548576422978876


language: php
Total: 14001
21.239563344111858


language: ruby
Total: 1204
7.628737805048416


In [None]:
# Per-task classification metrics computed from label files under /content/.
# NOTE(review): this cell calls convert_RE_labels, classification_report and
# precision_recall_fscore_support, none of which are defined or imported in the
# visible part of the notebook (the latter two look like scikit-learn's) -
# confirm they are in scope before running.
checkpoint = 1245000
# Accumulators kept as module-level names; nothing in this cell updates them.
total_f1 = 0
total_precision = 0
total_recall = 0
anchor_pred_labels = []
anchor_actual_labels = []
for task in tasks:
    t = task[1]

    pred_file = os.path.join('/content/', t +'_predict_output.txt-%s'%checkpoint)
    actual_file = os.path.join('/content/', t + '_actual_output.txt')

    pred_labels = convert_RE_labels(pred_file)
    actual_labels = convert_RE_labels(actual_file)

    print("Report %s:"%t, classification_report(actual_labels, pred_labels, digits=4))
    # p, r, f are indexed by class position below.
    p, r, f, _ = precision_recall_fscore_support(y_pred=pred_labels, y_true=actual_labels)
    results = {
        "f1 score": f[1],
        "recall": r[1],
        "precision": p[1],
        # Recall of the class at index 0. That equals specificity only for a
        # two-class problem - NOTE(review): the recorded output for this cell is
        # a three-class report, where this label is misleading.
        "specificity": r[0],
    }
    print(t, results)

Report mednli:                precision    recall  f1-score   support

CONTRADICTION     0.8929    0.8966    0.8947       474
   ENTAILMENT     0.8148    0.8354    0.8250       474
      NEUTRAL     0.8391    0.8143    0.8266       474

     accuracy                         0.8488      1422
    macro avg     0.8489    0.8488    0.8488      1422
 weighted avg     0.8489    0.8488    0.8488      1422

mednli {'f1 score': 0.825, 'recall': 0.8354430379746836, 'precision': 0.8148148148148148, 'specificity': 0.8966244725738397}


In [None]:
euadr_10 {'f1 score': 0.9923076923076923, 'recall': 0.9847328244274809, 'precision': 1.0, 'specificity': 1.0}


In [None]:
import tensorflow.compat.v1 as tf
# Single-prompt smoke test: write the prompt to a file under MODEL_DIR, decode
# it with the model, then print the prompt next to the decoded line.
question_1 = "Where is the Google headquarters located?"
# question_2 = "What is the most populous country in the world?"
# question_3 = "Who are the 4 members of The Beatles?"
# question_4 = "How many teeth do humans have?"

questions = [question_1]

# Timestamped file names keep repeated runs from colliding.
now = time.time()
predict_inputs_path = os.path.join(
    MODEL_DIR, "predict_inputs_{:d}.txt".format(int(now)))
predict_outputs_path = os.path.join(
    MODEL_DIR, "predict_outputs_{:d}.txt".format(int(now)))

# Manual preprocessing: each prompt is lower-cased and given the
# "chemprot_re: " task prefix, one prompt per line.
with tf.io.gfile.GFile(predict_inputs_path, "w") as f:
  for question in questions:
    f.write("chemprot_re: {}\n".format(question.lower()))

# Only ERROR-level TF logs are shown while decoding.
with tf_verbosity_level('ERROR'):
  model.batch_size = 8  # Min size for small model on v2-8 with parallelism 1.
  model.predict(
      input_file=predict_inputs_path,
      output_file=predict_outputs_path,
      temperature=0,  # Select the most probable output token at each step.
  )

# The checkpoint step is appended to the output file name, so glob for it and
# take the last match in sorted order.
prediction_files = sorted(tf.io.gfile.glob(predict_outputs_path + "*"))
latest_predictions = prediction_files[-1]
print("\nPredictions using checkpoint %s:\n" % latest_predictions.rsplit("-", 1)[-1])
with tf.io.gfile.GFile(latest_predictions) as f:
  for q, a in zip(questions, f):
    if not q:
      continue
    print("Q: " + q)
    print("A: " + a)
    print()


Predictions using checkpoint 1242600:

Q: Where is the Google headquarters located?
A: 




In [None]:
!pip install gdown
!gdown https://drive.google.com/uc?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h
!unzip Cleaned_CodeSearchNet.zip
!rm Cleaned_CodeSearchNet.zip

Downloading...
From: https://drive.google.com/uc?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h
To: /content/Cleaned_CodeSearchNet.zip
381MB [00:03, 122MB/s]
Archive:  Cleaned_CodeSearchNet.zip
   creating: CodeSearchNet/
   creating: CodeSearchNet/ruby/
  inflating: CodeSearchNet/ruby/valid.jsonl  
  inflating: CodeSearchNet/ruby/train.jsonl  
  inflating: CodeSearchNet/ruby/test.jsonl  
   creating: CodeSearchNet/php/
  inflating: CodeSearchNet/php/valid.jsonl  
  inflating: CodeSearchNet/php/train.jsonl  
  inflating: CodeSearchNet/php/test.jsonl  
   creating: CodeSearchNet/python/
  inflating: CodeSearchNet/python/valid.jsonl  
  inflating: CodeSearchNet/python/train.jsonl  
  inflating: CodeSearchNet/python/test.jsonl  
   creating: CodeSearchNet/java/
  inflating: CodeSearchNet/java/valid.jsonl  
  inflating: CodeSearchNet/java/train.jsonl  
  inflating: CodeSearchNet/java/test.jsonl  
   creating: CodeSearchNet/go/
  inflating: CodeSearchNet/go/valid.jsonl  
  inflating: CodeSearchNet/go