In [1]:
# Detect Google Colab by probing for its `google.colab` package.
try:
  from google.colab import drive  # imported only as a probe
  IN_COLAB=True
except ImportError:
  # Catch only the failed import: a bare `except` would also swallow
  # KeyboardInterrupt and unrelated errors.
  IN_COLAB=False

if IN_COLAB:
  print("We're running Colab")

In [2]:
import tensorflow as tf

print("Running TensorFlow version ",tf.__version__)

# Parse tensorflow version
import re

# Raw string: "\." inside a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
version_match = re.match(r"([0-9]+)\.([0-9]+)", tf.__version__)
if version_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError(f"Cannot parse TensorFlow version {tf.__version__!r}")

tf_major, tf_minor = int(version_match.group(1)) , int(version_match.group(2))
print("Version {v:d}, minor {m:d}".format(v=tf_major, m=tf_minor) )

Running TensorFlow version  2.0.0
Version 2, minor 0


In [3]:
# List the GPUs visible to TensorFlow; an empty list selects the CPU branch.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    print('Using GPU')
    # Memory growth must be configured identically on every GPU; setting it
    # on the first device only fails on multi-GPU machines.
    for gpu_device in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
else:
    print('Using CPU')

Using CPU


In [4]:
if IN_COLAB:
  !pip install datasets evaluate transformers[sentencepiece]
  !pip install gradio

In [5]:
import gradio as gr

ModuleNotFoundError: No module named 'gradio'

# Use an Inference end-point
- Advantages:
  - free
  - does not use *local* RAM so can run big models
- [paid hosting](https://huggingface.co/pricing#endpoints)
  - don't get charged for a *paused* end-point
- [guide](https://huggingface.co/docs/inference-endpoints/index)

In [None]:
import requests

from pathlib import Path



def get_API_token(token_file="/content/hf.token"):
    """Read the HuggingFace API token stored in a text file.

    Args:
        token_file: Path of the file holding the token. A leading ``~`` is
            expanded to the user's home directory.

    Returns:
        The file contents with trailing whitespace (including the final
        newline) removed, or ``None`` if the file does not exist.
    """
    p = Path(token_file).expanduser()
    if not p.exists():
        print(f"Token file {p} not found.")
        return None

    # Open the expanded path (the same one that was just checked) so that
    # "~/..." locations work.
    with open(p, 'r') as fp:
        token = fp.read()

    # Remove trailing newline
    return token.rstrip()

# Load the token once; it is None when the token file is missing.
API_TOKEN = get_API_token()


In [None]:
# Dictionary keys used in the request payload and in the responses.
input_key, gen_text_key, error_key = "inputs", "generated_text", "error"

# Model identifiers, addressed by a short size label.
models = dict(
    small="EleutherAI/gpt-neo-1.3B",
    big="EleutherAI/gpt-neox-20b",
)

In [None]:
import time

# Attach the bearer token only when one was loaded; otherwise the literal
# string "Bearer None" would be sent as the credential.
headers = {"Authorization": f"Bearer {API_TOKEN}"} if API_TOKEN else {}

def query(payload, model_string, timeout=120):
  """POST a JSON payload to the hosted inference end-point of one model.

  Args:
    payload: JSON-serialisable request body.
    model_string: Model identifier appended to the API URL.
    timeout: Seconds to wait for the server before the request is aborted.

  Returns:
    The decoded JSON response. If the response body is not valid JSON, a
    dict holding a message under `error_key` is returned instead.

  Raises:
    requests.exceptions.RequestException: on connection failure or timeout.
  """
  API_URL = f"https://api-inference.huggingface.co/models/{model_string}"

  # Without a timeout the call can block forever on an unresponsive server.
  response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)

  try:
    return response.json()
  except ValueError:
    # Body was not JSON; report it in the same dict form callers treat as an error.
    return {error_key: f"HTTP {response.status_code}: {response.text[:200]}"}

def execute_query(q, model_string):
  """Send prompt `q` to `model_string` and return the text to display.

  Args:
    q: Prompt text, sent under `input_key`.
    model_string: Model identifier passed through to `query`.

  Returns:
    The `gen_text_key` entry of the first result on success, otherwise a
    string starting with "Error: ".
  """
  output = query({  input_key: q }, model_string)

  # Successful output is a list; error output is a dict
  if isinstance(output, dict):
    # Fall back to the whole dict if it carries no error_key entry.
    return f"Error: {output.get(error_key, output)}"

  if not output:
    # An empty result list has no first element to read.
    return "Error: empty response"

  return output[0][gen_text_key]



# Labelled examples for the few-shot prompt.
# The first four examples appear twice, followed by one extra positive example.
_core_exemplars = [
    "this movie was great: positive",
    "one of the best films of the year: positive",
    "just plain awful: negative",
    "I would not see this one again: negative",
]
exemplars = _core_exemplars * 2 + ["I love this film: positive"]

sep = " \n "
exemplar_string = sep.join(exemplars)

# The final line has no label after the colon.
few_shot_string = sep.join(
    [exemplar_string, "I've heard not so great things about this one:"]
)

q = few_shot_string

# q = "Can you please let us know more details about your "

# Can run a very large model since execution is on remote end-point
model_string = models["big"]

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted.
time_start = time.perf_counter()

res = execute_query(q, model_string)

time_end = time.perf_counter()

print(f"[{time_end-time_start:3.2f} seconds, using {model_string}]\n {res}")


# Use a pipeline
- runs locally
- so it can only use a model small enough to fit in local RAM

In [None]:
# Select the "small" entry of `models` for the cells that follow.
model_string = models["small"]


In [None]:
from transformers import pipeline

# Generation defaults: number of completions to return, and the length setting.
num_return, len_return = 3, 30

# Text-generation pipeline for the model selected in `model_string`.
generator = pipeline(task='text-generation', model=model_string)


In [None]:
# perf_counter() is the clock intended for measuring elapsed intervals.
time_start = time.perf_counter()

q = few_shot_string

# q = "Hello, I'm a language model"
# max_length counts the prompt tokens as well, and the few-shot prompt is
# longer than 30 tokens; max_new_tokens bounds only the generated part.
# It also uses len_return instead of a duplicated literal.
# Several return sequences need a non-greedy strategy, so sampling is explicit.
resp = generator(q,
                 max_new_tokens=len_return,
                 num_return_sequences=num_return,
                 do_sample=True)

time_end = time.perf_counter()

print(f"[{time_end-time_start:3.2f} seconds, using {model_string}]")

for gen in resp:
  print(gen[gen_text_key])
  print("\n")


# Interactive using Gradio
- [components doc](https://gradio.app/docs/)

**Warning**

When using a few-shot prompt, smaller models seem to be particularly sensitive to extra blanks at the end of the final line (the one *without* the answer), so avoid trailing whitespace there.

In [None]:
import gradio as gr

from transformers import pipeline

# Use generator from pipeline above

def generate(prompt, num_return, len_return):
    """Generate completions for `prompt` with the module-level `generator`.

    Args:
        prompt: Text to complete.
        num_return: Number of completions to produce.
        len_return: Maximum number of newly generated tokens per completion.

    Returns:
        The generated texts joined by blank lines.
    """
    # max_new_tokens limits only the generated part; max_length would also
    # count the prompt tokens, so a prompt longer than the limit leaves no
    # room for output.
    # int() guards against the numeric inputs arriving as floats.
    resp = generator(
        prompt,
        max_new_tokens=int(len_return),
        num_return_sequences=int(num_return),
        do_sample=True,  # several return sequences need a non-greedy strategy
    )

    # Create output string
    return "\n\n".join(r[gen_text_key] for r in resp)

# Input widgets, in the order of generate()'s parameters.
prompt_input = gr.Textbox(
    type="text",
    value=few_shot_string,
    label="Type your input here:",
    show_label=True,
)
count_input = gr.Number(
    value=num_return,
    precision=0,
    label="# of outputs to generate",
    show_label=True,
)
length_input = gr.Number(
    value=len_return,
    precision=0,
    label="length of output",
    show_label=True,
)

# Single text box that receives the string returned by generate().
result_output = gr.Textbox(type="text", label="Output: ", show_label=True)

iface = gr.Interface(
    fn=generate,
    inputs=[prompt_input, count_input, length_input],
    outputs=[result_output],
    title=f"Text completion using {model_string}",
)




In [None]:
# Start the Gradio app with sharing and debug mode both enabled.
iface.launch(debug=True, share=True)