In [1]:
try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

if IN_COLAB:
  print("We're running Colab")

We're running Colab


In [2]:
import tensorflow as tf

print("Running TensorFlow version ",tf.__version__)

# Parse tensorflow version
import re

version_match = re.match("([0-9]+)\.([0-9]+)", tf.__version__)
tf_major, tf_minor = int(version_match.group(1)) , int(version_match.group(2))
print("Version {v:d}, minor {m:d}".format(v=tf_major, m=tf_minor) )

Running TensorFlow version  2.12.0
Version 2, minor 12


In [3]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    print('Using GPU')
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)
else:
    print('Using CPU')

Using CPU


In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.2/224.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.0 tokenizers-0.13.3 transformers-4.28.1


# Use an Inference end-point
- Advange:
  - free
  - does not use *local* RAM so can run big models
- [paid hosting](https://huggingface.co/pricing#endpoints)
  - don't get charged for a *paused* end-point
- [guide](https://huggingface.co/docs/inference-endpoints/index)

In [5]:
import requests

from pathlib import Path



def get_API_token(token_file="/content/hf.token"):
    # Check for file containing API token to HuggingFace
    p = Path(token_file).expanduser()
    if not p.exists():
      print(f"Token file {p} not found.")
      return

    with open(token_file, 'r') as fp:
        token = fp.read()
        
    # Remove trailing newline
    token = token.rstrip()
        
    return token

API_TOKEN=get_API_token();


In [6]:
gen_text_key = "generated_text"
input_key = "inputs"
error_key = "error"

models = { "small": "EleutherAI/gpt-neo-1.3B",
           "big":   "EleutherAI/gpt-neox-20b"
}

In [7]:
import time

headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query(payload, model_string):
  API_URL = f"https://api-inference.huggingface.co/models/{model_string}"

  response = requests.post(API_URL, headers=headers, json=payload)
  return response.json()

def execute_query(q, model_string):
  output = query({  input_key: q }, model_string)

  # Successful output is a list; error output is a dict
  if type(output) is dict:
    out = f"Error: {output[error_key]}"
  else:  
    out = output[0][gen_text_key] 

  return out



exemplars = [ "this movie was great: positive",
             "one of the best films of the year: positive",
             "just plain awful: negative",
             "I would not see this one again: negative",
             "this movie was great: positive",
             "one of the best films of the year: positive",
             "just plain awful: negative",
             "I would not see this one again: negative",
             "I love this film: positive"
]

sep = " \n "
exemplar_string = sep.join(exemplars)
few_shot_string =  exemplar_string + sep + "I've heard not so great things about this one:"

q = few_shot_string   

# q = "Can you please let us know more details about your " 

# Can run a very large model since execution is on remote end-point
model_string = models["big"]

time_start = time.time()

res = execute_query(q, model_string)

time_end = time.time()

print(f"[{time_end-time_start:3.2f} seconds, using {model_string}]\n {res}")


[0.88 seconds, using EleutherAI/gpt-neox-20b]
 this movie was great: positive 
 one of the best films of the year: positive 
 just plain awful: negative 
 I would not see this one again: negative 
 this movie was great: positive 
 one of the best films of the year: positive 
 just plain awful: negative 
 I would not see this one again: negative 
 I love this film: positive 
 I've heard not so great things about this one: negative 
 I would not see this one again: negative 
 I love this film: positive 
 I


# Use a pipeline
- runs locally
- so can only use model small enough to fit in RAM

In [8]:
model_string = models["small"]


In [9]:
from transformers import pipeline

num_return = 3
len_return = 30


generator = pipeline('text-generation', model = model_string)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [10]:
time_start = time.time()

q = few_shot_string

# q = "Hello, I'm a language model"
resp = generator(q, max_length = 30, num_return_sequences=num_return)

time_end = time.time()

print(f"[{time_end-time_start:3.2f} seconds, using {model_string}]")

for i, gen in enumerate(resp):
  print(gen[gen_text_key])
  print("\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 95, but `max_length` is set to 30. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


[17.59 seconds, using EleutherAI/gpt-neo-1.3B]
this movie was great: positive 
 one of the best films of the year: positive 
 just plain awful: negative 
 I would not see this one again: negative 
 this movie was great: positive 
 one of the best films of the year: positive 
 just plain awful: negative 
 I would not see this one again: negative 
 I love this film: positive 
 I've heard not so great things about this one: poor


this movie was great: positive 
 one of the best films of the year: positive 
 just plain awful: negative 
 I would not see this one again: negative 
 this movie was great: positive 
 one of the best films of the year: positive 
 just plain awful: negative 
 I would not see this one again: negative 
 I love this film: positive 
 I've heard not so great things about this one: negative


this movie was great: positive 
 one of the best films of the year: positive 
 just plain awful: negative 
 I would not see this one again: negative 
 this movie was great: positi

# Interactive using Gradio
- [components doc](https://gradio.app/docs/)

**Warning**

When using a few-shot prompt: smaller models seem to be particularly sensitive to extra blanks at the end of the final line (the one *without* the answer)

In [11]:
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.27.0-py3-none-any.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting orjson
  Downloading orjson-3.8.10-cp39-cp39-manylinux_2_28_x86_64.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.5/140.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client>=0.1.3
  Downloading gradio_client-0.1.3-py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.2/286.2 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting semantic-version
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.

In [12]:
import gradio as gr

from transformers import pipeline

# Use generator from pipeline above

def generate(prompt, num_return, len_return):
    resp = generator(prompt, max_length = len_return, num_return_sequences=num_return)

    # Create output string
    out = "\n\n".join( [ r[gen_text_key] for r in  resp ])
    
    return out

iface = gr.Interface(
          generate,
          inputs=[
              gr.Textbox(type="text", 
                         value=few_shot_string,
                         label="Type your input here:", show_label=True
                         ),
              gr.Number(value=num_return,
                        precision=0,
                        label="# of outputs to generate", show_label=True
                        ),
              gr.Number(value=len_return,
                        precision=0,
                        label="length of output", show_label=True
                        ),
          ],
          outputs=[
              gr.Textbox(type="text", label="Output: ", show_label=True)
          ],
          title=f"Text completion using {model_string}"
)
        



In [None]:
iface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://d1011596016483924b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 95, but `max_length` is set to 30. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
