[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hardik-vala/stable-lm-3b-24-hours-hackathon/blob/main/stablelm_3b_24_hours_hackathon.ipynb)

# Install Prerequisites

In [1]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Fri Nov 10 02:54:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -U pip
!pip install accelerate bitsandbytes cohere llama-index openai sentence_transformers torch transformers

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl.metadata (9.8 kB)
Collecting cohere
  Downloading cohere-4.34-py3-none-any.whl.metadata (5.3 kB)
Collecting llama-index
  Downloading llama_index-0.8.66-py3-none-any.whl.metadata (7.8 kB)
Collecting openai
  Downloading openai-1.2.2-py3-none-any.whl.metadata (16 kB)
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86

# Setup

In [3]:
from IPython.display import Markdown, display

def hr():
    """Render a horizontal rule in the cell output (Markdown `---`)."""
    rule = Markdown('---')
    display(rule)
def cprint(msg: str, color: str = "blue", **kwargs) -> None:
    """Print `msg` wrapped in an ANSI colour escape sequence.

    Args:
        msg: Text to print.
        color: One of "blue", "red", "green", "yellow", "purple", "cyan".
        **kwargs: Forwarded unchanged to the built-in `print`.

    Raises:
        ValueError: If `color` is not one of the supported names.
    """
    reset = "\033[0m"
    palette = dict(
        blue="\033[34m",
        red="\033[31m",
        green="\033[32m",
        yellow="\033[33m",
        purple="\033[35m",
        cyan="\033[36m",
    )

    prefix = palette.get(color)
    if prefix is None:
        raise ValueError(f"Invalid info color: `{color}`")

    # Colour code, then the message, then the reset code so later output is unaffected.
    print("".join((prefix, msg, reset)), **kwargs)

In [4]:
import os
import uuid

from llama_index import (
    KeywordTableIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
)
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate
from openai import OpenAI
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    StoppingCriteria,
    StoppingCriteriaList
)

In [5]:
# Secrets are read from the environment (or prompted for without echo) so they
# are never saved inside the notebook. Any key that was previously committed in
# this cell must be treated as leaked and revoked with the provider.
from getpass import getpass


def _read_secret(name: str) -> str:
    """Return the secret held in environment variable `name`, prompting if it is unset.

    The value is also written back to `os.environ` so that processes started
    from this notebook inherit it.
    """
    value = os.environ.get(name) or getpass(f"{name}: ")
    os.environ[name] = value
    return value


HUGGING_FACE_ACCESS_TOKEN = _read_secret("HUGGING_FACE_ACCESS_TOKEN")
OPENAI_API_KEY = _read_secret("OPENAI_API_KEY")

openai_client = OpenAI(
    api_key=OPENAI_API_KEY,
)

# Fetch Data

In [13]:
def fetch_document_data(prompt, model="gpt-4", temperature=0.5):
  """Ask an OpenAI chat model to write a FAQ (questions and answers) about `prompt`.

  Args:
    prompt: Free-text description of the topic the FAQ should cover.
    model: Chat model name sent to the API. Defaults to "gpt-4".
    temperature: Sampling temperature sent to the API. Defaults to 0.5.

  Returns:
    The text of the first returned choice with surrounding whitespace removed,
    or an empty string when that choice carries no text.
  """
  response = openai_client.chat.completions.create(
    model=model,
    messages=[
      {
          "role": "user",
          "content": "Generate a FAQ with both questions and answers from this prompt: " + prompt
      }
    ],
    temperature=temperature,
  )
  # The message content is optional in the API response; guard before stripping.
  content = response.choices[0].message.content
  return (content or "").strip()

# Generate the FAQ text that is indexed further down, then echo it as the cell result.
faq_topic = "Navigating a wildfire that's going to envelope my region."
data = fetch_document_data(faq_topic)
data

"Q: What are the first steps to take when a wildfire threatens my region?\nA: The first steps should be to stay informed about the situation, prepare your home and belongings for possible evacuation, and create an emergency plan. This includes knowing evacuation routes, having a packed emergency kit, and ensuring all family members understand the plan.\n\nQ: How can I stay informed about the wildfire situation?\nA: Tune in to local news channels, radio stations, and follow trusted local authorities on social media for updates. You can also sign up for emergency alerts in your area.\n\nQ: What should I include in my emergency kit?\nA: Your emergency kit should include water, non-perishable food, a first-aid kit, prescription medications, important documents, cash, a battery-powered or hand-crank radio, a flashlight with extra batteries, and personal items like clothing and hygiene supplies.\n\nQ: What can I do to prepare my home for a wildfire?\nA: Remove flammable materials from around

# Prepare Model

In [14]:
model_name = "stabilityai/stablelm-3b-4e1t"
# model_name = "stabilityai/stablelm-tuned-alpha-7b"

# System prompt written for the chat-tuned StableLM checkpoints.
_tuned_system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# The <|SYSTEM|>/<|USER|>/<|ASSISTANT|> markup only applies to the "tuned"
# checkpoints. Any other checkpoint receives the query text as-is with no
# system prompt, instead of chat markup it was not trained on.
_is_tuned_model = "tuned" in model_name
system_prompt = _tuned_system_prompt if _is_tuned_model else ""
query_wrapper_prompt = PromptTemplate(
    "<|USER|>{query_str}<|ASSISTANT|>" if _is_tuned_model else "{query_str}"
)

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.5, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=model_name,
    model_name=model_name,
    device_map="auto",
    # NOTE(review): these ids look specific to the tuned-alpha tokenizer;
    # confirm which of them are meaningful for other checkpoints.
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={
        "max_length": 4096,
        "token": HUGGING_FACE_ACCESS_TOKEN,
    },
    model_kwargs={
        "torch_dtype": "auto",
        "load_in_8bit": False,
        "offload_folder": "./offload",
        "token": HUGGING_FACE_ACCESS_TOKEN,
        "trust_remote_code": True
    }
)

# Bundle the local LLM with the chunking settings used when building indexes.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
)



# Build Index

In [15]:
def write_data_to_file(data, dir_name):
  """Write `data` to `<dir_name>/data.txt`, creating the directory if needed.

  An existing `data.txt` in that directory is overwritten.

  Args:
    data: Text to write.
    dir_name: Directory that will contain `data.txt`.
  """
  # exist_ok avoids the check-then-create race of a separate os.path.exists test.
  os.makedirs(dir_name, exist_ok=True)

  file_path = os.path.join(dir_name, "data.txt")

  # Explicit UTF-8: the text may contain non-ASCII characters and the
  # platform's default encoding is not guaranteed to be able to encode them.
  with open(file_path, "w", encoding="utf-8") as file:
    file.write(data)

# Store the generated FAQ under a fresh, randomly named directory so the
# directory reader picks up only this document.
dir_name = f"{uuid.uuid4()}"
write_data_to_file(data, dir_name)

# Load the file back as documents and build a keyword-table index with the
# service context configured above.
documents = SimpleDirectoryReader(dir_name).load_data()
index = KeywordTableIndex.from_documents(documents, service_context=service_context)

# Query interface over the index.
query_engine = index.as_query_engine()

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


OutOfMemoryError: ignored

# Generate Response

In [26]:
# Ask the indexed FAQ a question and show the answer as the cell result.
question = "How can I stay up-to-date with the progress of the wildfire?"
response = query_engine.query(question)
response

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


KeyboardInterrupt: ignored

# Deploy App

In [6]:
import locale

def getpreferredencoding(do_setlocale = True):
    """Stand-in for `locale.getpreferredencoding` that always reports UTF-8.

    `do_setlocale` is accepted so the signature matches the stdlib function;
    its value is ignored.
    """
    forced_encoding = "UTF-8"
    return forced_encoding


# Monkey-patch the stdlib function so later callers are told "UTF-8".
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [7]:
!pip install -q streamlit
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared-linux-amd64

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[0m--2023-11-10 03:05:10--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2023.10.0/cloudflared-linux-amd64 [following]

In [8]:
%%writefile app.py

import os
import time
import uuid

from llama_index import (
    KeywordTableIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
)
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate
from openai import OpenAI
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    StoppingCriteria,
    StoppingCriteriaList
)

import streamlit as st

def _require_env(name):
    """Return environment variable `name`, raising a clear error if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Set the {name} environment variable before starting the app."
        )
    return value


# Secrets come from the environment of the process that launches the app, so
# they are never written into app.py. Any key that was previously committed
# here must be treated as leaked and revoked with the provider.
HUGGING_FACE_ACCESS_TOKEN = _require_env("HUGGING_FACE_ACCESS_TOKEN")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

openai_client = OpenAI(
    api_key=OPENAI_API_KEY,
)

model_name = "stabilityai/stablelm-3b-4e1t"
# model_name = "stabilityai/stablelm-tuned-alpha-7b"

# System prompt written for the chat-tuned StableLM checkpoints.
_tuned_system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# The <|SYSTEM|>/<|USER|>/<|ASSISTANT|> markup only applies to the "tuned"
# checkpoints. Any other checkpoint receives the query text as-is with no
# system prompt, instead of chat markup it was not trained on.
_is_tuned_model = "tuned" in model_name
system_prompt = _tuned_system_prompt if _is_tuned_model else ""
query_wrapper_prompt = PromptTemplate(
    "<|USER|>{query_str}<|ASSISTANT|>" if _is_tuned_model else "{query_str}"
)


@st.cache_resource
def _load_llm_and_service_context():
    """Build the local LLM and its service context once per server process.

    Streamlit re-executes this script on every user interaction; caching the
    result keeps the model from being reloaded on each rerun.

    Returns:
        A `(llm, service_context)` tuple.
    """
    loaded_llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=512,
        generate_kwargs={"temperature": 0.5, "do_sample": True},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name=model_name,
        model_name=model_name,
        device_map="auto",
        # NOTE(review): these ids look specific to the tuned-alpha tokenizer;
        # confirm which of them are meaningful for other checkpoints.
        stopping_ids=[50278, 50279, 50277, 1, 0],
        tokenizer_kwargs={
            "max_length": 4096,
            "token": HUGGING_FACE_ACCESS_TOKEN,
        },
        model_kwargs={
            "torch_dtype": "auto",
            "load_in_8bit": False,
            "offload_folder": "./offload",
            "token": HUGGING_FACE_ACCESS_TOKEN,
            "trust_remote_code": True
        }
    )
    context = ServiceContext.from_defaults(
        chunk_size=1024,
        llm=loaded_llm,
    )
    return loaded_llm, context


llm, service_context = _load_llm_and_service_context()

def fetch_document_data(prompt, model="gpt-4", temperature=0.5):
  """Ask an OpenAI chat model to write a FAQ (questions and answers) about `prompt`.

  Args:
    prompt: Free-text description of the topic the FAQ should cover.
    model: Chat model name sent to the API. Defaults to "gpt-4".
    temperature: Sampling temperature sent to the API. Defaults to 0.5.

  Returns:
    The text of the first returned choice with surrounding whitespace removed,
    or an empty string when that choice carries no text.
  """
  response = openai_client.chat.completions.create(
    model=model,
    messages=[
      {
          "role": "user",
          "content": "Generate a FAQ with both questions and answers from this prompt: " + prompt
      }
    ],
    temperature=temperature,
  )
  # The message content is optional in the API response; guard before stripping.
  content = response.choices[0].message.content
  return (content or "").strip()

def write_data_to_file(data, dir_name):
  """Write `data` to `<dir_name>/data.txt`, creating the directory if needed.

  An existing `data.txt` in that directory is overwritten.

  Args:
    data: Text to write.
    dir_name: Directory that will contain `data.txt`.
  """
  # exist_ok avoids the check-then-create race of a separate os.path.exists test.
  os.makedirs(dir_name, exist_ok=True)

  file_path = os.path.join(dir_name, "data.txt")

  # Explicit UTF-8: the text may contain non-ASCII characters and the
  # platform's default encoding is not guaranteed to be able to encode them.
  with open(file_path, "w", encoding="utf-8") as file:
    file.write(data)

def _builtin_pilot(key, name, prompt):
    """Build the record for a pre-installed pilot: greeting message only, no query engine."""
    return {
        "key": key,
        "name": name,
        "prompt": prompt,
        "messages": [
            {"role": "assistant", "content": "How can I help you?"}
        ],
        "query_engine": None,
    }


# Initial values for each session-state entry. An entry is written only when
# it is missing, so values already stored in the session are kept on reruns.
_session_defaults = {
    "page": "home",
    "pocket_pilots": [
        _builtin_pilot(
            "wildfire_rescue_agent",
            "🔥 Wildfire Rescue Agent",
            "Provide real-time updates, evacuation guidance, safety procedures, and emergency support to help me prepare for and respond to an approaching wildfire.",
        ),
        _builtin_pilot(
            "antarctica_tour_guide",
            "🧊 Antarctica Tour Guide",
            "Information on local wildlife, historical sites, safety tips for extreme weather, and help with navigation and travel arrangements in Antarctica.",
        ),
    ],
    "create_pocket_pilot_name": "",
    "create_pocket_pilot_prompt": "",
}

for _state_key, _initial_value in _session_defaults.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

def set_page(page_name):
    """Store `page_name` in session state as the page to display."""
    setattr(st.session_state, "page", page_name)

def add_pocket_pilot(name, prompt, query_engine):
    """Register a new pocket pilot in session state and switch to its page.

    The page key is derived from `name` (lower-cased, spaces replaced by
    underscores). If that key is already used by another pilot, or equals
    the reserved "home" page, a numeric suffix is appended so every page key
    stays unique.

    Args:
        name: Display name of the pilot.
        prompt: Prompt the pilot was generated from.
        query_engine: Query engine stored with the pilot.
    """
    st.session_state.create_pocket_pilot_name = name
    st.session_state.create_pocket_pilot_prompt = prompt

    base_key = name.lower().replace(" ", "_")

    # Pages are selected by comparing the current page with each pilot's key,
    # so a duplicate key would match more than one page.
    taken_keys = {"home"} | {
        pilot["key"] for pilot in st.session_state.pocket_pilots
    }
    key = base_key
    suffix = 2
    while key in taken_keys:
        key = f"{base_key}_{suffix}"
        suffix += 1

    st.session_state.pocket_pilots.append({
        "key": key,
        "name": name,
        "prompt": prompt,
        "messages": [
            {"role": "assistant", "content": "How can I help you?"}
        ],
        "query_engine": query_engine
    })

    set_page(key)

# Sidebar: a form for creating a new pilot, followed by navigation buttons.
with st.sidebar:
    st.header("Create a new Pocket Pilot")

    create_name = st.text_input(
        "Name",
        key="create_name",
        type="default",
    )

    create_prompt = st.text_input(
        "Prompt",
        key="create_prompt",
        type="default",
    )

    if st.button("Generate"):
        # Reject blank input up front: a blank prompt would still trigger the
        # remote FAQ request, and a blank name yields an empty page key and label.
        if not create_name.strip() or not create_prompt.strip():
            st.warning("Enter both a name and a prompt to create a Pocket Pilot.")
        else:
            with st.status("Building Pocket Pilot...") as status:
                st.write("Fetching data...")
                data = fetch_document_data(create_prompt)
                dir_name = str(uuid.uuid4())
                write_data_to_file(data, dir_name)
                st.write("Building index...")
                documents = SimpleDirectoryReader(dir_name).load_data()
                index = KeywordTableIndex.from_documents(
                    documents, service_context=service_context
                )
                st.write("Preparing chat engine...")
                query_engine = index.as_query_engine()
                add_pocket_pilot(create_name, create_prompt, query_engine)
                status.update(label="Done!", state="complete", expanded=False)

    st.divider()

    if st.button("Home", use_container_width=True):
        set_page("home")

    st.header("Pocket Pilots")

    for position, pilot in enumerate(st.session_state.pocket_pilots):
        # Explicit per-position key: without one, two pilots that share a
        # display name would produce buttons with the same widget identity.
        if st.button(
            pilot["name"],
            key=f"open_pilot_{position}",
            use_container_width=True,
        ):
            set_page(pilot["key"])

# Main area: the landing page, or the chat view of the pilot whose key matches
# the current page.
if st.session_state.page != "home":
    for pilot in st.session_state.pocket_pilots:
        if pilot["key"] != st.session_state.page:
            continue

        st.title(pilot["name"])
        st.caption("Prompt: " + pilot["prompt"])

        # Replay the stored conversation.
        history = pilot["messages"]
        for past in history:
            st.chat_message(past["role"]).write(past["content"])

        user_text = st.chat_input()
        if user_text:
            history.append({"role": "user", "content": user_text})
            st.chat_message("user").write(user_text)
            with st.spinner("Thinking 🤔..."):
                # TODO: placeholder reply - the pilot's query_engine is not
                # consulted here; a fixed message is shown after a short pause.
                time.sleep(3)
                reply = {"role": "assistant", "content": "(Assistant response)"}
                history.append(reply)
                st.chat_message("assistant").write(reply["content"])
else:
    st.title("Welcome to Pocket Pilot 🧑‍✈️")
    st.caption("A platform for building specialty assistants for offline use")

    st.write("To get started, you can either,")
    st.write("1) Start chatting with one of your pocket pilots")
    st.write("2) Create a new pocket pilot with a prompt and download it to your device")


Writing app.py


In [9]:
# Start a cloudflared tunnel to http://localhost:8501 in the background;
# nohup appends its output to nohup.out.
!nohup /content/cloudflared-linux-amd64 tunnel --url http://localhost:8501 &

nohup: appending output to 'nohup.out'


In [12]:
# Print the first https://...trycloudflare.com URL found in nohup.out.
!grep -o 'https://.*\.trycloudflare.com' nohup.out | head -n 1 | xargs -I {} echo "Your tunnel url {}"

Your tunnel url https://cover-dishes-calculated-remains.trycloudflare.com


In [11]:
# Launch the Streamlit app in the background, with its output redirected to /content/logs.txt.
!streamlit run /content/app.py &>/content/logs.txt &

# SCRATCH

In [None]:
# "Big model inference" loading options
torch_dtype = "float16"  # one of: "float16", "bfloat16", "float"
load_in_8bit = False
device_map = "auto"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_ACCESS_TOKEN)

# Keyword arguments for the model loader, gathered in one place.
_model_load_options = dict(
    # The dtype is passed as "auto"; the `torch_dtype` variable above is not
    # used (alternative: torch_dtype=getattr(torch, torch_dtype)).
    torch_dtype="auto",
    load_in_8bit=load_in_8bit,
    device_map=device_map,
    offload_folder="./offload",
    token=HUGGING_FACE_ACCESS_TOKEN,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_options)

In [None]:
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token of the first sequence is a stop id."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of `input_ids[0]` equals any stop id, else False."""
        stop_ids = (50278, 50279, 50277, 1, 0)
        newest_token = input_ids[0][-1]
        return any(bool(newest_token == candidate) for candidate in stop_ids)

# Sampling settings read by generate_text below.
max_new_tokens = 128  # min:32.0, max:3072.0
temperature = 0.7  # min:0.0, max:1.25
top_k = 0  # min:0.0, max:1.0
top_p = 0.9  # min:0.0, max:1.0
do_sample = True

def generate_text(prompt):
    """Generate a completion for `prompt` and return only the newly generated text.

    Uses the module-level `tokenizer`, `model` and sampling settings.
    """
    # Tokenize and move the encoded inputs to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded.to(model.device)

    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
    )
    generated = model.generate(**encoded, **sampling_options)

    # Drop the leading prompt tokens so only the completion is decoded.
    prompt_length = encoded['input_ids'].size(1)
    new_tokens = generated[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

user_prompt = "Can you write a song about a pirate at sea?"

# The raw user prompt is the default; it is replaced by the chat-formatted
# version only when the model name contains "tuned".
prompt = user_prompt
if "tuned" in model_name:
    system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
    - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
    - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
    - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
    - StableLM will refuse to participate in anything that could harm a human.
    """
    prompt = f"{system_prompt}<|USER|>{user_prompt}<|ASSISTANT|>"

completion = generate_text(prompt)

# Show the prompt in the default colour, followed by the completion in green.
print(user_prompt + " ", end="")
cprint(completion, color="green")

In [None]:
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList

import os

model_name = "stabilityai/stablelm-3b-4e1t"

load_in_8bit = False
device_map = "auto"

# The Hugging Face token is read from the environment, never hard-coded. Any
# token that was previously committed here must be treated as leaked and revoked.
# NOTE(review): when the variable is unset this passes token=None; confirm the
# loaders then fall back to locally stored credentials as intended.
hf_token = os.environ.get("HUGGING_FACE_ACCESS_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    load_in_8bit=load_in_8bit,
    device_map=device_map,
    offload_folder="./offload",
    token=hf_token,
    trust_remote_code=True
)

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token of the first sequence is a stop id."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of `input_ids[0]` equals any stop id, else False."""
        stop_ids = (50278, 50279, 50277, 1, 0)
        newest_token = input_ids[0][-1]
        return any(bool(newest_token == candidate) for candidate in stop_ids)

user_prompt = "Can you write a song about a pirate at sea?"

# The raw user prompt is the default; chat-tuned models get the system prompt
# and role markers wrapped around it.
prompt = user_prompt
if "tuned" in model_name:
    system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
    - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
    - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
    - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
    - StableLM will refuse to participate in anything that could harm a human.
    """
    prompt = f"{system_prompt}<|USER|>{user_prompt}<|ASSISTANT|>"

# Sampling settings
max_new_tokens = 128
temperature = 0.7
top_k = 0
top_p = 0.9
do_sample = True

# Tokenize the prompt and move the encoded inputs to the model's device.
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to(model.device)

# Generate
sampling_options = dict(
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    do_sample=do_sample,
    pad_token_id=tokenizer.eos_token_id,
    stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
)
tokens = model.generate(**inputs, **sampling_options)

# Decode only the tokens that follow the prompt.
prompt_length = inputs['input_ids'].size(1)
completion_tokens = tokens[0][prompt_length:]
completion = tokenizer.decode(completion_tokens, skip_special_tokens=True)

# Emit the completion to stdout and to the Streamlit page.
print(completion)
st.write(completion)
