[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hardik-vala/stable-lm-3b-24-hours-hackathon/blob/main/stablelm_3b_24_hours_hackathon.ipynb)

# Install Prerequisites

In [1]:
# Report the GPU attached to this runtime (the recorded run below shows a single Tesla T4).
!nvidia-smi

Fri Nov 10 00:58:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -U pip
!pip install accelerate bitsandbytes cohere llama-index openai sentence_transformers torch transformers

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl.metadata (9.8 kB)
Collecting cohere
  Downloading cohere-4.34-py3-none-any.whl.metadata (5.3 kB)
Collecting llama-index
  Downloading llama_index-0.8.66-py3-none-any.whl.metadata (7.8 kB)
Collecting openai
  Downloading openai-1.2.2-py3-none-any.whl.metadata (16 kB)
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86

# Setup

In [3]:
from IPython.display import Markdown, display

def hr():
    """Render a Markdown horizontal rule (``---``) in the cell output."""
    rule = Markdown('---')
    display(rule)
def cprint(msg: str, color: str = "blue", **kwargs) -> None:
    """Print ``msg`` wrapped in the ANSI escape sequence for ``color``.

    Args:
        msg: Text to print.
        color: One of "blue", "red", "green", "yellow", "purple", "cyan".
        **kwargs: Forwarded unchanged to the built-in ``print``.

    Raises:
        ValueError: if ``color`` is not one of the supported names.
    """
    reset = "\033[0m"
    ansi_prefix = dict(
        blue="\033[34m",
        red="\033[31m",
        green="\033[32m",
        yellow="\033[33m",
        purple="\033[35m",
        cyan="\033[36m",
    )

    # Validate the color before building the output string.
    prefix = ansi_prefix.get(color)
    if prefix is None:
        raise ValueError(f"Invalid info color: `{color}`")

    colored = prefix + msg + reset
    print(colored, **kwargs)

In [18]:
import os

from llama_index import (
    KeywordTableIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
)
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate
from openai import OpenAI
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    StoppingCriteria,
    StoppingCriteriaList
)

In [9]:
import os
from getpass import getpass


def _load_secret(env_var: str) -> str:
    """Return the credential named ``env_var`` without storing it in the notebook.

    The value is taken from the process environment when set; otherwise the
    user is prompted with hidden input. The value is then written back to
    ``os.environ`` so that processes started from this kernel can read it.

    Raises:
        ValueError: if no value is available from either source.
    """
    secret = os.environ.get(env_var) or getpass(f"{env_var}: ")
    if not secret:
        raise ValueError(f"{env_var} is required but was not provided")
    os.environ[env_var] = secret
    return secret


# SECURITY: credentials must never be saved in the notebook source. Any token
# that was previously pasted into this cell is exposed and should be revoked.
HUGGING_FACE_ACCESS_TOKEN = _load_secret("HUGGING_FACE_ACCESS_TOKEN")
OPENAI_API_KEY = _load_secret("OPENAI_API_KEY")

openai_client = OpenAI(
    api_key=OPENAI_API_KEY,
)

# Fetch Data

In [15]:
def fetch_document_data(prompt, client=None, model="gpt-4", temperature=0.5):
  """Ask an OpenAI chat model to write a FAQ (questions and answers) about `prompt`.

  Args:
    prompt: Topic text appended to the fixed FAQ instruction.
    client: Object exposing `chat.completions.create(...)`. Defaults to the
      notebook-level `openai_client`, looked up at call time.
    model: Chat model name passed to the API.
    temperature: Sampling temperature passed to the API.

  Returns:
    The first choice's message content with surrounding whitespace stripped.

  Raises:
    ValueError: if the response has no choices or the first choice has no
      text content.
  """
  if client is None:
    client = openai_client

  response = client.chat.completions.create(
    model=model,
    messages=[
      {
          "role": "user",
          "content": "Generate a FAQ with both questions and answers from this prompt: " + prompt
      }
    ],
    temperature=temperature,
  )

  if not response.choices:
    raise ValueError("The chat completion returned no choices")

  # The content can be absent; fail with a clear message instead of an
  # AttributeError on `.strip()`.
  content = response.choices[0].message.content
  if content is None:
    raise ValueError("The chat completion returned no text content")
  return content.strip()

# Generate the FAQ text; a later cell writes `data` to disk and indexes it.
# NOTE(review): "envelope" is probably meant to be the verb "envelop". It is
# left unchanged because the string is part of the prompt sent to the model.
data = fetch_document_data("Navigating a wildfire that's going to envelope my region.")
data



# Prepare Model

In [23]:
model_name = "stabilityai/stablelm-3b-4e1t"
# model_name = "stabilityai/stablelm-tuned-alpha-7b"

# System prompt written for the chat-tuned StableLM checkpoints.
TUNED_SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# Apply the <|SYSTEM|>/<|USER|>/<|ASSISTANT|> chat markup only to chat-tuned
# checkpoints (name contains "tuned"). A base checkpoint gets the query text
# unchanged, matching the rule used by the direct-generation code in this
# notebook.
if "tuned" in model_name:
    system_prompt = TUNED_SYSTEM_PROMPT
    query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
else:
    system_prompt = ""
    query_wrapper_prompt = PromptTemplate("{query_str}")

# LlamaIndex wrapper around the Hugging Face model and tokenizer.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.5, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=model_name,
    model_name=model_name,
    device_map="auto",
    # NOTE(review): same ids as the StopOnTokens list used elsewhere in this
    # notebook; confirm they are valid for the selected tokenizer.
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={
        "max_length": 4096,
        "token": HUGGING_FACE_ACCESS_TOKEN,
    },
    model_kwargs={
        "torch_dtype": "auto",
        "load_in_8bit": False,
        "offload_folder": "./offload",
        "token": HUGGING_FACE_ACCESS_TOKEN,
        "trust_remote_code": True
    }
)

# Service context that routes LlamaIndex LLM calls to `llm` and chunks
# documents with chunk_size=1024.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
)



Downloading (…)okenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Build Index

In [24]:
def write_data_to_file(data, directory="data", filename="data.txt"):
  """Write `data` to `<directory>/<filename>` as UTF-8 text, replacing any existing file.

  Args:
    data: Text to write.
    directory: Target directory; created if missing. Defaults to "data".
    filename: File name inside `directory`. Defaults to "data.txt".
  """
  # exist_ok avoids the race between checking for the directory and creating it.
  os.makedirs(directory, exist_ok=True)

  file_path = os.path.join(directory, filename)

  # Explicit encoding: the generated text may contain non-ASCII characters and
  # the platform default encoding is not guaranteed to be UTF-8.
  with open(file_path, "w", encoding="utf-8") as file:
    file.write(data)

# Persist the generated FAQ text under the "data" directory.
write_data_to_file(data)

# Read the contents of the "data" directory back as LlamaIndex documents.
documents = SimpleDirectoryReader("data").load_data()

# Build a keyword-table index over the documents using the service context
# configured with the StableLM-backed `llm`.
# NOTE(review): the recorded output of this cell shows a generation warning,
# which suggests the index build calls the LLM — expect this step to be slow.
index = KeywordTableIndex.from_documents(
    documents, service_context=service_context
)

# Query interface used by the "Generate Response" cell.
query_engine = index.as_query_engine()

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
[nltk_data] Downloading package stopwords to /tmp/llama_index...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Generate Response

In [26]:
# Ask the index-backed query engine a question about the generated FAQ.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# no answer was captured.
response = query_engine.query(
    "How can I stay up-to-date with the progress of the wildfire?"
)
response

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


KeyboardInterrupt: ignored

# Deploy App

In [27]:
import locale


def getpreferredencoding(do_setlocale: bool = True) -> str:
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    Args:
        do_setlocale: Accepted for signature compatibility; not used.

    Returns:
        The string "UTF-8".
    """
    del do_setlocale  # intentionally ignored
    return "UTF-8"


# Replace the stdlib function so later lookups through `locale` get UTF-8.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!pip install -q streamlit
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared-linux-amd64

In [None]:
%%writefile app.py

import os

from llama_index import (
    KeywordTableIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
)
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate
from openai import OpenAI
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    StoppingCriteria,
    StoppingCriteriaList
)

import streamlit as st

def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Set the {name} environment variable before starting the app")
    return value


# SECURITY: credentials are read from the environment so they are never
# written into app.py. Any token that was previously hardcoded here is exposed
# and should be revoked.
HUGGING_FACE_ACCESS_TOKEN = _require_env("HUGGING_FACE_ACCESS_TOKEN")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

openai_client = OpenAI(
    api_key=OPENAI_API_KEY,
)

model_name = "stabilityai/stablelm-3b-4e1t"
# model_name = "stabilityai/stablelm-tuned-alpha-7b"

# System prompt written for the chat-tuned StableLM checkpoints.
TUNED_SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# Apply the <|SYSTEM|>/<|USER|>/<|ASSISTANT|> chat markup only to chat-tuned
# checkpoints (name contains "tuned"). A base checkpoint gets the query text
# unchanged.
if "tuned" in model_name:
    system_prompt = TUNED_SYSTEM_PROMPT
    query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
else:
    system_prompt = ""
    query_wrapper_prompt = PromptTemplate("{query_str}")

# LlamaIndex wrapper around the Hugging Face model and tokenizer.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.5, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=model_name,
    model_name=model_name,
    device_map="auto",
    # NOTE(review): confirm these stop ids are valid for the selected tokenizer.
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={
        "max_length": 4096,
        "token": HUGGING_FACE_ACCESS_TOKEN,
    },
    model_kwargs={
        "torch_dtype": "auto",
        "load_in_8bit": False,
        "offload_folder": "./offload",
        "token": HUGGING_FACE_ACCESS_TOKEN,
        "trust_remote_code": True
    }
)

# Service context that routes LlamaIndex LLM calls to `llm` and chunks
# documents with chunk_size=1024.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
)

def fetch_document_data(prompt, client=None, model="gpt-4", temperature=0.5):
  """Ask an OpenAI chat model to write a FAQ (questions and answers) about `prompt`.

  Args:
    prompt: Topic text appended to the fixed FAQ instruction.
    client: Object exposing `chat.completions.create(...)`. Defaults to the
      module-level `openai_client`, looked up at call time.
    model: Chat model name passed to the API.
    temperature: Sampling temperature passed to the API.

  Returns:
    The first choice's message content with surrounding whitespace stripped.

  Raises:
    ValueError: if the response has no choices or the first choice has no
      text content.
  """
  if client is None:
    client = openai_client

  response = client.chat.completions.create(
    model=model,
    messages=[
      {
          "role": "user",
          "content": "Generate a FAQ with both questions and answers from this prompt: " + prompt
      }
    ],
    temperature=temperature,
  )

  if not response.choices:
    raise ValueError("The chat completion returned no choices")

  # The content can be absent; fail with a clear message instead of an
  # AttributeError on `.strip()`.
  content = response.choices[0].message.content
  if content is None:
    raise ValueError("The chat completion returned no text content")
  return content.strip()

# SCRATCH

In [None]:
# Select "big model inference" parameters
# NOTE(review): torch_dtype is not read in this cell — the line that used it is
# commented out below and "auto" is passed instead.
torch_dtype = "float16" # ["float16", "bfloat16", "float"]
load_in_8bit = False
device_map = "auto"

# Load the tokenizer for `model_name` directly with transformers, passing the
# Hugging Face access token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HUGGING_FACE_ACCESS_TOKEN,
)
# Load the causal-LM weights with the settings chosen above; "./offload" is
# passed as the offload folder.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # torch_dtype=getattr(torch, torch_dtype),
    torch_dtype="auto",
    load_in_8bit=load_in_8bit,
    device_map=device_map,
    offload_folder="./offload",
    token=HUGGING_FACE_ACCESS_TOKEN,
    trust_remote_code=True
)

In [None]:
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that ends generation once a stop token is produced.

    Only the first sequence in the batch (``input_ids[0]``) is inspected.

    Args:
        stop_ids: Token ids that end generation. Defaults to the ids this
            criterion has always used, so ``StopOnTokens()`` behaves as before.
    """

    DEFAULT_STOP_IDS = (50278, 50279, 50277, 1, 0)

    def __init__(self, stop_ids=DEFAULT_STOP_IDS):
        super().__init__()
        # Stored as a set so each generation step needs one membership test.
        self.stop_ids = frozenset(int(token_id) for token_id in stop_ids)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the most recent token of the first sequence is a stop id."""
        # Convert once to a Python int instead of comparing the tensor to each id.
        last_token = int(input_ids[0][-1])
        return last_token in self.stop_ids

# Sampling args (read as globals by generate_text)
max_new_tokens = 128 # min:32.0, max:3072.0
temperature = 0.7 # min:0.0, max:1.25
top_k = 0 # NOTE(review): the earlier "min:0.0, max:1.0" annotation looks wrong — top_k is a token count, not a probability; confirm the intended range
top_p = 0.9 # min:0.0, max:1.0
do_sample = True

def generate_text(prompt):
    """Sample a completion for `prompt` from the notebook-level `model`.

    Uses the notebook-level `tokenizer` and sampling settings
    (`max_new_tokens`, `temperature`, `top_k`, `top_p`, `do_sample`) and stops
    according to `StopOnTokens`.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens
        are sliced off), with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # The result is not reassigned, so this relies on `.to` updating `encoded`.
    encoded.to(model.device)

    sampling = dict(
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
    )
    generated = model.generate(**encoded, **sampling)

    # Drop the prompt so only the continuation is decoded.
    prompt_len = encoded['input_ids'].size(1)
    new_tokens = generated[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

user_prompt = "Can you write a song about a pirate at sea?"
# Only chat-tuned checkpoints (name contains "tuned") get the system prompt and
# the <|USER|>/<|ASSISTANT|> markers; other models receive the bare user prompt.
if "tuned" in model_name:
    # NOTE(review): the continuation lines of this literal are indented, so the
    # leading spaces are part of the prompt text.
    system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
    - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
    - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
    - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
    - StableLM will refuse to participate in anything that could harm a human.
    """
    prompt = f"{system_prompt}<|USER|>{user_prompt}<|ASSISTANT|>"
else:
    prompt = user_prompt

completion = generate_text(prompt)

# Display the prompt in the default color followed by the completion in green.
print(user_prompt + " ", end="")
cprint(completion, color="green")

UTF-8
UTF-8


In [None]:
!pip install -q streamlit
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared-linux-amd64

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/4.8 MB[0m [31m30.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.8/4.8 MB[0m [31m77.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m115.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[0m--2023-11-08 04:48:59-

In [None]:
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList

import os

model_name = "stabilityai/stablelm-3b-4e1t"

load_in_8bit = False
device_map = "auto"

# SECURITY: the Hugging Face token is read from the environment so it is never
# written into app.py. Any token that was previously hardcoded here is exposed
# and should be revoked.
HUGGING_FACE_ACCESS_TOKEN = os.environ.get("HUGGING_FACE_ACCESS_TOKEN")
if not HUGGING_FACE_ACCESS_TOKEN:
    raise RuntimeError(
        "Set the HUGGING_FACE_ACCESS_TOKEN environment variable before starting the app"
    )

# Load the tokenizer and causal-LM weights for `model_name`.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HUGGING_FACE_ACCESS_TOKEN,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    load_in_8bit=load_in_8bit,
    device_map=device_map,
    offload_folder="./offload",
    token=HUGGING_FACE_ACCESS_TOKEN,
    trust_remote_code=True
)

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that ends generation once a stop token is produced.

    Only the first sequence in the batch (``input_ids[0]``) is inspected.

    Args:
        stop_ids: Token ids that end generation. Defaults to the ids this
            criterion has always used, so ``StopOnTokens()`` behaves as before.
    """

    DEFAULT_STOP_IDS = (50278, 50279, 50277, 1, 0)

    def __init__(self, stop_ids=DEFAULT_STOP_IDS):
        super().__init__()
        # Stored as a set so each generation step needs one membership test.
        self.stop_ids = frozenset(int(token_id) for token_id in stop_ids)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the most recent token of the first sequence is a stop id."""
        # Convert once to a Python int instead of comparing the tensor to each id.
        last_token = int(input_ids[0][-1])
        return last_token in self.stop_ids

# Fixed demo prompt; the app does not read any user input.
user_prompt = "Can you write a song about a pirate at sea?"
if "tuned" in model_name:
    # Add system prompt for chat tuned models
    # NOTE(review): the continuation lines of this literal are indented, so the
    # leading spaces are part of the prompt text.
    system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
    - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
    - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
    - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
    - StableLM will refuse to participate in anything that could harm a human.
    """
    prompt = f"{system_prompt}<|USER|>{user_prompt}<|ASSISTANT|>"
else:
    # Other checkpoints receive the bare user prompt.
    prompt = user_prompt

# Sampling args
max_new_tokens = 128
temperature = 0.7
top_k = 0
top_p = 0.9
do_sample = True

# Create `generate` inputs
inputs = tokenizer(prompt, return_tensors="pt")
# The result is not reassigned, so this relies on `.to` updating `inputs`.
inputs.to(model.device)

# Generate
tokens = model.generate(
    **inputs,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    do_sample=do_sample,
    pad_token_id=tokenizer.eos_token_id,
    stopping_criteria=StoppingCriteriaList([StopOnTokens()])
)

# Extract out only the completion tokens
completion_tokens = tokens[0][inputs['input_ids'].size(1):]
completion = tokenizer.decode(completion_tokens, skip_special_tokens=True)
# Emit the completion both to stdout and to the Streamlit page.
print(completion)
st.write(completion)


Writing app.py


In [None]:
# Start a cloudflared tunnel to http://localhost:8501 in the background; the
# recorded output shows nohup appending its log to nohup.out.
# NOTE(review): 8501 is presumably the port the Streamlit app serves on — confirm.
!nohup /content/cloudflared-linux-amd64 tunnel --url http://localhost:8501 &

nohup: appending output to 'nohup.out'


In [None]:
# Print the first trycloudflare.com URL found in nohup.out.
!grep -o 'https://.*\.trycloudflare.com' nohup.out | head -n 1 | xargs -I {} echo "Your tunnel url {}"

Your tunnel url https://experiment-build-new-fifth.trycloudflare.com


In [None]:
# Launch the Streamlit app in the background, sending its output to /content/logs.txt.
!streamlit run /content/app.py &>/content/logs.txt &

## License (Apache 2.0)

Copyright (c) 2023 by [StabilityAI LTD](https://stability.ai/)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.