## Topic discovery: yelp review

This notebook shows how to load a huge LLM and perform inference using ```accelerate```. See ```0_guide``` for detail and reference.

In [1]:
# from huggingface_hub import hf_hub_download 
import torch
from datasets import load_dataset, load_from_disk
import evaluate
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from transformers import MistralForCausalLM
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model
from peft import PeftConfig, PeftModel
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from huggingface_hub import notebook_login
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, \
                        infer_auto_device_map, dispatch_model, load_checkpoint_in_model

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import re
import os
import glob

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
print(transformers.__version__)

4.35.2


In [3]:
print(torch.__version__)
print(torch.cuda.is_available())

2.3.0+cu118
True


In [4]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:41:10_Pacific_Daylight_Time_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [5]:
llm_repo_dir = 'D:/projects/LLM'
cache_dir = '/cygdrive/d/projects/LLM/.cache'
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_HOME'] = cache_dir + '/huggingface'
os.environ['XDG_CACHE_HOME'] = cache_dir
os.environ['HF_DATASETS_CACHE'] = cache_dir

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Gen AI

In [6]:
## T5-3b
#model_name = 'google-t5/t5-3b'
#model_path = cache_dir + '/models--google-t5--t5-3b/snapshots/bed96aab9ee46012a5046386105ee5fd0ac572f0'

# Mistral 7B
model_name = 'mistralai/Mistral-7B-v0.1'
model_path = cache_dir + '/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24'
offload_folder = cache_dir + '/models--mistralai--Mistral-7B-v0.1/offload_folder'

In [7]:
# need this to get token to access a gated model
# notebook_login()

In [7]:
%%time

mistral_tokenizer = AutoTokenizer.from_pretrained(model_path, 
                                          cache_dir=cache_dir, 
                                          local_flies_only=True,
                                          padding_side="left")
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token 

# load empty model to save memory
with init_empty_weights():
    mistral_model = AutoModelForCausalLM.from_pretrained(model_path, 
                                             cache_dir=cache_dir,
                                             local_files_only=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



CPU times: total: 6.05 s
Wall time: 3.47 s




In [8]:
# 0: cuda. cpu: RAM
device_map = infer_auto_device_map(
    mistral_model, 
    max_memory={0: "6GB", 'cpu': "30GB"},
    no_split_module_classes=["OPTDecoderLayer"], 
    dtype='float16'
)

device_map



OrderedDict([('model.embed_tokens', 0),
             ('model.layers.0', 0),
             ('model.layers.1', 0),
             ('model.layers.2', 0),
             ('model.layers.3', 0),
             ('model.layers.4', 0),
             ('model.layers.5', 0),
             ('model.layers.6', 0),
             ('model.layers.7', 0),
             ('model.layers.8', 0),
             ('model.layers.9', 0),
             ('model.layers.10', 0),
             ('model.layers.11', 0),
             ('model.layers.12.self_attn.q_proj', 0),
             ('model.layers.12.self_attn.k_proj', 'cpu'),
             ('model.layers.12.self_attn.v_proj', 'cpu'),
             ('model.layers.12.self_attn.o_proj', 'cpu'),
             ('model.layers.12.self_attn.rotary_emb', 'cpu'),
             ('model.layers.12.mlp', 'cpu'),
             ('model.layers.12.input_layernorm', 'cpu'),
             ('model.layers.12.post_attention_layernorm', 'cpu'),
             ('model.layers.13', 'cpu'),
             ('model.layers

In [9]:
mistral_model = load_checkpoint_and_dispatch(mistral_model, 
                         model_path,
                         device_map=device_map,
                         dtype='float16',
                         offload_folder=offload_folder,
                         offload_state_dict=True)

  0%|          | 0/203 [00:00<?, ?w/s]

  0%|          | 0/88 [00:00<?, ?w/s]



In [15]:
comment = "Fed announced an interest rate cut."

prompt = """
Give a one word topic that describes the following comment: {0} Topic: 
""".format(comment).strip()

print(prompt)

Give a one word topic that describes the following comment: Fed announced an interest rate cut. Topic:


In [16]:
%%time

input_ids = mistral_tokenizer([prompt], return_tensors="pt").input_ids.to(device)
with torch.no_grad():
    generated_ids = mistral_model.generate(input_ids, 
                                      do_sample=True,
                                      temperature=0.1,
                                      top_k=100,
                                      top_p=100,
                                      max_new_tokens=3)
    generated_text = mistral_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Give a one word topic that describes the following comment: Fed announced an interest rate cut. Topic: Interest Rate Cut
CPU times: total: 3.09 s
Wall time: 7.56 s
