The goal of this notebook is to distill down the essential compenents of training HF models in a way that is easy to port to a data pipeline and apply to new model training scenarios.

# Imports

In [1]:
import os
import pandas as pd
from rich import print
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datetime import datetime;
pd.set_option('display.max_colwidth', None)

# Define variables

In [2]:
if True:
    date_str = datetime.now().strftime("%Y_%m_%d")
date_str

'2023_11_11'

In [3]:
work_dir = "work"
seed = 42 #reproducability  over training runs
clobber_project_dir = False #wipe the project director clean
project_dir = f"work_for_{date_str}" #project name, also used as wandb project name

#creds
hf_token = os.environ["HF_TOKEN"]
wandb_key = os.environ["WANDB_KEY"]
hf_account_name = os.environ["HF_ACCOUNT_NAME"]

#HF touchpoints
hf_dataset_name = "knkarthick/dialogsum"
source_hf_model_name = "meta-llama/Llama-2-7b-hf"

checkpoint_dir = "best_checkpoint"
quantization_dir = 'quantized_8bit'
merged_dir = 'merged_model'
quantized_train_dir = 'quant_trained_model_new'

train_test_split_ratio = 0.025

#LoRA hparams
lora_r = 16  # rank
lora_alpha = lora_r * 2
lora_dropout = 0.05

#training hparams
epochs = 1
#max_steps = 500
per_device_train_batch_size=16
gradient_accumulation_steps=16
early_stopping_patience = 3
learning_rate = 7e-4
logging_steps = 5
device = "cuda"
max_training_sample_length = 512 #2048

#quantization params
quantization_bits = 4

destination_hf_model_name = f"{hf_account_name}/llama2-7b-dialogsum-qlora-gptq"
project_path = os.path.join(work_dir,project_dir, checkpoint_dir)
quant_path = os.path.join(project_dir, quantization_dir)
quantized_train_path = os.path.join(project_dir, quantized_train_dir)
merged_path = os.path.join(project_path, merged_dir)

project_path

'work/work_for_2023_11_11/best_checkpoint'

## State of working directory

In [4]:
print(project_dir)
!mkdir -p $project_dir

#Set clobber_project_dir to True to reset working directory
if clobber_project_dir:
    !rm -rf ./$project_dir

!mkdir -p $project_dir

!ls -latrs ./$project_dir

total 0
0 drwxr-xr-x 1 root root 4096 Nov 11 19:03 .
0 drwxr-xr-x 1 root root 4096 Nov 11 19:03 runs
0 drwxr-xr-x 1 root root 4096 Nov 11 19:38 ..


# Process data for training

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import MaxNLocator




def analyze_generation_params(text, model):
    print(text,"-"*60)
    results = []
    for temp in np.linspace(.5,1.5,num = 5):
        print(f"temp {temp}:",end="")
        results.append(generate(text, model, prompt=False, max_new_tokens=185,temp=temp))
    
    print("-"*60)
    results = []
    for top_p in np.linspace(0,1,num = 11):
        print(f"top_p {top_p}:",end="")
        results.append(generate(text, model, prompt=False, max_new_tokens=185,temp=.7, top_p=top_p))
    
    
    print("-"*60)
    results = []
    for top_k in np.linspace(0,100,num = 11,dtype=int):
        print(f"top_k {top_k}:",end="")
        results.append(generate(text,  model, prompt=False, max_new_tokens=185, top_k=int(top_k)))
    
    print("-"*60)
    results = []
    for do_sample in [True,False]:
        print(f"do_sample {do_sample}:",end="")
        results.append(generate(text,  model, prompt=False, max_new_tokens=185, do_sample = do_sample, top_k=None))
    
    
    print("-"*60)
    results = []
    for repetition_penalty in np.linspace(1.,2.,num=11):
        print(f"repetition_penalty {repetition_penalty}:",end="")
        results.append(generate(text,  model, prompt=False, max_new_tokens=48,repetition_penalty=repetition_penalty))
    
    print("-"*60)
    results = []
    for typical_p in np.linspace(.1,.9,num=9):
        print(f"typical_p {typical_p}:",end="")
        results.append(generate(text,  model, prompt=False, max_new_tokens=48,typical_p=typical_p))


def generate(text, model, prompt=False, temp=0.7,top_k= 40, top_p = 0.1, do_sample = True, repetition_penalty = 1.23, typical_p = 1, guidance_scale = 1, max_new_tokens = 24,):
    from transformers import TextStreamer
    if prompt:
        print(text,"-"*20)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
    inputs = tokenizer(text, return_tensors="pt").to('cuda')

    # Call the generate method of the model with the given inputs and additional generation configurations.
    outputs = model.generate(**inputs,  # "inputs" likely includes input_ids or prompts for generation.
                           streamer=streamer,  # "streamer" not a standard parameter in HF documentation, likely custom for model-specific streaming.
                           max_new_tokens=max_new_tokens,  # Sets the maximum number of new tokens to generate; range varies based on model and computational limits.
                           temperature=temp,  # Controls randomness: lower->more deterministic, higher->more random; typically in range [0.5, 1.5].
                           do_sample=do_sample,  # Enables sampling; when True, picks tokens based on probability distribution, rather than just most likely.
                           top_p=top_p,  # Nucleus sampling: selects top p% probability tokens for sampling; range [0, 1].
                           top_k=top_k,  # Top-k sampling: chooses from top k probability tokens; if k=0, it's the same as using no top-k.
                           repetition_penalty=repetition_penalty,  # Penalizes repeated tokens; >1 discourages, <1 encourages repetition; typically close to 1.
                           typical_p=typical_p,  # Typical sampling; selects tokens whose cumulative probability is above this threshold, range [0, 1], usually close to 1.
                           guidance_scale=guidance_scale,  # Affects the scale of guidance in models that support it, like CTRL; standard range not well-defined.
                           #seed=seed,  # Sets a seed for reproducibility; commented out, so not in use.
                          )
    generated_tokens = outputs[0].tolist()[len(inputs[0]):]
    result = tokenizer.decode(generated_tokens)
    return result

def set_plot_appearance(fontsize_title=14, 
                        fontsize_label=12, 
                        fontsize_ticks=8, 
                        theme='dark'):
    """
    Configure the plot's appearance with font sizes and theme for dark backgrounds.
    
    Parameters:
    fontsize_title: Font size for the title.
    fontsize_label: Font size for the x and y labels.
    fontsize_ticks: Font size for the x and y tick labels.
    theme: Seaborn theme for the plot's aesthetic style.
    """
    sns.set_theme(style=theme, palette='pastel')
    
    plt.rcParams.update({
        'figure.facecolor': '#1a1a1a',  # Very dark grey background for the figure
        'axes.facecolor': '#1a1a1a',    # Very dark grey background for the plots
        'grid.color': 'gray',           # Lighter grey grid lines (less contrast)
        'text.color': 'white',          # White text for better contrast on dark bg
        'axes.labelcolor': 'white',     # White labels for axes
        'xtick.color': 'white',         # White x-tick labels
        'ytick.color': 'white',         # White y-tick labels
        'axes.labelsize': fontsize_label,
        'axes.titlesize': fontsize_title,
        'xtick.labelsize': fontsize_ticks,
        'ytick.labelsize': fontsize_ticks,
        'legend.title_fontsize': fontsize_ticks + 2,
        'legend.fontsize': fontsize_ticks,
        'axes.edgecolor': 'lightgrey',  # Light grey color for the axes' spines
    })
# Possible modern themes: 'darkgrid', 'whitegrid', 'dark', 'white', and 'ticks'.
# Example usage: set_plot_appearance(theme='ticks')

def plot_text_length_histogram(df, bucket_size=64, percentile=95):
    """
    Plot a histogram of text lengths and indicate the percentile.

    Parameters:
    df: the pandas dataframe to render from
    bucket_size: the number of characters to bucket the histogram with
    percentile: draws a vertical line at this percentile to visualize whether an acceptable fraction of the data lenghts are being used for training set
    """
    set_plot_appearance()  # Apply appearance settings to the plot
    text_lengths = df['text'].apply(len)  # Calculate text lengths.
    bins = range(0, max(text_lengths) + bucket_size, bucket_size)  # Define histogram bins.
    percentile_value = np.percentile(text_lengths, percentile)  # Calculate percentile value.

    fig, ax = plt.subplots()  # Create figure and axis objects.
    ax.hist(text_lengths, bins=bins, edgecolor='black', alpha=0.7)  # Plot histogram.
    ax.axvline(x=percentile_value, color='red', linestyle='--')  # Draw percentile line.
    
    # Annotate the percentile line.
    ax.text(percentile_value, ax.get_ylim()[1]*0.9, f' {percentile}th Percentile: {int(round(percentile_value, -1))}',
            color='red', ha='left')

    ax.set_xlabel('Text Length')  # Set x-axis label.
    ax.set_ylabel('Frequency')  # Set y-axis label.
    ax.set_title('Histogram of Text Lengths')  # Set title.

    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # Set major locator for x-axis.
    plt.show()  # Display the plot.

def visualize_train_test_category_splits(train_df, test_df, top_x=5, normalize=True):
    """
    Visualize the frequency of top categories in training and test datasets.
    """
    set_plot_appearance()  # Apply appearance settings to the plot

    # Get top x categories from both datasets.
    train_topic_norm = train_df['topic'].value_counts(normalize=normalize).head(top_x)
    test_topic_norm = test_df['topic'].value_counts(normalize=normalize).head(top_x)
    
    # Prepare the data for plotting.
    train_topics = pd.DataFrame({'topic': train_topic_norm.index, 'frequency': train_topic_norm.values, 'dataset': 'Training'})
    test_topics = pd.DataFrame({'topic': test_topic_norm.index, 'frequency': test_topic_norm.values, 'dataset': 'Test'})
    combined_topics = pd.concat([train_topics, test_topics])

    sns.barplot(x='frequency', y='topic', hue='dataset', data=combined_topics)  # Create bar plot.
    plt.title('Top {} Topics Frequency Comparison'.format(top_x))  # Set title.
    plt.xlabel('Normalized Frequency')  # Set x-axis label.
    plt.ylabel('Topic')  # Set y-axis label.
    plt.tight_layout()  # Adjust layout.
    plt.show()  # Display the plot.



def get_data_stuff(dataset_name, split_ratio=0.2):    
    # %%
    from datasets import load_dataset
    from datasets import Dataset
    import pandas as pd
    pd.set_option('display.max_colwidth', None)
    
    
    #load the dataset
    dataset = load_dataset(dataset_name)
    
    # Shuffle the dataset (setting seed for reproducibility)
    shuffled_ds = dataset['train']#.shuffle(seed=seed)
    #shuffled_ds = shuffled_ds.select(range(1000))

    
    # Split the dataset into training and test sets with a test size of 20%
    train_test_split = shuffled_ds.train_test_split(test_size=train_test_split_ratio, seed = seed)
    
    # The train/test datasets are now accessible as follows:
    train_ds = train_test_split['train']
    raw_train_df = pd.DataFrame(train_ds)
    
    test_ds = train_test_split['test']
    raw_test_df = pd.DataFrame(test_ds)
    
    prepared_train_df = prepare_dataset(raw_train_df, "train")
    prepared_test_df = prepare_dataset(raw_test_df, "test")

    prepared_train_dataset = Dataset.from_pandas(prepared_train_df)
    prepared_test_dataset = Dataset.from_pandas(prepared_test_df)
    
    print(f"length of downloaded ds: {len(shuffled_ds)}")
    print(f"length of train_ds: {len(prepared_train_dataset)}")
    print(f"length of test_ds: {len(prepared_test_dataset)}")
    estimated_training_steps = epochs * len(prepared_train_dataset) / per_device_train_batch_size / gradient_accumulation_steps
    print(f"estimated training steps will be {estimated_training_steps}")
    return prepared_train_df, prepared_test_df, prepared_train_dataset, prepared_test_dataset


def prepare_dataset(df, split="train"):
    text_col = []
    instruction = """Write a concise summary of the below input text.Ensure that responses covers the key points of the text.Only provide full sentence responses."""  # change instuction according to the task
    if split == "train":
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
                + output
                + "\n### End"
            )  # keeping output column in training dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            text = (
                "### Instruction: \n"
                + instruction
                + "\n### Input: \n"
                + input_q
                + "\n### Response :\n"
            )  # not keeping output column in test dataset
            text_col.append(text)
        df.loc[:, "text"] = text_col
    return df



In [6]:
train_df, test_df, train_dataset, test_dataset = get_data_stuff(hf_dataset_name, 
                                                                split_ratio=train_test_split_ratio,
                                                               )
text = test_df['text'][1]

In [7]:
print(text)

In [None]:
%%time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# loading the model with quantization config
model = AutoModelForCausalLM.from_pretrained(
    source_hf_model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map= "auto",
    #token = hf_token,
)

model.config.use_cache = True  #surpress the download and use cache. # apparently needed because of https://github.com/huggingface/transformers/pull/24906
model.config.pretraining_tp = 1  #disable tensor parallelism

#fixme
tokenizer = AutoTokenizer.from_pretrained(source_hf_model_name,
     trust_remote_code=True, 
     return_token_type_ids=False,
     token = hf_token,
 )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
from rich import print
print(model)

In [None]:
%%time
import trl
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from trl import SFTTrainer
from transformers import EarlyStoppingCallback
import transformers


peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    #target_modules=[ "q_proj","v_proj",],  # we will only create adopters for q, v metrices of attention module
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)


training_arguments = transformers.TrainingArguments(
    output_dir=project_dir,
    #load_from_checkpoint=str,
    seed = seed,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim="paged_adamw_8bit",
    learning_rate= learning_rate,
    lr_scheduler_type="linear",
    save_strategy="steps",
    logging_steps=logging_steps,
    num_train_epochs=epochs,
    #max_steps=max_steps,
    fp16=True,
    push_to_hub=False,
    evaluation_strategy="steps",  # Add this line
    eval_steps=logging_steps,  # Add this line
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,  # passing peft config
    dataset_text_field="text",  # mentioned the required column
    args=training_arguments,  # training agruments
    tokenizer=tokenizer,  # tokenizer
    packing=False,
    max_seq_length=max_training_sample_length,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
)
type(model)

In [None]:
%%time
if True:
    trainer.train()

In [None]:
print("done!")

In [None]:
%%time
import os
trainer.model.save_pretrained(project_path)

In [None]:
project_dir

In [None]:
!ls -lhatrs ./$project_path/

In [None]:
!ls -latrsh ./$project_path

In [None]:
!cat ./$project_path/README.md


In [None]:
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig.from_pretrained(project_path)
loaded_model = get_peft_model(model, lora_config)

In [None]:
#text = test_df['text'][500]
print(text)
result = generate(text, loaded_model, max_new_tokens=64, temp=0.1)
#del lmodel

In [None]:
#del  loaded_model
import torch
torch.cuda.empty_cache()

In [None]:
result

In [None]:
print(project_path)
!ls -latrsh $project_path
!cat $project_path/adapter_config.json

In [None]:
%%time
from peft import AutoPeftModelForCausalLM
from transformers import  AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    source_hf_model_name, 
    trust_remote_code=True, 
    return_token_type_ids=False,
    token = hf_token,
)
#tokenizer = AutoTokenizer.from_pretrained(source_hf_model_name)

persisted_model = AutoPeftModelForCausalLM.from_pretrained(
    project_path, # path to the project or model directory
    low_cpu_mem_usage=True, # reduces CPU memory usage if True
    return_dict=True, # ensures output is returned as a dictionary
    torch_dtype=torch.float16, # sets the data type for PyTorch tensors
    device_map=device, # specifies the device placement for the model
    #token=hf_token, # token for Hugging Face authentication

)

In [None]:
#text = test_df['text'][500]
print(text)
result = generate(text, persisted_model, max_new_tokens=96, temp=0.1)
#del persisted_model

In [None]:
result

In [None]:
%%time

# Merges the parts of the model that were possibly loaded on different devices and unloads them from memory
# ^^ from gpt4.  i suspect an halucination, verify.
merged_model = persisted_model.merge_and_unload(progressbar=True)

merged_model.save_pretrained(merged_path, safe_serialization=True)
tokenizer.save_pretrained(merged_path)

del persisted_model

In [None]:
print(merged_path)
!ls -lathrs $merged_path

### if dead again, try starting here

In [None]:
%%time
#qqqq
tokenizer = AutoTokenizer.from_pretrained(merged_dir)
merged_model_from_disk = AutoModelForCausalLM.from_pretrained( merged_dir, 
                                                              device_map=device,
                                                             )

In [None]:
print(text)
result = generate(text, merged_model_from_disk,max_new_tokens=96, temp = 0.01)

In [None]:
#del merged_model_from_disk

## Quantize it

In [8]:
merged_path


'work/work_for_2023_11_11/best_checkpoint/merged_model'

In [9]:
%%time
from transformers import GPTQConfig

quantization_config = GPTQConfig(
    bits=quantization_bits,
    dataset=["c4"],
    desc_act=False,
)

quant_model = AutoModelForCausalLM.from_pretrained(
    merged_path, 
    quantization_config=quantization_config, 
    device_map=device
)
tokenizer = AutoTokenizer.from_pretrained(merged_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Quantizing model.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 36min 43s, sys: 2min 27s, total: 39min 10s
Wall time: 19min 32s


In [10]:
#del persisted_model

## Save quantized model to disk

In [11]:
quant_path

'work_for_2023_11_11/quantized_8bit'

In [12]:
%%time

quant_path = os.path.join(project_dir, quantization_dir)
print(f"saving to {quant_path}")

# Save the quantized model
quant_model.save_pretrained(quant_path, safe_serialization=True)
tokenizer.save_pretrained(quant_path)
#del persisted_model

CPU times: user 1.3 s, sys: 862 ms, total: 2.16 s
Wall time: 5.21 s


('work_for_2023_11_11/quantized_8bit/tokenizer_config.json',
 'work_for_2023_11_11/quantized_8bit/special_tokens_map.json',
 'work_for_2023_11_11/quantized_8bit/tokenizer.model',
 'work_for_2023_11_11/quantized_8bit/added_tokens.json',
 'work_for_2023_11_11/quantized_8bit/tokenizer.json')

In [13]:
print('done')

In [15]:
print(quant_path)
!ls -latrsh $quant_path

total 3.7G
   0 drwxr-xr-x 1 root root 4.0K Nov 11 19:58 ..
4.0K -rw-r--r-- 1 root root 1.2K Nov 11 19:58 config.json
   0 -rw-r--r-- 1 root root  183 Nov 11 19:58 generation_config.json
3.7G -rw-r--r-- 1 root root 3.7G Nov 11 19:58 model.safetensors
4.0K -rw-r--r-- 1 root root  900 Nov 11 19:58 tokenizer_config.json
   0 -rw-r--r-- 1 root root  414 Nov 11 19:58 special_tokens_map.json
492K -rw-r--r-- 1 root root 489K Nov 11 19:58 tokenizer.model
   0 drwxr-xr-x 1 root root 4.0K Nov 11 19:58 .
1.8M -rw-r--r-- 1 root root 1.8M Nov 11 19:58 tokenizer.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Load quantized model from disk

In [16]:
%%time
from transformers import AutoModelForCausalLM, AutoTokenizer

print(quant_path)
quant_model_from_disk = AutoModelForCausalLM.from_pretrained( quant_path, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(quant_path)

CPU times: user 6.48 s, sys: 8.3 s, total: 14.8 s
Wall time: 1min


In [17]:
text = test_df['text'][1]
print(text,"\n","="*30)
result = generate(text, quant_model_from_disk,max_new_tokens=96, temp = 0.01)

Mister Ewing asks #Persons1# and #Persons2# to arrive by four in order for him to impress his colleagues with their punctuality. They decide to travel separately so they will know how to get back home afterward.
### End
</s>


In [18]:
result

'Mister Ewing asks #Persons1# and #Persons2# to arrive by four in order for him to impress his colleagues with their punctuality. They decide to travel separately so they will know how to get back home afterward.\n### End\n</s>'

## Push model to HuggingFace hub

In [19]:
%%time

tokenizer.push_to_hub(destination_hf_model_name, token=hf_token)
quant_model_from_disk.push_to_hub(destination_hf_model_name, token = hf_token)

pytorch_model.bin:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

CPU times: user 19.7 s, sys: 6.02 s, total: 25.8 s
Wall time: 49min 13s


CommitInfo(commit_url='https://huggingface.co/andersonjas/llama2-7b-dialogsum-qlora-gptq/commit/ce741f68cf91aeb1c0e9c8475f65b953105a19f3', commit_message='Upload LlamaForCausalLM', commit_description='', oid='ce741f68cf91aeb1c0e9c8475f65b953105a19f3', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
%%time
# %%
#from transformers import GPTQConfig

# quantization_config = GPTQConfig(
#     bits=4,
#     dataset=["c4"],
#     desc_act=False,
# )
#quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True)
from_disk_quant_model = AutoModelForCausalLM.from_pretrained(
    destination_hf_model_name, 
    #quantization_config=quantization_config_loading, 
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(destination_hf_model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

CPU times: user 11.2 s, sys: 11.2 s, total: 22.4 s
Wall time: 48.5 s


## Test the model by pulling from hugging face hub

In [21]:
#text = test_df['text'][500]
text = test_df['text'][1]
print(text)
result = generate(text, from_disk_quant_model, max_new_tokens=85,temp=1)

Mister Ewing asks #Persons1# and #Persons2# to arrive by four in order for him to impress his colleagues with their punctuality. They decide to travel separately so they will know how to get back home afterward.
### End
</s>


In [22]:
result

'Mister Ewing asks #Persons1# and #Persons2# to arrive by four in order for him to impress his colleagues with their punctuality. They decide to travel separately so they will know how to get back home afterward.\n### End\n</s>'