In [2]:
from datasets import load_dataset, DatasetDict
from config import HUGGING_FACE_TOKEN as token
import torch
from torch import cuda
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

cache_dir = "models/"
model_name = "meta-llama/Llama-2-7b-chat-hf"
model_path = "final_model/"
device_map = {"": 0}

# Load Yahoo Answers Topics dataset
full_dataset = load_dataset("CShorten/ML-ArXiv-Papers")

In [None]:
# # Reload model in FP16 and merge it with LoRA weights
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map=device_map,
#     use_auth_token=True
# )

# # Reload tokenizer to save it
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

In [None]:
# model = PeftModel.from_pretrained(base_model, "final_checkpoint_arxiv")
# model = model.merge_and_unload()

# # Save the merged model
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)

In [3]:
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, use_cache=True, return_dict=True, torch_dtype=torch.float16,
    device_map=device_map)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
system_prompt = """
    <s>[INST] <<SYS>>
    You are a helpful, respectful and honest assistant for labeling topics.
    <</SYS>>
    """

# Example prompt demonstrating the output we are looking for
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""

In [11]:
user_prompt = full_dataset["train"][114000]["abstract"]
print(user_prompt)

main_prompt = f"""
[INST]
I have a topic that contains the following text: {user_prompt}

Based on the information about the topic above, please create a short label of this topic. Make sure you only return the label of this text and nothing more.
[/INST]
"""

Language models demonstrate both quantitative improvement and new qualitative
capabilities with increasing scale. Despite their potentially transformative
impact, these new capabilities are as yet poorly characterized. In order to
inform future research, prepare for disruptive new model capabilities, and
ameliorate socially harmful effects, it is vital that we understand the present
and near-future capabilities and limitations of language models. To address
this challenge, we introduce the Beyond the Imitation Game benchmark
(BIG-bench). BIG-bench currently consists of 204 tasks, contributed by 442
authors across 132 institutions. Task topics are diverse, drawing problems from
linguistics, childhood development, math, common-sense reasoning, biology,
physics, social bias, software development, and beyond. BIG-bench focuses on
tasks that are believed to be beyond the capabilities of current language
models. We evaluate the behavior of OpenAI's GPT models, Google-internal dense
transform

In [12]:
full_prompt = system_prompt + example_prompt + main_prompt

In [13]:
from transformers import pipeline

gen = pipeline('text-generation', model=model, tokenizer=tokenizer)
result = gen(full_prompt)
print(result[0]['generated_text'])


    <s>[INST] <<SYS>>
    You are a helpful, respectful and honest assistant for labeling topics.
    <</SYS>>
    
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST] Environmental impacts of eating meat

[INST]
I have a topic that contains the following text: Language models demonstrate both quantitative improvement and new qualitative
capabilities w