# Load Llama-2-7b-chat for the Q&A task (baseline 1)




## 0. Set up the environment

In [None]:
# Specify package versions, otherwise install the latest versions
!pip install accelerate==0.23.0 bitsandbytes==0.41.1 peft==0.4.0 safetensors==0.4.0 transformers==4.34.0 trl==0.4.7 # datasets==2.13.0  # openai==0.28.1


Collecting accelerate==0.23.0
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m235.5/258.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.41.1
  Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors==0.4.0
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014

In [None]:
# load the libraries

import os
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub repository of the chat model used as the baseline
model_id = "NousResearch/Llama-2-7b-chat-hf"

# The tokenizer does not depend on the weights, so fetch it first
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in half precision; device_map="auto" leaves device placement to the library
model_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)


Downloading config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



Downloading tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Mount Google Drive so the dataset and the result files are reachable from Colab
import os
from google.colab import drive

drive.mount('/content/gdrive')

# Switch to the project folder: every later "./..." path is resolved relative to it
project_dir = '/content/gdrive/My Drive/Practice/4/ml-arxiv-papers/llama2-7b-chat'
os.chdir(project_dir)


Mounted at /content/gdrive


In [None]:
# Load the dataset from the mounted drive

import json

# Absolute path, so this cell does not depend on the current working directory
alldata_path = '/content/gdrive/My Drive/Practice/4/ml-arxiv-papers/alldata.json'
with open(alldata_path, mode='r', encoding='utf-8') as dataset_file:
    alldata = json.load(dataset_file)
print("# Total alldata samples:", len(alldata))
print()

# Hold out the samples at positions 42000..42999 as the test set
test_start, test_stop = 42000, 43000
test_dataset = alldata[test_start:test_stop]
print("# test_dataset samples:", len(test_dataset))
print("test_dataset 1st sample:", test_dataset[0])




# Total alldata samples: 43713

# test_dataset samples: 1000
test_dataset 1st sample: {'id': 112194, 'context': 'Neural Mesh: Introducing a Notion of Space and Conservation of Energy to   Neural Networks.Neural networks are based on a simplified model of the brain. In this project, we wanted to relax the simplifying assumptions of a traditional neural network by making a model that more closely emulates the low level interactions of neurons. Like in an RNN, our model has a state that persists between time steps, so that the energies of neurons persist. However, unlike an RNN, our state consists of a 2 dimensional matrix, rather than a 1 dimensional vector, thereby introducing a concept of distance to other neurons within the state. In our model, neurons can only fire to adjacent neurons, as in the brain. Like in the brain, we only allow neurons to fire in a time step if they contain enough energy, or excitement. We also enforce a notion of conservation of energy, so that a neuron canno

In [None]:
# Pick a single test sample (index 2) to try the prompt on before running the whole test set
sample = test_dataset[2]

# Build the chat prompt: the system message sits between <<SYS>> and <</SYS>>, and the
# question plus its context form the user turn that is closed by [/INST].
# From experience: leave <s> and </s> out of the prompt text during inference; they are
# handled as special tokens (presumably <s> is prepended by the tokenizer -- TODO confirm).
# The trailing backslashes inside the string are line continuations, so the three
# system-message sentences end up on one line in the final prompt.
prompt = f"""[INST] <<SYS>>
You are a helpful, respectful and honest assistant. \
Your task is to generate an answer to the given question. \
And your answer should be based on the provided context only.
<</SYS>>

### Question: {sample['question']}
### Context: {sample['context']} [/INST]
### Answer:
"""


# Show the fully rendered prompt for a visual check
print(prompt)


[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Your task is to generate an answer to the given question. And your answer should be based on the provided context only.
<</SYS>>

### Question: What is AdaScale SGD and its key feature?
### Context: AdaScale SGD: A User-Friendly Algorithm for Distributed Training.When using large-batch training to speed up stochastic gradient descent, learning rates must adapt to new batch sizes in order to maximize speed-ups and preserve model quality. Re-tuning learning rates is resource intensive, while fixed scaling rules often degrade model quality. We propose AdaScale SGD, an algorithm that reliably adapts learning rates to large-batch training. By continually adapting to the gradient's variance, AdaScale automatically achieves speed-ups for a wide range of batch sizes. We formally describe this quality with AdaScale's convergence bound, which maintains final objective values, even as batch sizes grow large and the number of itera

In [None]:
# Encode the prompt and move the token ids to the device the model lives on
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Generate at most 128 new tokens (a larger max_new_tokens means longer inference).
# NOTE(review): temperature only changes the result when sampling is active
# (do_sample=True, either passed here or set in the model's generation config) -- confirm.
outputs = model.generate(input_ids=input_ids, max_new_tokens=128, temperature=0.8)

# Full decoded sequence (prompt + continuation, special tokens included) for inspection
model_output = tokenizer.decode(outputs[0])

# The returned sequence starts with the prompt tokens. Drop them by token count before
# decoding: slicing the decoded *string* with len(prompt) silently breaks whenever the
# decoded text does not reproduce the prompt character for character.
prompt_token_count = input_ids.shape[-1]
generated_answer = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

print('\nModel output:\n', model_output)  # decoded_output
print("___" * 20)

print('\nGround truth:\n', sample['answer'])
print('\nGenerated answer:\n', generated_answer)



Model output:
 <s> [INST] <<SYS>>
You are a helpful, respectful and honest assistant. Your task is to generate an answer to the given question. And your answer should be based on the provided context only.
<</SYS>>

### Question: What is AdaScale SGD and its key feature?
### Context: AdaScale SGD: A User-Friendly Algorithm for Distributed Training.When using large-batch training to speed up stochastic gradient descent, learning rates must adapt to new batch sizes in order to maximize speed-ups and preserve model quality. Re-tuning learning rates is resource intensive, while fixed scaling rules often degrade model quality. We propose AdaScale SGD, an algorithm that reliably adapts learning rates to large-batch training. By continually adapting to the gradient's variance, AdaScale automatically achieves speed-ups for a wide range of batch sizes. We formally describe this quality with AdaScale's convergence bound, which maintains final objective values, even as batch sizes grow large and

Run inference on the whole test set and save the results (1,000 samples on an L4 GPU; total duration 2 h 8 min, about 7.7 s per sample).

In [None]:
import json

# Run the baseline model over the whole test set and persist every answer as soon as it
# is produced, so an interrupted session loses at most the sample in flight.
results_path = "./test_dataset_inference_results_base.json"

# Load the results of a previous (possibly interrupted) run once, or start a new file.
if os.path.exists(results_path):
    with open(results_path, "r") as f:
        existing_results = json.load(f)
else:
    existing_results = []
    with open(results_path, "w") as f:
        json.dump(existing_results, f)

# Results are appended in test-set order, so the number of stored results is the index
# of the first sample that still has to be processed. Re-running the cell therefore
# resumes where it stopped instead of appending duplicates from sample 0.
start_index = len(existing_results)

for i in range(start_index, len(test_dataset)):
    sample = test_dataset[i]
    prompt = f"""[INST] <<SYS>>
You are a helpful, respectful and honest assistant. \
Your task is to generate an answer to the given question. \
And your answer should be based on the provided context only.
<</SYS>>

### Question: {sample['question']}
### Context: {sample['context']} [/INST]
### Answer:
"""

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    # NOTE(review): temperature only changes the result when sampling is active
    # (do_sample=True, either passed here or set in the model's generation config) -- confirm.
    outputs = model.generate(input_ids=input_ids, max_new_tokens=128, temperature=0.8)

    # Keep only the newly generated tokens. Cutting the decoded string at len(prompt)
    # is fragile because the decoded text need not match the prompt character for character.
    prompt_token_count = input_ids.shape[-1]
    generated_answer = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

    result = {
        'id': sample['id'],
        'question': sample['question'],
        'ground_truth': sample['answer'],
        'answer': generated_answer
    }

    # The in-memory list mirrors the file, so there is no need to re-read the file
    # on every iteration; just append and rewrite.
    existing_results.append(result)

    with open(results_path, "w") as f:
        json.dump(existing_results, f)



In [None]:
# Read the saved inference results back from disk as a sanity check

import json

results_file = "./test_dataset_inference_results_base.json"
with open(results_file, "r") as results_fh:
    test_dataset_inference_results = json.load(results_fh)
print("# results samples:", len(test_dataset_inference_results))

# Display the last three records
test_dataset_inference_results[-3:]


# results samples: 1000


[{'id': 114586,
  'question': 'How do kernel-based divergence portfolios improve portfolio optimization?',
  'ground_truth': 'Kernel-based divergence portfolios enhance portfolio optimization by relaxing assumptions and constraints, leading to faster convergence and improved performance on synthetic and real-world examples.',
  'answer': 'Kernel-based divergence portfolios can improve portfolio optimization in several ways:\n\n1. Relaxed assumptions: The traditional portfolio optimization approaches rely on strong assumptions about the distribution of returns, such as normality or stationarity. Kernel-based divergences, on the other hand, can handle more flexible and realistic distributions, such as non-Gaussian or time-varying returns.\n2. Robustness to model misspecification: The Kullback-Leibler (KL) divergence and other divergences used in traditional portfolio optimization are sensitive to'},
 {'id': 114587,
  'question': 'What is the impact of sub-task decomposition on learning i