In [1]:
import os
from datetime import datetime

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq


bin d:\anaconda3\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll


### Load dataset


In [2]:
from datasets import load_from_disk

dataset = load_from_disk("text-dedup/output/minhash/cf_code_dedup")
train_dataset = dataset.train_test_split(test_size=0.1)["train"]
eval_dataset = dataset.train_test_split(test_size=0.1)["test"]

In [3]:
print(train_dataset[0])

{'submission_id': '42900388', 'code': 'import math\nfrom fractions import Fraction\nimport time\n\nMAXN = 10000\n\ndef solve(free_nodes, n):\n\tseq = []\n\tused = {}\n\n\tused[0] = 0\n\tpos = 0\n\twhile used.get(free_nodes) == None:\n\t\tused[free_nodes] = pos\n\t\ttoss = 0\n\t\twhile free_nodes < n:\n\t\t\ttoss += 1\n\t\t\tfree_nodes *= 2\n\t\t\n\t\tseq.append((toss, Fraction(free_nodes - n, free_nodes)))\n\t\tfree_nodes -= n\n\t\tpos += 1\n\n\n\tfirst_loop_idx = used.get(free_nodes)\n\tans = Fraction(0, 1)\n\tprod_prob = Fraction(1, 1)\n\tpos = len(seq) - 1\n\twhile pos >= 0:\n\t\ttosses = seq[pos][0]\n\t\tprob = seq[pos][1]\n\t\tans = ans * prob + Fraction(tosses, 1)\n\t\tprod_prob *= prob\n\n\t\tif pos == first_loop_idx:\n\t\t\tbreak\n\t\tpos -= 1\n\n\texpected = ans / (Fraction(1, 1) - prod_prob)\n\tseq[first_loop_idx] = (expected, 0)\n\tpos = first_loop_idx\n\tans = Fraction(0, 1)\n\twhile pos >= 0:\n\t\ttosses = seq[pos][0]\n\t\tprob = seq[pos][1]\n\t\tans = ans * prob + Fractio

### Load model
I load code llama from huggingface in int8. Standard for Lora:

In [4]:
base_model = "codellama/CodeLlama-7b-Python-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Python-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


torch_dtype=torch.float16 means computations are performed using a float16 representation, even though the values themselves are 8 bit ints.

If you get error "ValueError: Tokenizer class CodeLlamaTokenizer does not exist or is not currently imported." Make sure you have transformers version is 4.33.0.dev0 and accelerate is >=0.20.3.


In [5]:
eval_prompt = """import math\r\nn, m, a = map(int, input().split())\r\n"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


import math
n, m, a = map(int, input().split())
Ї


def solve(n, m, a):
    if n == 1:
        return 1
    if m == 1:
        return 1
    if a == 1:
        return 1
    if a == 2:
        return 2
    if a == 3:
        return 3
    if a == 4:


In [6]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

Setup the tokenize function to make labels and input_ids the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:

In [7]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result

And run convert each data_point into a prompt that I found online that works quite well:

In [8]:
def generate_and_tokenize_prompt(data_point):
    return tokenize(data_point["code"])

Reformat to prompt and tokenize each sample:

In [9]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/108850 [00:00<?, ? examples/s]

Map:   0%|          | 0/12095 [00:00<?, ? examples/s]

### 5. Setup Lora

In [10]:
model.train()
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)



In [11]:
resume_from_checkpoint = "D:\Code\大模型实战：AIGC技术原理及实际应用\9-CodeLLM\python-code-llama\checkpoint-540" # set this to the adapter_model.bin file you want to resume from

if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        adapters_weights = torch.load(resume_from_checkpoint)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")

Restarting from D:\Code\大模型实战：AIGC技术原理及实际应用\9-CodeLLM\python-code-llama\checkpoint-540


PermissionError: [Errno 13] Permission denied: 'D:\\Code\\大模型实战：AIGC技术原理及实际应用\\9-CodeLLM\\python-code-llama\\checkpoint-540'

In [None]:
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True

### 6. Training arguments
If you run out of GPU memory, change per_device_train_batch_size. The gradient_accumulation_steps variable should ensure this doesn't affect batch dynamics during the training run. All the other variables are standard stuff that I wouldn't recommend messing with:

In [None]:
batch_size = 64
per_device_train_batch_size = 16
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "python-code-llama"

training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    max_steps=4000,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=20,
    save_steps=20,
    output_dir=output_dir,
    load_best_model_at_end=False,
    group_by_length=True
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

Then we do some pytorch-related optimisation (which just make training faster but don't affect accuracy):

In [None]:
model.config.use_cache = False

old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)

In [None]:
trainer.train()

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

bin d:\anaconda3\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

To load a fine-tuned Lora/Qlora adapter use PeftModel.from_pretrained. ```output_dir``` should be something containing an adapter_config.json and adapter_model.bin:

In [2]:
from peft import PeftModel

output_dir = "python-code-llama/checkpoint-540"
model = PeftModel.from_pretrained(model, output_dir)

Try the same prompt as before:

In [3]:
eval_prompt = """import math\r\nn, m, a = map(int, input().split())\r\n"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


import math
n, m, a = map(int, input().split())

def is_prime(n):
    if n == 1:
        return False
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True

def get_prime_factors(n):
    factors = []
    while n % 2 == 0:

