# Inference notebook for Evol Instruction tuning

### Setup

In [2]:
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import PeftModel,get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftConfig
import torch

np.random.seed(42)
torch.manual_seed(42)
#if args.n_gpu > 0:
    #torch.cuda.manual_seed_all(args.seed)

device = "cuda:0"
peft_model_id = "santa_instruct_model/"
model_name_or_path = "bigcode/santacoder"

config = PeftConfig.from_pretrained(peft_model_id)
org_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,trust_remote_code=True)
model = PeftModel.from_pretrained(org_model, peft_model_id)
model.eval()
model.to(device)


Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadCustomModel(
      (transformer): GPT2CustomModel(
        (wte): Embedding(49280, 2048)
        (wpe): Embedding(2048, 2048)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPT2CustomBlock(
            (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2MQAttention(
              (q_attn): Conv1D()
              (kv_attn): Conv1D()
              (c_proj): Linear(
                in_features=2048, out_features=2048, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lo

In [3]:
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request\n### Instruction:\n{instruction}\n\n### Response:"
    ),
}
DEFAULT_PAD_TOKEN = "<|pad|>"
DEFAULT_EOS_TOKEN = "<|endoftext|>"
DEFAULT_UNK_TOKEN = "<|unk|>"

In [4]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict,
    tokenizer,
    model,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg


        
special_tokens_dict = dict()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

if tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.unk_token is None:
    special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)

Using pad_token, but it is not set yet.


In [7]:
stop_words_ids = [tokenizer.encode(stop_word) for stop_word in ["<|endoftext|>"]]

## Inference

In [37]:
def infer(model,input_text):
    inputs = tokenizer(input_text,return_tensors="pt")
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model.generate(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=512,do_sample=True,temperature=0.6)
    
    return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)

### 1. Create a python code to validate an email address using Regular Expressions

#### Instruction tuned model

In [14]:
instruction = "Create a python code to validate an email address using Regular Expressions"

input_text = PROMPT_DICT['prompt_no_input'].format(instruction=instruction)
 
predictions = infer(model,input_text)
print(predictions[0])

Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request
### Instruction:
Create a python code to validate an email address using Regular Expressions

### Response:Validate an email address using regular expressions:

`import re`

email_address = 'ychag@example.com'

# Match against the email address's regular expression
match = re.match(r'[^@]+@[^@]+\.[^@]+', email_address)

# Print the match if it exists
if match:
    print(match.group())
else:
    print('Invalid email address')


# Explanations:
# - `re.match()` will match the email address's regular expression against thestring `email_address`
# - The groups will be returned by the match object.


#### Vanilla LM

In [51]:
org_model.eval()

input_text = """# Create a python code to validate an email address using Regular Expressions"""
 
predictions = infer(org_model,input_text)
print(predictions[0])

Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


# Create a python code to validate an email address using Regular Expressions
# You can use RegEx's to validate email address
# RegEx Patterns are composed of one or more characters that can be matched
# The first character can be any of:
# a - A letter
# b - A number
# l - A lower case letter
# u - An upper case letter
# d - A dollar sign
#. - A dot
# - - A hyphen
# # - A hash
# @ - A at sign
# $ - A dollar sign
# & - A ampersand
# * - A times sign
# () - An open parenthesis
# ) - A close parenthesis

import re

# The email address is composed of a user id, a dot, a domain name, a comma
# and a space. This is not normally a valid email address. But it does
# seem to be a valid email address.

email = 'user@domain.com,,, '

# Use re.compile to create a RegEx
regex = re.compile(
    r'user@domain\.com\,?,?'
)

# Search the email for matches
matches = regex.search(email)

# The first match is returned
print(matches.group())



### 2. Add the missing symbols and brackets for the for loop. for x range 0, 5 print(x)

#### Instruction tuned

In [44]:
instruction = "Add the missing symbols and brackets for the for loop. for x range 0, 5 print(x)"

input_text = PROMPT_DICT['prompt_no_input'].format(instruction=instruction)
 
predictions = infer(model,input_text)
print(predictions[0])

Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request
### Instruction:
Add the missing symbols and brackets for the for loop. for x range 0, 5 print(x)

### Response:Add the missing symbols and brackets for the for loop.
for x in range(0, 5):
    print(x)



#### Vanilla LLM

In [47]:
org_model.eval()

input_text = """# Add the missing symbols and brackets for the for loop. for x range 0, 5 print(x)"""
 
predictions = infer(org_model,input_text)
print(predictions[0])

Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


# Add the missing symbols and brackets for the for loop. for x range 0, 5 print(x)
for x in range(0, 5):
    print(x)

# Add the missing symbols and brackets for the while loop.
# while x < 5:
#     print(x)
#     x += 1

# Add the missing symbols and brackets for the forloop with range().
# for x in range(0, 5):
#     print(x)

# Add the missing symbols and brackets for the whileloop with range().
# while x < 5:
#     print(x)
#     x += 1



### 3. What is the correct syntax to create a dictionary in Python?

#### Instruction tuned

In [62]:

instruction = "What is the correct syntax to create a dictionary in Python?"

input_text = PROMPT_DICT['prompt_no_input'].format(instruction=instruction)
 
predictions = infer(model,input_text)
print(predictions[0])

Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request
### Instruction:
What is the correct syntax to create a dictionary in Python?

### Response:Dictionary in Python is a collection of key-value pairs that can be used for storing data. For example, if you wanted to store data about a particular person, you could create a dictionary that stores name, email and phone. Or you could store it for every person in your dataset.

You can create a dictionary from scratch:

```python
person_dict = {
    "name": "Mary",
    "email": "dycjh@example.com",
    "phone": "1234567890"
}

print(person_dict) # Should be {"name": "Mary", "email": "dycjh@example.com", "phone": "1234567890"}

# Alternatively, you could create a dictionary from another dictionary:

another_person_dict = {
    "name": "John",
    "email": "envkt@example.com",
    "phone": "1234567890"
}

person_dict = another_person_dict
pr

#### Vanilla LLM

In [66]:
org_model.eval()

input_text = """# What is the correct syntax to create a dictionary in Python?"""
 
predictions = infer(org_model,input_text)
print(predictions[0])

Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


# What is the correct syntax to create a dictionary in Python?
# We can use this syntax as follows:

my_dict = {}
my_dict["key1"] = "value1"
my_dict["key2"] = "value2"

print(my_dict)

# Alternatively, we can use the key-value pair syntax:

my_dict = {"key1": "value1", "key2": "value2"}

print(my_dict)

# In other words, the key-value pair syntax is preferred.

# In other words, you can create dictionaries with key-value pairs or with key-value
# pairs and lists as values, which would give you the same result:

my_dict = {"key1": "value1", "key2": "value2"}
my_list = [1, 2, 3]

my_dict = {"key1": "value1", "key2": "value2"}
my_list = [1, 2, 3]

my_dict = {"key1": "value1", "key2": "value2"}
my_list = [1, 2, 3]

# In the end, you will see that we have gone from three key-value pairs to three
# lists, and vice versa.

