In [1]:
!pip install -q langchain

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the data

In [3]:
# Prompt templates keyed by prompt style; `{instruction}` is substituted per example.
PROMPT = {
    'general': (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    )
}
# Used below as the `max_new_tokens` budget for generation.
max_len = 512
# Device that tokenized inputs are moved to. Fall back to CPU when no GPU is
# visible so that `.to(device)` does not fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# JSON file holding the test examples (path is relative to this notebook).
file_path = '../Datasets/pairs_data/cleaned_data/data_test.json'
# A bare `data_files` path is exposed under the 'train' split; request that
# split directly instead of indexing the returned dict.
datasets = load_dataset('json', data_files=file_path, split='train')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Display the loaded split (feature names and number of rows).
datasets

Dataset({
    features: ['output', 'instruction', 'input', 'api'],
    num_rows: 127
})

## Embedding Documentation 

In [6]:
# Checkpoint directory (relative to this notebook) shared by the tokenizer and the model.
model_name = '../Training/saved_models/Salesforce/instructcodet5p-16b'
# Tokenizer is loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Load the checkpoint in half precision and let `device_map='auto'` decide
# where the weights are placed.
# NOTE(review): `trust_remote_code=True` executes the checkpoint's custom
# modeling code (configuration_codet5p.py / modeling_codet5p.py per the cell
# output below); consider pinning a revision as that output suggests.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map='auto',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
model.tie_weights()
# Switch to inference mode; as the cell's last expression this also displays
# the module tree.
model.eval()

Downloading (…)iguration_codet5p.py:   0%|          | 0.00/4.07k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/instructcodet5p-16b:
- configuration_codet5p.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading modeling_codet5p.py:   0%|          | 0.00/43.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/instructcodet5p-16b:
- modeling_codet5p.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

CodeT5pEncoderDecoderModel(
  (encoder): CodeT5pModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeT5pBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeT5pAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeT5pMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): CodeT5pForCausalLM(
    (transformer): CodeT5pModel(
      

In [8]:
def embedding_docu_with_decoder(model, tokenizer, prompt, docs,
                                chunk_size=5000, chunk_overlap=300,
                                cross_attn_layer=33):
    """Fold a long documentation string into prompt-conditioned hidden states.

    The documentation is split into overlapping character chunks. The first
    chunk is encoded, projected with ``model.enc_to_dec_proj`` and fed to the
    full decoder transformer together with the prompt. Every further chunk is
    encoded and projected the same way, then merged into the running prompt
    states by calling the ``crossattention`` sub-module of one decoder block
    directly.

    Args:
        model: Encoder-decoder model exposing ``encoder``, ``enc_to_dec_proj``
            and ``decoder.transformer`` (with per-layer ``crossattention``).
        tokenizer: Tokenizer matching ``model``.
        prompt: Fully formatted instruction prompt.
        docs: Documentation text to embed.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.
        cross_attn_layer: Index into ``model.decoder.transformer.h`` of the
            block whose ``crossattention`` module merges later chunks.

    Returns:
        The encoder output object of the last processed chunk, with
        ``past_key_values`` cleared and ``last_hidden_state`` replaced by the
        fused prompt states.

    Raises:
        ValueError: If ``docs`` yields no chunks (e.g. an empty string).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    all_splits = splitter.split_text(docs)
    if not all_splits:
        raise ValueError("docs produced no text chunks; nothing to encode")

    prompt_tok = tokenizer(prompt, return_tensors="pt").to(device)
    cross_attention = model.decoder.transformer.h[cross_attn_layer].crossattention

    # Inference only: do not build an autograd graph across the chunk passes.
    with torch.no_grad():
        f_split_tok = tokenizer(all_splits[0], return_tensors="pt").to(device)
        # Keep the encoder output object itself: it is the return value when
        # the documentation fits in a single chunk (previously the returned
        # name was only bound inside the loop below, so that case crashed).
        encoder_out = model.encoder(**f_split_tok)
        encoding_split = model.enc_to_dec_proj(encoder_out.last_hidden_state)
        encoding_prompt = model.decoder.transformer(
            input_ids=prompt_tok["input_ids"],
            attention_mask=prompt_tok["attention_mask"],
            encoder_hidden_states=encoding_split,
            encoder_attention_mask=f_split_tok["attention_mask"],
        ).last_hidden_state

        for split in all_splits[1:]:
            split_tok = tokenizer(split, return_tensors="pt").to(device)
            encoder_out = model.encoder(**split_tok)
            encoding_split = model.enc_to_dec_proj(encoder_out.last_hidden_state)
            # NOTE(review): `hidden_states` here are the prompt states, yet the
            # mask passed as `attention_mask` belongs to the first document
            # chunk. Kept as in the original; confirm against the model's
            # attention implementation whether the prompt mask was intended.
            # NOTE(review): only the cross-attention sub-module of the block is
            # invoked; the recorded output of this experiment is degenerate
            # text, so the fusion scheme itself likely needs revisiting.
            encoding_prompt = cross_attention(
                hidden_states=encoding_prompt,
                attention_mask=f_split_tok.attention_mask,
                encoder_hidden_states=encoding_split,
                encoder_attention_mask=split_tok.attention_mask,
            )[0]

    encoder_out.past_key_values = None
    encoder_out.last_hidden_state = encoding_prompt
    return encoder_out
        
# Try the decoder-based fusion on the first test example.
prompt_type = 'general'
prompt = PROMPT[prompt_type]

inst = datasets[0]
d = inst["input"]
p = prompt.format(instruction=inst["instruction"])

encoder_outputs = embedding_docu_with_decoder(model, tokenizer, p, d)
prompt_tok = tokenizer(p, return_tensors="pt").to(device)

# Overwrites the encoder config's hidden size with the decoder's before
# generating (presumably because the supplied encoder states were already
# projected to the decoder width -- TODO confirm).
model.encoder.config.hidden_size = model.decoder.config.hidden_size

# The prompt is passed as the decoder prefix, so the decoded text starts with it.
outputs = model.generate(
    encoder_outputs=encoder_outputs,
    decoder_input_ids=prompt_tok.input_ids,
    decoder_attention_mask=prompt_tok.attention_mask,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

"Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Create a Python program that retrieves the metadata of a file on Google Drive using the PyDrive2 API. The program should take the file ID as input and display the metadata information of the file, including its title, size, creation date, and last modified date. Make sure the program is capable of handling cases where the file ID provided is invalid or when there is an authentication issue.

### Response: Find geop frank cit Pic frank gap frank bas Loc frank gr frank bas Loc frank cit Pic ge frank bas Loc ge frank cit Pic ge frank bas Loc ge frank cit frank bas Loc ge frank cit frank bas frank bas Loc ge frank cit frank cit ge


In [9]:
def embedding_docu_with_encoder(model, tokenizer, prompt, docs,
                                chunk_size=5000, chunk_overlap=300,
                                cross_attn_layer=33):
    """Fold a long documentation string and the prompt into encoder-side states.

    Both the prompt and every documentation chunk are run through
    ``model.encoder`` and projected with ``model.enc_to_dec_proj``. The first
    chunk attends to the prompt through the ``crossattention`` sub-module of
    one decoder block; the result then attends to each further chunk in turn
    through the same sub-module.

    Args:
        model: Encoder-decoder model exposing ``encoder``, ``enc_to_dec_proj``
            and ``decoder.transformer`` (with per-layer ``crossattention``).
        tokenizer: Tokenizer matching ``model``.
        prompt: Fully formatted instruction prompt.
        docs: Documentation text to embed.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.
        cross_attn_layer: Index into ``model.decoder.transformer.h`` of the
            block whose ``crossattention`` module performs the merging.

    Returns:
        The encoder output object of the last processed chunk, with
        ``last_hidden_state`` replaced by the fused states.

    Raises:
        ValueError: If ``docs`` yields no chunks (e.g. an empty string).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    all_splits = splitter.split_text(docs)
    if not all_splits:
        raise ValueError("docs produced no text chunks; nothing to encode")

    cross_attention = model.decoder.transformer.h[cross_attn_layer].crossattention

    # Inference only: do not build an autograd graph across the chunk passes.
    with torch.no_grad():
        prompt_tok = tokenizer(prompt, return_tensors="pt").to(device)
        encoding_prompt = model.encoder(**prompt_tok)
        encoding_prompt = model.enc_to_dec_proj(encoding_prompt.last_hidden_state)

        f_split_tok = tokenizer(all_splits[0], return_tensors="pt").to(device)
        # Keep the encoder output object itself: it is the return value when
        # the documentation fits in a single chunk (previously the returned
        # name was only bound inside the loop below, so that case crashed).
        encoder_out = model.encoder(**f_split_tok)
        encoding_split = model.enc_to_dec_proj(encoder_out.last_hidden_state)
        # First step: the chunk states are the queries and the prompt states
        # are attended to, so the running states take the first chunk's length.
        encoding_prompt = cross_attention(
            hidden_states=encoding_split,
            attention_mask=f_split_tok.attention_mask,
            encoder_hidden_states=encoding_prompt,
            encoder_attention_mask=prompt_tok.attention_mask,
        )[0]

        for split in all_splits[1:]:
            split_tok = tokenizer(split, return_tensors="pt").to(device)
            encoder_out = model.encoder(**split_tok)
            encoding_split = model.enc_to_dec_proj(encoder_out.last_hidden_state)
            # NOTE(review): only the cross-attention sub-module of the block is
            # invoked; the recorded output of this experiment is degenerate
            # text, so the fusion scheme itself likely needs revisiting.
            encoding_prompt = cross_attention(
                hidden_states=encoding_prompt,
                attention_mask=f_split_tok.attention_mask,
                encoder_hidden_states=encoding_split,
                encoder_attention_mask=split_tok.attention_mask,
            )[0]

    encoder_out.last_hidden_state = encoding_prompt
    return encoder_out
        
# Try the encoder-based fusion on the first test example.
prompt_type = 'general'
prompt = PROMPT[prompt_type]

inst = datasets[0]
d = inst["input"]
p = prompt.format(instruction=inst["instruction"])

encoder_outputs = embedding_docu_with_encoder(model, tokenizer, p, d)
prompt_tok = tokenizer(p, return_tensors="pt").to(device)

# Overwrites the encoder config's hidden size with the decoder's before
# generating (presumably because the supplied encoder states were already
# projected to the decoder width -- TODO confirm).
model.encoder.config.hidden_size = model.decoder.config.hidden_size

# The prompt is passed as the decoder prefix, so the decoded text starts with it.
outputs = model.generate(
    encoder_outputs=encoder_outputs,
    decoder_input_ids=prompt_tok.input_ids,
    decoder_attention_mask=prompt_tok.attention_mask,
    max_new_tokens=max_len,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

"Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Create a Python program that retrieves the metadata of a file on Google Drive using the PyDrive2 API. The program should take the file ID as input and display the metadata information of the file, including its title, size, creation date, and last modified date. Make sure the program is capable of handling cases where the file ID provided is invalid or when there is an authentication issue.

### Response:enHaUk routes bits� proc				 ardu phantom routes procaciaci ferry proc				 procaciaci ge picture routesaci frankaci geop Procaci proc greenhouse Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci Procaci ProcaciHa procaciaciaci frankaciaci procaciaci proc phantom proc phantom procaciaci frankaci Procaci Procaci frankaci frankaci Pro