## Fine-tune with a Custom Dataset

In [None]:
import re
import requests
import torch
import json

from tqdm import tqdm
from bs4 import BeautifulSoup
from pprint import pprint

import intel_extension_for_pytorch as ipex
import transformers

from transformers import AutoTokenizer
from peft import LoraConfig
from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, PeftModel
from bigdl.llm.transformers import AutoModelForCausalLM
from datasets import load_dataset

<img src="imgs/finetune_pipe.png" width="800" />

# Custom Data Preparation

### 1. Web Data Scraping

In [None]:
# Download the OpenVINO toolkit support landing page. Its HTML is searched for
# support-article paths in the next step.
# requests applies no timeout by default, so set one to avoid hanging the
# notebook on a stalled connection, and fail loudly on an HTTP error instead of
# silently scraping an error page.
page = requests.get(
    "https://www.intel.com/content/www/us/en/support/products/96066/software/development-software/openvino-toolkit.html",
    timeout=30,
)
page.raise_for_status()

In [None]:
# Find every quoted, site-relative support-article path in the landing page
# HTML ("/content/www/us/en/support/articles/<digits>/<slug>") and expand each
# one into an absolute article URL.
# The pattern is a raw string so that "\w" reaches the regex engine as-is; in a
# plain string literal it is an invalid escape sequence that newer Python
# versions warn about.
articles = re.findall(r'"/content/www/us/en/support/articles/[0-9]+/\w+"', page.text)

links = []
seen_links = set()
for article in articles:
    # Drop the surrounding double quotes captured by the pattern.
    article = article.replace("\"", "")
    link = f"https://www.intel.com{article}.html"
    # The same article can be referenced several times on the page; keep only
    # the first occurrence so it is not downloaded and used twice.
    if link in seen_links:
        continue
    seen_links.add(link)
    links.append(link)
pprint(links[:10])

In [None]:
# Download the first 20 article pages and extract, for each one, the title,
# the summary and the resolution text.
contents = []
pages = []

# Only the first 20 articles are parsed below, so only those are downloaded.
# Distinct loop names are used so that the landing page stored in `page` by the
# earlier cell is not overwritten.
for link in tqdm(links[:20]):
    # requests applies no timeout by default; set one so a stalled connection
    # cannot hang the notebook.
    response = requests.get(link, timeout=30)
    pages.append(response)

for article_page in tqdm(pages):
    soup = BeautifulSoup(article_page.text, "html.parser")
    heading = soup.find('h1')
    article = soup.find('div', { 'class': 'article-content' })
    intro = article.find('div', { 'class': 'article-intro' }) if article is not None else None
    resolution = article.find('div', { 'id': 'resolution' }) if article is not None else None

    # Pages that failed to download or do not follow the expected layout lack
    # one of these elements; skip them instead of aborting the whole scrape.
    if heading is None or intro is None or resolution is None:
        tqdm.write(f"Skipping {article_page.url}: unexpected page layout")
        continue

    title = heading.text
    # Remove the "Summary" label that is part of the intro block's text.
    summary = intro.get_text(strip=True).replace("Summary", "")
    # Cut the leading "Resolution" label plus the character that follows it,
    # then normalise non-breaking spaces and drop newlines.
    content = resolution.get_text()[len("Resolution")+1:].replace("\xa0", " ").replace("\n","")
    contents.append({ "title": title, "summary": summary, "content": content})

pprint(contents, width=240, indent=4)

### 2. Generate Synthetic Dataset

In [None]:
# Load the Mistral instruct checkpoint (requesting 4-bit loading) together with
# its tokenizer, then move the model to the "xpu" device.
model_path = "mistralai/Mistral-7B-Instruct-v0.1"
device = "xpu"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_4bit=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)
model = model.to(device)

### 3. Using an LLM to Generate More Questions

In [None]:
# MISTRAL_PROMPT_FORMAT = """<s>[INST]
# Turn the following sentence into a question.
# {sentence}
# [/INST]"""

# content = contents[0]
# sentence = content['title']

# results = []
# for content in tqdm(contents):
#     sentence = content['title']
#     with torch.inference_mode():
#         prompt = MISTRAL_PROMPT_FORMAT.format(sentence=sentence)
#         input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
#         output = model.generate(input_ids, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)
#         torch.xpu.synchronize()
#         output = output.cpu()
#         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
#         result = output_str.split("[/INST]")

#         lines = result[1].strip().split("\n")
#         results.append({ "question": lines[0], "answer": content["content"] })
# results

In [None]:
# Prompt template in the [INST] ... [/INST] layout; {sentence} is replaced by
# an article title.
MISTRAL_PROMPT_FORMAT = """<s>[INST]
Turn the following sentence into 3 different questions.
{sentence}
[/INST]"""

# Leading list marker of a generated line, e.g. "1.", "2)", "10." or "-".
# Stripping by pattern instead of a fixed two-character slice keeps lines that
# carry no marker intact and handles markers of any width.
LIST_MARKER_PATTERN = re.compile(r"^\s*(?:\d+[.)]|[-*])\s*")

# Build question/answer pairs: each generated question is paired with the
# resolution text of the article whose title it was derived from.
results = []
for content in tqdm(contents[:10]):
    sentence = content['title']
    prompt = MISTRAL_PROMPT_FORMAT.format(sentence=sentence)
    with torch.inference_mode():
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        output = model.generate(input_ids, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)
        torch.xpu.synchronize()
        output = output.cpu()
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text echoes the prompt; the model's answer is the part that
    # follows the closing [/INST] tag.
    generated = output_str.split("[/INST]")[1]

    for line in generated.strip().split("\n"):
        question = LIST_MARKER_PATTERN.sub("", line).strip()
        # Blank separator lines would otherwise become empty questions.
        if not question:
            continue
        results.append({ "question": question, "answer": content["content"] })
results

In [None]:
import os
import pickle

# Persist the generated question/answer pairs so later steps can reload them
# without re-running the scraping and generation cells.
output_path = 'outputs/custom_data.pickle'
# open() does not create missing parent directories; make sure it exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Notices & Disclaimers 

Intel technologies may require enabled hardware, software or service activation. 

No product or component can be absolutely secure.  

Your costs and results may vary.  

No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), Open Source Initiative. No rights are granted to create modifications or derivatives of this document. 

© Intel Corporation.  Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries.  Other names and brands may be claimed as the property of others.  