# Transform PDF to TXT

In [None]:
!pip install PyPDF2
!pip install pycryptodome


In [None]:
from google.colab import drive
import os
import PyPDF2

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

search_path = '/content/drive/My Drive/Files'

# Counted lines after which the accumulated text is written out as one part
MAX_LINES_PER_PART = 500


def save_to_file(text, part, company_name):
    """Write one part of a converted document into ``search_path``.

    Defined once at module level so it also exists for PDFs that never reach
    the part limit (the final flush below always needs it).

    Args:
        text: Text to write.
        part: Part number used in the output file name.
        company_name: PDF file name without extension, used as name prefix.
    """
    file_name = f"{company_name} part {part}.txt"
    out_path = os.path.join(search_path, file_name)
    with open(out_path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)
    print(f"Text file for {company_name} part {part} saved successfully.")


# Get a list of all PDF files in the specified folder
pdf_files = [f for f in os.listdir(search_path) if f.endswith('.pdf')]

# Loop through each PDF file
for filename in pdf_files:
    # Extract company name from filename
    company_name = os.path.splitext(filename)[0]
    file_path = os.path.join(search_path, filename)

    # Every part starts with this header
    header = f"Source Document: {company_name}\n\n"

    # Counter for parts and line count
    part_counter = 1
    line_counter = 0
    formatted_text = header

    # Open PDF file
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)

        # Iterate through each page (page numbers in the headings are 1-based)
        for page_number, page in enumerate(reader.pages, start=1):
            # Add the section heading to the formatted text
            formatted_text += f"### Section: Page {page_number}\n"
            line_counter += 1

            # Treat a page without extractable text as empty
            text = page.extract_text() or ""

            # Split the text into lines and iterate through each line
            for line in text.split('\n'):
                formatted_text += line + '\n'
                line_counter += 1

                # Flush the current part once the line limit is reached
                if line_counter >= MAX_LINES_PER_PART:
                    save_to_file(formatted_text, part_counter, company_name)
                    part_counter += 1
                    formatted_text = header
                    # The fresh header already accounts for two lines
                    line_counter = 2

            # Add a delimiter between sections
            formatted_text += '\n'
            line_counter += 1

    # Save any remaining text; skip a part that holds nothing but the header
    if formatted_text.strip() != header.strip():
        save_to_file(formatted_text, part_counter, company_name)


# Split text files into smaller parts to reduce the number of tokens


In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

search_path = '/content/drive/My Drive/Files'

# Collect every .txt file currently present in the folder
text_files = [name for name in os.listdir(search_path) if name.endswith('.txt')]


def save_to_file(text, part, company_name):
    """Write `text` to "<company_name> part <part>.txt" inside search_path."""
    target_path = os.path.join(search_path, f"{company_name} part {part}.txt")
    with open(target_path, 'w', encoding='utf-8') as out:
        out.write(text)
    print(f"Text file for {company_name} part {part} saved successfully.")


# Split each text file into parts of 100 lines
for filename in text_files:
    # The file name without extension serves as the company name
    company_name = os.path.splitext(filename)[0]
    file_path = os.path.join(search_path, filename)

    # State of the part currently being accumulated
    part_counter = 1
    line_counter = 0
    formatted_text = ""

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            formatted_text = formatted_text + line
            line_counter = line_counter + 1

            # Keep accumulating until 100 lines are collected
            if line_counter < 100:
                continue

            save_to_file(formatted_text, part_counter, company_name)
            part_counter += 1
            formatted_text = ""
            line_counter = 0

    # Write whatever is left over, unless it is only whitespace
    if formatted_text.strip():
        save_to_file(formatted_text, part_counter, company_name)


# Generate Questions About Document

In [None]:
pip install openai==0.28

In [None]:
from google.colab import drive
import os

drive.mount('/content/drive', force_remount=True)

In [None]:
OPENAI_API_KEY=''


In [None]:
!pip install python-dotenv

In [None]:
import os
import pandas as pd
import openai
from tqdm import tqdm
import dotenv
from dotenv import load_dotenv
import re  # For regular expressions

# Load environment variables
load_dotenv()

# openai==0.28 reads OPENAI_API_KEY from the environment when the module is
# imported, which happens before load_dotenv() runs, so the key has to be
# handed over explicitly. Falls back to the notebook-level OPENAI_API_KEY
# variable if that cell was executed and filled in.
openai.api_key = (
    os.getenv("OPENAI_API_KEY")
    or globals().get("OPENAI_API_KEY")
    or openai.api_key
)

def split_numbered_questions(generated_text):
    """Split a numbered list ("1. ...", "2) ...") into its individual items.

    Only markers at the start of a line are treated as separators, so numbers
    inside a question (e.g. "... in 2023. What ...") do not cut it in two.
    Text before the first marker is treated as preamble and dropped.

    Returns:
        list[str]: The non-empty items, stripped of surrounding whitespace.
    """
    items = re.split(r'(?m)^\s*\d+[.)]\s+', generated_text)
    return [item.strip() for item in items[1:] if item.strip()]


def generate_questions(text, model="gpt-4-0125-preview"):
    """
    Generates a list of questions based on the provided text using OpenAI's Chat API.

    Args:
        text: Source text sent as the user message.
        model: Name of the chat model to query.

    Returns:
        list[str]: The questions parsed from the numbered list in the reply;
        an empty list when the request or the parsing fails (the error is
        printed, not raised).
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert at creating multiple questions based on materials and documentation. Generate a list of 10 questions from the provided text."},
                {"role": "user", "content": text}
            ]
        )
        generated_text = response.choices[0].message['content'].strip()
        # Split the generated text into individual questions
        return split_numbered_questions(generated_text)
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

directory_path = './drive/My Drive/Files'

data = []

# Generate questions for every text file and collect one row per question
for filename in tqdm(os.listdir(directory_path), desc='Processing files'):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(directory_path, filename)
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text_content = file.read().strip()
            for question in generate_questions(text_content):
                # Collapse the question onto a single line before prefixing it
                single_line_question = question.replace('\n', ' ').strip()
                instruction = "You are a helpful assistant. Answer this question: " + single_line_question
                data.append({'Instruction': instruction, 'Input': text_content})
    except Exception as e:
        print(f"Error occurred while processing file: {file_path}\n{e}")

# One row per generated question, paired with the text it was generated from
df = pd.DataFrame(data)

print(df)


In [None]:
df["Instruction"][200]

In [None]:
df

In [None]:
from huggingface_hub import login
login()

In [None]:
from datasets import Dataset

# Convert the DataFrame into a Dataset so it can be pushed to the Hub
dataset = Dataset.from_pandas(df)

dataset.push_to_hub("AnonymousAuthorICAIF24/Instruction_Input_dataset_08_04")

# Generate Answers and Create Training Dataset

In [None]:
pip install openai==0.28

In [None]:
pip install datasets

In [None]:
import openai

openai.api_key = ''

In [None]:
import pandas as pd
from datasets import load_dataset
import openai

# Load the dataset from Hugging Face
dataset_name = 'AnonymousAuthorICAIF24/Instruction_Input_dataset_08_04'
dataset = load_dataset(dataset_name)
df = pd.DataFrame(dataset['train'])

# Define the function to generate text
def generate_text(prompt, model="gpt-4-0125-preview"):
    """Ask the chat model to answer `prompt` and return the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.

    Returns:
        str: The stripped reply, or the fixed string
        "Error generating response." when anything goes wrong (the error is
        printed, not raised).
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that provides detailed, clear, and accurate answers."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = openai.ChatCompletion.create(model=model, messages=chat_messages)
        answer = completion['choices'][0]['message']['content']
        return answer.strip()
    except Exception as e:
        # Report the failing prompt to ease debugging
        print(f"An error occurred with prompt: {prompt}\nError: {e}")
        return "Error generating response."

# Generated answers, in DataFrame row order
answers = []

# Build one prompt per row from its text and question, then query the model
for _, record in df.iterrows():
    question_text = record['Instruction']
    source_text = record['Input']
    prompt = f"Please provide a detailed, clear, and accurate answer for this question based on the text: \"{source_text}\". Question: {question_text}"
    answers.append(generate_text(prompt, model="gpt-4-0125-preview"))  # Switch model if needed

# Store the answers as a new column
df['Output'] = answers




In [None]:
df['Output'][200]

In [None]:
df

In [None]:
from huggingface_hub import login
login()

In [None]:
from datasets import Dataset

# Convert the DataFrame into a Dataset so it can be pushed to the Hub
dataset = Dataset.from_pandas(df)

In [None]:
dataset.push_to_hub("AnonymousAuthorICAIF24/GPT-QA-V2_08_04")

# Create dataset for training the model

In [None]:
!pip install transformers trl accelerate torch bitsandbytes peft datasets -qU
!pip install flash-attn --no-build-isolation

In [None]:
from huggingface_hub import login
login()

In [None]:
from datasets import load_dataset
dataset = load_dataset("AnonymousAuthorICAIF24/GPT-QA-V2_08_04", split="train")
dataset
df = dataset.to_pandas()
df.head(10)

In [None]:
def generate_prompt(data_point):
    """Build the training prompt text for one dataset record.

    :param data_point: dict: record with "Instruction", "Input" and "Output" keys
    :return: str: prompt text; the context sentence is included only when
        "Input" is non-empty
    """
    # NOTE(review): the doubled backslashes below produce literal backslash
    # characters in the prompt, not newlines - confirm this is intended.
    prefix_text = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n'
    context = data_point['Input']
    instruction = data_point["Instruction"]
    answer = data_point["Output"]

    # Samples without additional context information
    if not context:
        return f"""<s>[INST]{prefix_text} {instruction} [/INST] </s> \\\\n <s> {answer} </s>"""

    # Samples with additional context information
    return f"""<s>[INST]{prefix_text} {instruction} Base the answer according to the text:  {context} [/INST]</s> \\\\n <s>{answer}</s>"""
# Build the prompt for every record and attach it as the "prompt" column
text_column = list(map(generate_prompt, dataset))
dataset = dataset.add_column("prompt", text_column)

In [None]:
dataset["prompt"][1]

In [None]:
from transformers import AutoTokenizer

# No earlier cell defines `base_model`; set it here so this cell runs in
# notebook order. Same identifier as used in the training section.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)


In [None]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

In [None]:
# `dataset` was loaded with split="train", so it already is the training
# split; indexing it with "train" would be a (failing) column lookup.
train_data = dataset

# Train Mistral 7B Instruct v0.2


In [None]:
# base model from huggingFace or path to model
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
new_model = "Fine-Tuned_Mistral-Instruct-V2_06-05"

dataset_name ="AnonymousAuthorICAIF24/GPT-QA-V2_08_04"

In [None]:
%%capture
!pip install -U bitsandbytes
!pip install transformers==4.36.2
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install datasets==2.16.0
!pip install sentencepiece

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from datasets import load_dataset
from trl import SFTTrainer
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset
import re


In [None]:
from huggingface_hub import login

# Authenticate interactively. Never keep an access token in the notebook
# source; the token that used to be embedded in this cell should be revoked.
login()

In [None]:
dataset = load_dataset(dataset_name, split="train")
dataset
df = dataset.to_pandas()
df.head(10)

In [None]:
# Load base model
# 4-bit quantization settings handed to from_pretrained below
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
# Quantization is requested only through `quantization_config`: passing
# `load_in_4bit=True` alongside it is redundant and is rejected by newer
# transformers releases.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)


model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Display the special tokens as the cell output
tokenizer.bos_token, tokenizer.eos_token


In [None]:
def generate_prompt(data_point):
    """Format one record as an instruction/response training string.

    Only the "Instruction" and "Output" fields of `data_point` are used.
    """
    instruction = data_point["Instruction"]
    answer = data_point["Output"]
    # NOTE(review): the doubled backslashes produce literal backslash
    # characters in the prompt, not a newline - confirm this is intended.
    return f"""<s>[INST] {instruction} [/INST] </s> \\\\n <s> {answer} </s>"""
# Build the prompt for every record and attach it as the "prompt" column
text_column = list(map(generate_prompt, dataset))
dataset = dataset.add_column("prompt", text_column)

In [None]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

In [None]:
train_data = dataset

In [None]:
dataset

In [None]:
train_data

In [None]:
# count training tokens
# The dataset was already tokenized by the `dataset.map(...)` cell above, so
# sum the lengths of its `input_ids` entries. Tokenizing the string dump of the
# whole DataFrame would also count the other columns (including the
# `input_ids` / `attention_mask` numbers themselves) and inflate the result.
num_training_tokens = sum(len(ids) for ids in train_data["input_ids"])
num_training_tokens

In [None]:
# Prepare the quantized model for training, then attach the LoRA adapters
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # Projection layers that receive an adapter
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

model = get_peft_model(model, peft_config)

In [None]:
# Hyperparameters
training_arguments = TrainingArguments(
    # Output and bookkeeping
    output_dir="./results",
    save_steps=50,
    logging_steps=1,
    # Training length and batching
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    # Mixed precision
    fp16=True,
    bf16=False,
)


In [None]:
# Supervised fine-tuning trainer; it trains on the "prompt" text column
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_data,
    dataset_text_field="prompt",
    max_seq_length=None,
    packing=False,
)

In [None]:
# NOTE(review): this only binds a Python variable; it does not set the
# PYTORCH_NO_CUDA_MEMORY_CACHING environment variable - confirm whether
# os.environ was intended.
PYTORCH_NO_CUDA_MEMORY_CACHING=1

In [None]:
trainer.train()

In [None]:
# Save the fine-tuned model (the adapter)
trainer.model.save_pretrained(new_model)
model.config.use_cache = True
model.eval()

In [None]:
trainer.model.push_to_hub(new_model)

# Test model

In [None]:
%%capture
!pip install -U bitsandbytes
!pip install transformers==4.36.2
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install datasets==2.16.0
!pip install sentencepiece

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from datasets import load_dataset
from trl import SFTTrainer
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset
import re


In [None]:
from huggingface_hub import login
login()

In [None]:
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
new_model = "AnonymousAuthorICAIF24/Fine-Tuned_Mistral-Instruct-V2_06-05"

# Options used to load the base model before the adapter is attached
base_model_load_options = {
    "torch_dtype": torch.bfloat16,
    "return_dict": True,
    "low_cpu_mem_usage": True,
    "device_map": "auto",
    "trust_remote_code": True,
}
base_model_reload = AutoModelForCausalLM.from_pretrained(base_model, **base_model_load_options)

# Attach the fine-tuned adapter to the base model
model = PeftModel.from_pretrained(base_model_reload, new_model)

# The tokenizer comes from the base model
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)



In [None]:
logging.set_verbosity(logging.CRITICAL)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=2000)

In [None]:
def build_prompt(question):
  """Wrap `question` in the instruction prompt used to query the model."""
  instruction = "You are a helpful assistant. Make direct answers with good explanations. Do not lie and if you do not know the answer, say you do not know. Answer this question:"
  return f"<s>[INST] {instruction} {question} [/INST]"

In [None]:
# Evaluation questions, asked one after another through the text-generation
# pipeline (replaces 25 copy-pasted cells that differed only in the question).
test_questions = [
    "What are the five credit rating levels that can be assigned for tied assets? Explain each level.",
    "What are the primary investment principles for total assets?",
    "What are the rules for investment in cash deposits?",
    "How are bonds and convertible bonds treated?",
    "What specific limits are placed on investments in foreign currencies?",
    "What are high-risk investments for insurance companies?",
    "How are investments in high-risk assets regulated?",
    "What are the consequences for failing to comply with the guidelines?",
    "How should claims of non-life insurers against reinsurers be handled?",
    "What is the protocol for the inclusion of new types of investments?",
    "What is the principle of diversification within tied assets?",
    "What limits are set for investments in equities and equity securities?",
    "What are the specific limitations for investing in real estate within tied assets?",
    "What are synthetic bonds and how are they used?",
    "How should insurance companies manage and report structured product investments?",
    "What derivatives can be used to hedge credit risk of asset portfolio?",
    "How should liquidity be maintained when using derivatives in investment strategies?",
    "What specific provisions must be followed when dealing with tied assets in relation to unit-linked life insurance policies?",
    "Explain the criteria and process for allocating an investment to tied assets.",
    "How are claims of non-life insurers against reinsurers treated in the context of tied assets?",
    "What are the limitations and conditions for credit exposure to counterparties within tied assets?",
    "How are mortgage receivables treated under the tied assets regulations?",
    "How do regulations ensure that the claims of the insured are prioritized in the event of an insurer's insolvency?",
    "What are the penalties for violating the tied asset regulations?",
    "Describe the process for the annual evaluation of the insurance company's overall investment portfolio including tied assets.",
]

for question in test_questions:
    prompt = build_prompt(question)
    result = pipe(prompt)

    print(result[0]['generated_text'])

# RAG Implementation


In [None]:
!nvidia-smi

In [None]:
!pip install pypdf
!pip install python-dotenv
!pip -q install git+https://github.com/huggingface/transformers
!pip install -q datasets loralib sentencepiece
!pip install -q einops accelerate langchain bitsandbytes
!pip install sentence_transformers
!pip install llama-index
%pip install llama-index-llms-huggingface

In [None]:
!pip install -U bitsandbytes
!pip install -U peft

In [None]:
%pip install llama-index-embeddings-langchain
%pip install -U langchain-community

In [None]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))


from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

In [None]:
from google.colab import drive
import os

drive.mount('/content/drive', force_remount=True)

In [None]:
# load documents
documents = SimpleDirectoryReader("./drive/My Drive/Files").load_data()


In [None]:
documents

In [None]:
from llama_index.core import PromptTemplate
# System instruction handed to HuggingFaceLLM as `system_prompt`
system_prompt = "You are a helpful assistant. Make direct answers with good explanations. Do not lie and if you do not know the answer, say you do not know. Answer this question:"
# This will wrap the default prompts that are internal to llama-index
# NOTE(review): these <|USER|>/<|ASSISTANT|> markers differ from the
# [INST] ... [/INST] markers used by build_prompt and the training prompts in
# this notebook - confirm they are intended for the Mistral model.
query_wrapper_prompt = "<|USER|>{query_str}<|ASSISTANT|>"

In [None]:
from huggingface_hub import login
login()

In [None]:
import torch
torch.set_default_device('cuda')

In [None]:
# NOTE(review): this only binds a Python variable; it does not set the
# PYTORCH_NO_CUDA_MEMORY_CACHING environment variable - confirm whether
# os.environ was intended.
PYTORCH_NO_CUDA_MEMORY_CACHING=1

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
adapter = "AnonymousAuthorICAIF24/Fine-Tuned_Mistral-Instruct-V2_06-05"

# 4-bit quantization settings for the base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model, then attach the fine-tuned adapter to it
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
)
model = PeftModel.from_pretrained(model, adapter)

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


In [None]:
# LLM wrapper around the already loaded model (base model + adapter) and tokenizer
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch.float16},
    device_map="auto",
)

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding


embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

In [None]:
from llama_index.core import Settings

# ServiceContext is deprecated since llama-index 0.10 and rejected by later
# releases (the notebook installs llama-index unpinned). The same values are
# configured through the global Settings object instead.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024

# Build the vector index over the loaded documents and a query engine on top
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [None]:
# Evaluation questions, asked one after another through the RAG query engine
# (replaces 25 copy-pasted cells that differed only in the question).
rag_questions = [
    "What are the five credit rating levels that can be assigned for tied assets? Explain each level.",
    "What are the primary investment principles for total assets?",
    "What are the rules for investment in cash deposits?",
    "How are bonds and convertible bonds treated?",
    "What specific limits are placed on investments in foreign currencies?",
    "What are high-risk investments for insurance companies?",
    "How are investments in high-risk assets regulated?",
    "What are the consequences for failing to comply with the guidelines?",
    "How should claims of non-life insurers against reinsurers be handled?",
    "What is the protocol for the inclusion of new types of investments?",
    "What is the principle of diversification within tied assets?",
    "What limits are set for investments in equities and equity securities?",
    "What are the specific limitations for investing in real estate within tied assets?",
    "What are synthetic bonds and how are they used?",
    "How should insurance companies manage and report structured product investments?",
    "What derivatives can be used to hedge credit risk of asset portfolio?",
    "How should liquidity be maintained when using derivatives in investment strategies?",
    "What specific provisions must be followed when dealing with tied assets in relation to unit-linked life insurance policies?",
    "Explain the criteria and process for allocating an investment to tied assets.",
    "How are claims of non-life insurers against reinsurers treated in the context of tied assets?",
    "What are the limitations and conditions for credit exposure to counterparties within tied assets?",
    "How are mortgage receivables treated under the tied assets regulations?",
    "How do regulations ensure that the claims of the insured are prioritized in the event of an insurer's insolvency?",
    "What are the penalties for violating the tied asset regulations?",
    "Describe the process for the annual evaluation of the insurance company's overall investment portfolio including tied assets.",
]

for question in rag_questions:
    response = query_engine.query(question)
    print(response)

Answers from the untrained (base) model:

In [None]:
# LLM wrapper that loads the base model and tokenizer by name (no adapter)
llm = HuggingFaceLLM(
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=2048,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch.float16},
    device_map="auto",
)

In [None]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))


from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding


embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

In [None]:
from llama_index.core import Settings

# ServiceContext is deprecated since llama-index 0.10 and rejected by later
# releases (the notebook installs llama-index unpinned). The same values are
# configured through the global Settings object instead.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024

# Build the vector index over the loaded documents and a query engine on top
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [None]:
# Evaluation questions, asked one after another through the RAG query engine
# (replaces 25 copy-pasted cells that differed only in the question).
rag_questions = [
    "What are the five credit rating levels that can be assigned for tied assets? Explain each level.",
    "What are the primary investment principles for total assets?",
    "What are the rules for investment in cash deposits?",
    "How are bonds and convertible bonds treated?",
    "What specific limits are placed on investments in foreign currencies?",
    "What are high-risk investments for insurance companies?",
    "How are investments in high-risk assets regulated?",
    "What are the consequences for failing to comply with the guidelines?",
    "How should claims of non-life insurers against reinsurers be handled?",
    "What is the protocol for the inclusion of new types of investments?",
    "What is the principle of diversification within tied assets?",
    "What limits are set for investments in equities and equity securities?",
    "What are the specific limitations for investing in real estate within tied assets?",
    "What are synthetic bonds and how are they used?",
    "How should insurance companies manage and report structured product investments?",
    "What derivatives can be used to hedge credit risk of asset portfolio?",
    "How should liquidity be maintained when using derivatives in investment strategies?",
    "What specific provisions must be followed when dealing with tied assets in relation to unit-linked life insurance policies?",
    "Explain the criteria and process for allocating an investment to tied assets.",
    "How are claims of non-life insurers against reinsurers treated in the context of tied assets?",
    "What are the limitations and conditions for credit exposure to counterparties within tied assets?",
    "How are mortgage receivables treated under the tied assets regulations?",
    "How do regulations ensure that the claims of the insured are prioritized in the event of an insurer's insolvency?",
    "What are the penalties for violating the tied asset regulations?",
    "Describe the process for the annual evaluation of the insurance company's overall investment portfolio including tied assets.",
]

for question in rag_questions:
    response = query_engine.query(question)
    print(response)