In [7]:
# Cell 1: Import libraries and load templates & insurance datasets

import random
import re
import pandas as pd
import torch

from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util  # for embedding rows

# Load the final templates CSV (generated previously)
templates_path = r"C:\Users\lathe\Desktop\rag model training\final_templates.csv"
templates_df = pd.read_csv(templates_path)
print("Final Templates (first 5 rows):")
print(templates_df.head())

# Load insurance data (adjust paths as needed)
df_level1 = pd.read_csv(r"C:\Users\lathe\Desktop\rag model training\knowledge_plan_grouping.csv")
df_level2 = pd.read_csv(r"C:\Users\lathe\Desktop\rag model training\knowledge_premium_grouping.csv")
df_level3 = pd.read_csv(r"C:\Users\lathe\Desktop\rag model training\knowlede_plan_disease_combinations.csv")
df_level4 = pd.read_csv(r"C:\Users\lathe\Desktop\rag model training\insurance_plans_by_disease.csv")  # used for Level 4 and 5

print("\nDataset shapes:")
print("Level 1:", df_level1.shape)
print("Level 2:", df_level2.shape)
print("Level 3:", df_level3.shape)
print("Level 4:", df_level4.shape)


Final Templates (first 5 rows):
  hierarchy_level                                         user_query  \
0          Level1  For the plan '{Plan Name}' and explain its pre...   
1          Level1  For the plan '{Plan Name}' and what premium op...   
2          Level1  For the plan '{Plan Name}' and what is its pre...   
3          Level1  Regarding plan '{Plan Name}' and what premium ...   
4          Level1  I need details for plan '{Plan Name}' and what...   

  target_columns                               output_template  
0   Premium Type  It is offered in the {Premium Type} category  
1   Premium Type  The available premium type is {Premium Type}  
2   Premium Type  It is offered in the {Premium Type} category  
3   Premium Type        Its premium category is {Premium Type}  
4   Premium Type            The premium type is {Premium Type}  

Dataset shapes:
Level 1: (7, 7)
Level 2: (3, 9)
Level 3: (21, 8)
Level 4: (900, 13)


In [8]:
# Cell 2: Define target_order, get_all_subsets, join_fragments, and placeholder standardization.

# These are our full target definitions (as in our training examples)
target_order = {
    1: ["Premium Type", "DisCount", "Diseases", "CoverageLevel", "PlanFocus", "Advantage"],
    2: ["Plan Name", "Deductible", "Co-pay Percentage", "Plan Term", "Tax Redemption", "Plan Count", "Monthly Payment", "Advantage"],
    3: ["Monthly Payment", "Deductible", "Co-pay Percentage", "Plan Term", "Tax Redemption", "Benefits"],
    4: ["Maximum Coverage", "Deductible", "Co-pay Percentage", "Waiting Period", "Claims Settled", "Renewability", "Hospital Coverage", "Benefits", "Tax Redemption"],
    5: ["Calculation"]
}

def get_all_subsets(lst):
    """Return all non-empty subsets of lst."""
    subsets = []
    n = len(lst)
    for i in range(1, 2**n):
        subset = [lst[j] for j in range(n) if (i >> j) & 1]
        subsets.append(subset)
    return subsets

def join_fragments(fragments):
    """
    Join fragments with commas and "and" before the final fragment.
    Also, if the first non-intro fragment starts with an unwanted connector ("and", "with"),
    remove it.
    """
    clean = [frag.strip().rstrip(".,") for frag in fragments if frag.strip()]
    if clean:
        for connector, replacement in [("and ", "It "), ("with ", "The ")]:
            if clean[0].lower().startswith(connector):
                clean[0] = replacement + clean[0][len(connector):].strip()
    if not clean:
        return ""
    if len(clean) == 1:
        return clean[0]
    return ", ".join(clean[:-1]) + " and " + clean[-1]

def standardize_placeholders(text):
    # Replace placeholders with standardized tokens.
    replacements = {
        "{Plan Name}": "<PLAN_NAME>",
        "{Premium Type}": "<PREMIUM_TYPE>",
        "{Disease}": "<DISEASE>",
        "{DisCount}": "<DISEASE_COUNT>",
        "{CoverageLevel}": "<COVERAGE_LEVEL>",
        "{PlanFocus}": "<PLAN_FOCUS>",
        "{Advantage}": "<ADVANTAGE>",
        "{Deductible}": "<DEDUCTIBLE>",
        "{Co-pay Percentage}": "<COPAY>",
        "{Plan Term}": "<PLAN_TERM>",
        "{Tax Redemption}": "<TAX_REDEMPTION>",
        "{Plan Count}": "<PLAN_COUNT>",
        "{Monthly Payment}": "<MONTHLY_PAYMENT>",
        "{Benefits}": "<BENEFITS>",
        "{Maximum Coverage}": "<MAX_COVERAGE>",
        "{Waiting Period}": "<WAITING_PERIOD>",
        "{Claims Settled}": "<CLAIMS_SETTLED>",
        "{Renewability}": "<RENEWABILITY>",
        "{Hospital Coverage}": "<HOSPITAL_COVERAGE>",
        "{Calculation}": "<CALCULATION>"
    }
    for k, v in replacements.items():
        text = text.replace(k, v)
    return text

# Standardize the templates
templates_df["user_query"] = templates_df["user_query"].apply(standardize_placeholders)
templates_df["output_template"] = templates_df["output_template"].apply(standardize_placeholders)


In [12]:
# Cell 3: Create Dataset and Fine-Tune BART

from torch.utils.data import Dataset

class TemplateDataset(Dataset):
    def __init__(self, df, tokenizer, max_input_length=128, max_output_length=128):
        self.inputs = df["user_query"].tolist()
        self.outputs = df["output_template"].tolist()
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        output_text = self.outputs[idx]
        input_enc = self.tokenizer(input_text, truncation=True, padding="max_length",
                                    max_length=self.max_input_length, return_tensors="pt")
        output_enc = self.tokenizer(output_text, truncation=True, padding="max_length",
                                     max_length=self.max_output_length, return_tensors="pt")
        return {
            "input_ids": input_enc["input_ids"].squeeze(),
            "attention_mask": input_enc["attention_mask"].squeeze(),
            "labels": output_enc["input_ids"].squeeze()
        }

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
dataset = TemplateDataset(templates_df, tokenizer)
print("Number of training examples:", len(dataset))

# Fine-tune the model (this example uses a small number of epochs; adjust as needed)
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

training_args = TrainingArguments(
    output_dir="./bart_finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    evaluation_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()
model.save_pretrained("./bart_finetuned")
tokenizer.save_pretrained("./bart_finetuned")
print("Fine-tuning complete.")


Number of training examples: 10500


Step,Training Loss
100,2.452
200,0.132
300,0.0897
400,0.0824
500,0.0769
600,0.0757
700,0.0715
800,0.0701
900,0.0678
1000,0.0684




Fine-tuning complete.


In [13]:
from sentence_transformers import SentenceTransformer, util

# Initialize the sentence transformer model for embedding.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def compute_embeddings(df, fields):
    """
    Given a dataframe and a list of fields (columns),
    compute a text representation (concatenation of the fields) and then embed it.
    """
    texts = df[fields].astype(str).apply(lambda row: " | ".join(row.values), axis=1).tolist()
    embeddings = embedder.encode(texts, convert_to_tensor=True)
    return embeddings, texts


In [14]:
def determine_level(prompt):
    prompt_lower = prompt.lower()
    if "amount" in prompt_lower:
        return 5
    elif "disease" in prompt_lower:
        # If both plan and premium are mentioned, assume level 4.
        if "plan" in prompt_lower and "premium" in prompt_lower:
            return 4
        else:
            return 4
    elif "plan" in prompt_lower and "premium" in prompt_lower:
        return 3
    elif "premium" in prompt_lower:
        return 2
    elif "plan" in prompt_lower:
        return 1
    else:
        return 1

def extract_inputs(prompt):
    inputs = {}
    plan_match = re.search(r"Plan(?:\s*Name)?:\s*['\"]([^'\"]+)['\"]", prompt, re.IGNORECASE)
    if plan_match:
        inputs["Plan Name"] = plan_match.group(1).strip()
    premium_match = re.search(r"Premium(?:\s*Type)?:\s*['\"]([^'\"]+)['\"]", prompt, re.IGNORECASE)
    if premium_match:
        inputs["Premium Type"] = premium_match.group(1).strip()
    disease_match = re.search(r"Disease(?:\s*Name)?:\s*['\"]([^'\"]+)['\"]", prompt, re.IGNORECASE)
    if disease_match:
        inputs["Disease"] = disease_match.group(1).strip()
    amount_match = re.search(r"Amount:\s*([\d]+)", prompt, re.IGNORECASE)
    if amount_match:
        inputs["Amount"] = float(amount_match.group(1))
    return inputs

def generate_model_response(prompt):
    # Use the fine-tuned BART model to generate an output template.
    inputs_enc = tokenizer(prompt, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    outputs = model.generate(inputs_enc["input_ids"], attention_mask=inputs_enc["attention_mask"],
                             max_length=128, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def retrieve_best_match(level, inputs, top_k=1):
    """
    Given extracted inputs and the level, select the corresponding dataset and use
    cosine similarity over embeddings to find the best matching record.
    For simplicity, here we show an example for Level 1; similar logic applies for other levels.
    """
    if level == 1:
        df = df_level1.copy()
        # Filter based on Plan Name if provided.
        if "Plan Name" in inputs:
            df = df[df["Plan Name"].str.contains(inputs["Plan Name"], case=False, na=False)]
        if df.empty:
            return None, None
        # Compute embeddings for the filtered rows:
        embeddings, texts = compute_embeddings(df, df.columns.tolist())
        # Create an embedding for the query (e.g., using the input fields)
        query_text = " | ".join([str(inputs[k]) for k in ["Plan Name"] if k in inputs])
        query_embedding = embedder.encode(query_text, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, embeddings)[0]
        top_result_idx = int(torch.argmax(cos_scores))
        return df.iloc[top_result_idx].to_dict(), texts[top_result_idx]
    # Similarly implement for other levels...
    # For brevity, we return the head row for levels 2-4.
    elif level in [2,3,4]:
        if level == 2:
            df = df_level2.copy()
        elif level == 3:
            df = df_level3.copy()
        else:
            df = df_level4.copy()
        # Simple filtering by matching input fields:
        for key, value in inputs.items():
            if key in df.columns:
                df = df[df[key].str.contains(value, case=False, na=False)]
        if df.empty:
            return None, None
        return df.iloc[0].to_dict(), "First matched record"
    elif level == 5:
        df = df_level4.copy()
        for key, value in inputs.items():
            if key in df.columns:
                df = df[df[key].str.contains(value, case=False, na=False)]
        if df.empty:
            return None, None
        return df.iloc[0].to_dict(), "First matched record"
    
def compute_out_of_pocket(record, amount, premium_type):
    premium_type = premium_type.lower()
    if "basic" in premium_type:
        deductible_pct = 5
        copay_pct = 20
    elif "lite" in premium_type:
        deductible_pct = 10
        copay_pct = 10
    elif "premier" in premium_type:
        deductible_pct = 15
        copay_pct = 5
    else:
        deductible_pct = 5
        copay_pct = 20
    try:
        max_coverage = float(record.get("Maximum Coverage", 0))
    except:
        max_coverage = 0
    if max_coverage > 0 and amount > max_coverage:
        return f"Claim amount {amount} exceeds maximum coverage of {max_coverage}."
    deductible_value = amount * deductible_pct / 100
    updated_amount = amount - deductible_value
    copay_value = updated_amount * copay_pct / 100
    final_out_of_pocket = deductible_value + copay_value
    return final_out_of_pocket

In [15]:
def data_retriever_model(user_prompt):
    # 1. Determine level and extract inputs.
    level = determine_level(user_prompt)
    inputs = extract_inputs(user_prompt)
    
    # 2. Generate an output template using the fine-tuned BART model.
    generated_template = generate_model_response(user_prompt)
    
    # 3. Retrieve the best matching record using our embedding/retrieval method.
    record, record_text = retrieve_best_match(level, inputs)
    if record is None:
        return "No matching record found."
    
    # 4. For Level 5, if amount is provided, compute the out-of-pocket expense.
    if level == 5:
        if "Amount" in inputs:
            result = compute_out_of_pocket(record, inputs["Amount"], inputs.get("Premium Type", "basic"))
            answer = (f"For the plan '{record.get('Plan Name', 'Unknown')}' with premium '{record.get('Premium Type', 'Unknown')}', "
                      f"and disease '{record.get('Disease', 'Unknown')}', the calculated out-of-pocket expense is {result}.")
        else:
            try:
                default_amount = float(record.get("Maximum Coverage", 0))
            except:
                default_amount = 0
            result = compute_out_of_pocket(record, default_amount, inputs.get("Premium Type", "basic"))
            answer = (f"For the plan '{record.get('Plan Name', 'Unknown')}' with premium '{record.get('Premium Type', 'Unknown')}', "
                      f"and disease '{record.get('Disease', 'Unknown')}', assuming maximum coverage as the claim amount, "
                      f"the calculated out-of-pocket expense is {result}.")
    else:
        # For levels 1-4, simply format the retrieved record.
        answer = "Retrieved data: " + "; ".join([f"{k}: {v}" for k, v in record.items()])
    
    final_response = f"Generated Template: {generated_template}\n\nAnswer: {answer}"
    return final_response

# Test the full pipeline with example prompts:
test_prompts = [
    "Plan: 'Individual Health Insurance'",
    "Premium: 'Lite'",
    "Plan: 'Individual Health Insurance', Premium: 'Basic'",
    "Plan: 'Individual Health Insurance', Premium: 'Basic', Disease: 'Heart Attack'",
    "Plan: 'Individual Health Insurance', Premium: 'Basic', Disease: 'Heart Attack', Amount: 300000"
]

for prompt in test_prompts:
    print("\nUser Prompt:", prompt)
    print(data_retriever_model(prompt))



User Prompt: Plan: 'Individual Health Insurance'
Generated Template: the plan has a disease count of <DISEASE_COUNT>

Answer: Retrieved data: Plan Name: Individual Health Insurance; Premium Type: Basic, Lite, Premier; DisCount: 34; Diseases: Acute Myocardial Infarction, Alzheimer's Disease, Angina Pectoris, Aorta Surgery, Aortic Dissection, Atrial Flutter, Brain Surgery, Cancer, Cardiomyopathy, Chronic Liver Disease, Chronic Lung Disease, Congestive Heart Failure, Coronary Artery Disease, Endocarditis, Heart Attack, Kidney Failure, Left Ventricular Hypertrophy, Major Organ Transplant, Motor Neuron Disease, Multiple Sclerosis, Myocarditis, Parkinson's Disease, Pericarditis, Permanent Blindness, Permanent Deafness, Permanent Loss of Speech, Poliomyelitis, Primary Pulmonary Arterial Hypertension, Pulmonary Embolism, Sepsis, Severe Coma, Stroke, Ventricular Fibrillation, Ventricular Tachycardia; CoverageLevel: High; PlanFocus: Individual; Advantage: Plan 'Individual Health Insurance' (Ind