# Langchain <--> Elastic Search

Elasticsearch is an open source distributed, RESTful search and analytics engine, scalable data store, and vector database capable of addressing a growing number of use cases. As the heart of the Elastic Stack, it centrally stores your data for lightning-fast search, fine‑tuned relevancy, and powerful analytics that scale with ease.
Elasticsearch can store and index a variety of data, including structured and unstructured text, numerical data, and geospatial data. It's known for its ability to find queries in large-scale unstructured data
Elasticsearch uses a search index, which is similar to an index in the back of a book, to map content to its location in a document. This allows users to quickly find information without scanning through an entire document

- https://www.elastic.co/search-labs/blog/langchain-collaboration
- https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html
- https://python.langchain.com/docs/integrations/vectorstores/elasticsearch/
- https://www.elastic.co/blog/elasticsearch-is-open-source-again
- https://www.elastic.co/search-labs/blog/category/generative-ai


In [None]:
! pip install -r requirements.txt -q

# Install ELastic Search Docker

- docker network create elastic
- docker pull docker.elastic.co/elasticsearch/elasticsearch:8.15.3
- docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB docker.elastic.co/elasticsearch/elasticsearch:8.15.3

# Docker compose
To INstall all components in Containers, Ollama, ELastic and Setup ELastic Component

- docker compose -f docker-compose.yml up

In [1]:
import os
from dotenv import dotenv_values

In [2]:
config = dotenv_values("./keys/.env")

In [3]:
import os, tempfile
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.schema import HumanMessage, SystemMessage
from google.oauth2 import service_account
from dotenv import dotenv_values
import json
import vertexai

import itertools
import time


# SETUP ENVIRONMENT

In [4]:
config = dotenv_values("./keys/.env")
with open("./keys/complete-tube-421007-208a4862c992.json") as source:
    info = json.load(source)

vertex_credentials = service_account.Credentials.from_service_account_info(info)
vertexai.init(
    project=config["PROJECT"],
    location=config["REGION"],
    credentials=vertex_credentials,
)
google_api_key = config["GEMINI-API-KEY"]
os.environ["GEMINI_API_KEY"] = google_api_key

In [5]:
ROOT= os.getcwd()
ROOT

'D:\\repos3\\elastic'

In [6]:
llm = ChatGoogleGenerativeAI(
                    model="gemini-1.5-pro-001", credentials=vertex_credentials
                )

In [43]:

class GoogleAIClient:
    def __init__(self, model, credentials):
        self.credentials = credentials
        self.model = model
        self.client = ChatGoogleGenerativeAI(
            model=self.model, credentials=self.credentials)

    def generate_streaming_response(self, prompt):
        """
        Generates a streaming completion using LangChain ChatGoogleGenerativeAI.

        Args:
            prompt (str): The prompt for the model.
            model (str): The model to use (e.g., "gemini-pro").

        Yields:
            str: Chunks of the generated completion.
        """
        try:
            chat = ChatGoogleGenerativeAI(model=self.model)
            messages = [HumanMessage(content=prompt)]
            for chunk in chat.stream(messages):
                yield chunk.content
        except Exception as e:
            print(f"An error occurred: {e}")
            yield f"Error: {e}" # yield error message to the stream.

    def generate_non_streaming_response(self, prompt, system_prompt=""):
        try:
            chat = self.client
            messages = []
            if system_prompt:
                messages.append(SystemMessage(content=system_prompt))
            messages.append(HumanMessage(content=prompt))
            response = chat.invoke(messages)
            return response.content
        except Exception as e:
            print(f"An error occurred: {e}")
            return None
        
LLM = GoogleAIClient(model="gemini-1.5-pro-001", credentials=vertex_credentials)

# GENERATE FAKE DATA

In [9]:
import json
import random
from datetime import datetime, timedelta
import string
from pprint import pprint
import traceback
import uuid
import os
import pickle


def generate_random_email():
    # Generate a random username length between 5 and 12 characters
    username_length = random.randint(5, 12)
    
    # Generate a random username using lowercase letters and digits
    username = ''.join(random.choices(string.ascii_lowercase + string.digits, k=username_length))
    
    # List of common email domains
    domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'icloud.com']
    
    # Choose a random domain
    domain = random.choice(domains)
    
    # Construct and return the email
    return f"{username}@{domain}"


def generate_chinese_name():
    chinese_surnames = [
        "Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Goh", "Chua", "Koh", "Teo", "Chan", "Yeo", "Ang", "Chong", "Leong", "Foo", "Sim", "Tay", "Ho", "Low",
        "Chen", "Lin", "Huang", "Zhang", "Li", "Wang", "Liu", "Wu", "Yang", "Zhou", "Xu", "Sun", "Ma", "Zhu", "Hu", "Guo", "He", "Gao", "Lin", "Luo",
        "Zheng", "Liang", "Xie", "Song", "Tang", "Xu", "Han", "Feng", "Deng", "Xiao", "Cheng", "Cao", "Peng", "Zeng", "Xue", "Lu", "Su", "Pan", "Jiang", "Bai",
        "Du", "Yin", "Mei", "Fang", "Fu", "Yuan", "Cai", "Jia", "Gu", "Xiong", "Hao", "Shao", "Meng", "Long", "Wei", "Wan", "Duan", "Qiu", "Jiang", "Qin",
        "Chu", "Yu", "Shen", "Qi", "Cui", "Ren", "Tian", "Xia", "Shi", "Hou", "Yan", "Jin", "Kong", "Wei", "Xiang", "Yao", "Yan", "Sheng", "Zu", "Qian"
    ]

    chinese_given_names = [
        "Wei", "Hui", "Xin", "Yi", "Ying", "Jie", "Ling", "Zhi", "Qiang", "Mei", "Jun", "Xiang", "Hao", "Chen", "Ming", "Feng", "Yang", "Cheng", "Yong", "Tian",
        "Jing", "Yan", "Fei", "Yu", "Xiuying", "Guiying", "Chunmei", "Xiaohong", "Xiulan", "Guilan", "Huifang", "Xiuzhen", "Yumei", "Xiumei", "Guirong", "Shulan", "Guizhi", "Xiuyun", "Huiying", "Jinlan",
        "Qing", "Xuan", "Zhen", "Rui", "Kai", "Sheng", "Hong", "Xiong", "Lei", "Hua", "Bin", "Heng", "Xiaowei", "Xiaojun", "Xiaofeng", "Xiaogang", "Xiaoming", "Xiaohua", "Xiaohui", "Xiaolin",
        "An", "Bao", "Bo", "Chang", "Chao", "Da", "Dong", "En", "Gang", "Guo", "Hai", "Han", "Jian", "Jiao", "Jin", "Kang", "Lang", "Li", "Liang", "Miao",
        "Nan", "Peng", "Ping", "Qi", "Qian", "Rong", "Ru", "Shan", "Shu", "Tai", "Tao", "Wen", "Wu", "Xia", "Xian", "Xiao", "Xue", "Yao", "Yi", "Yin",
        "Yu", "Yuan", "Yun", "Zhan", "Zhe", "Zhong", "Zi", "Ai", "Bi", "Cai", "Can", "Ce", "Cui", "Di", "E", "Fu", "Gai", "Gan", "Huan", "Jia",
        "Jiu", "Ju", "Kui", "Lan", "Lian", "Meng", "Nian", "Ning", "Nu", "Pin", "Qiu", "Quan", "Sha", "Shi", "Si", "Song", "Su", "Ti", "Tong", "Wai",
        "Xi", "Xiu", "Xu", "Ya", "Yan", "Ye", "Ying", "You", "Zai", "Ze", "Zeng", "Zhi", "Zhuo", "Zi", "Zong", "Zou"
    ]
    
    surname = random.choice(chinese_surnames)
    given_name = random.choice(chinese_given_names)
    
    # Sometimes add a second character to the given name
    if random.random() < 0.5:  # 50% chance for a two-character given name
        given_name += ' '+random.choice(chinese_given_names)
    
    return f"{surname} {given_name}"


def generate_malay_name():
    malay_given_names = [
        "Abdullah", "Abdul Rahman", "Abdul Rahim", "Abdul Aziz", "Abdul Kadir", "Abdul Latif", "Abdul Malik", "Abdul Razak", "Abu Bakar", "Adam", "Adil", "Adnan", "Ahmad", "Aiman", "Aizat", "Akmal", "Ali", "Amin", "Amir", "Ammar",
        "Anuar", "Arif", "Ashraf", "Asraf", "Azhar", "Aziz", "Azlan", "Azman", "Azmi", "Badrul", "Baharudin", "Bakri", "Borhan", "Burhanuddin", "Che", "Danial", "Daud", "Dzulkifli", "Edzham", "Fadil", "Fahmi", "Faisal", "Faizal", "Farid", "Faris",
        "Fauzi", "Fuad", "Ghazali", "Hadi", "Hafiz", "Hakim", "Halim", "Hamid", "Hamzah", "Hanafi", "Haris", "Harith", "Haron", "Hasan", "Hashim", "Hassan", "Haziq", "Helmi", "Hisham", "Husain", "Hussein", "Ibrahim", "Idris", "Ihsan",
        "Imran", "Irfan", "Isa", "Ismail", "Izwan", "Jafar", "Jamal", "Jamil", "Johari", "Kamal", "Kamarul", "Kamaruzaman", "Khairil", "Khairuddin", "Khalid", "Lokman", "Lutfi", "Mahathir", "Mahmud", "Majid", "Malik", "Mansor", "Mas", "Mat", "Megat",
        "Mizan", "Mohamad", "Mohamed", "Mohammad", "Mohammed", "Mohd", "Muhamad", "Muhammad", "Muhsin", "Mukhriz", "Munir", "Mustafa", "Muthu", "Nasir", "Nasrudin", "Nazri", "Nik", "Nizam", "Noor", "Nor", "Nordin", "Omar", "Osman", "Othman",
        "Radzi", "Rafiq", "Rahimi", "Rahim", "Rahman", "Rashid", "Razak", "Razali", "Redza", "Redzuan", "Riduan", "Rizal", "Roslan", "Ruslan", "Saad", "Sabri", "Saffuan", "Saiful", "Saleh", "Salleh", "Samad", "Shafiq", "Shah", "Shahrul", "Shamsudin",
        "Shamsul", "Sharif", "Sulaiman", "Syed", "Syukri", "Tarmizi", "Taufiq", "Tengku", "Umar", "Wan", "Yusof", "Yusoff", "Yusri", "Zafran", "Zainal", "Zakaria", "Zaki", "Zamri", "Zikri", "Zulkifli",
        "Adibah", "Adila", "Adina", "Afiqah", "Aida", "Aishah", "Aisyah", "Alya", "Amalina", "Amelia", "Amira", "Amirah", "Aminah", "Anisah", "Aqilah", "Arissa", "Asma", "Asmah", "Atiqah", "Azizah",
        "Azlina", "Azwa", "Balqis", "Dalila", "Dayang", "Eliana", "Emilia", "Farah", "Farhana", "Farhanah", "Fariha", "Faridah", "Farihah", "Fasihah", "Fatimah", "Fatin", "Fazlin", "Hafizah", "Halimatun", "Hamidah",
        "Hanisah", "Hasmah", "Hasnah", "Haziqah", "Hazwani", "Hidayah", "Humaira", "Izzati", "Jamilah", "Khadijah", "Khairunnisa", "Laila", "Latifah", "Lina", "Madiha", "Maisarah", "Mariam", "Maryam", "Mas", "Mastura",
        "Mawar", "Nabila", "Nabilah", "Nadiah", "Nadirah", "Nafeesa", "Najwa", "Nasyitah", "Natasha", "Nazifah", "Nazirah", "Nik", "Noor", "Noorul", "Nor", "Nora", "Noraini", "Norashikin", "Norazlina", "Norhayati",
        "Noriah", "Norizan", "Norlia", "Normah", "Norziana", "Nur", "Nurain", "Nuraina", "Nuraliya", "Nuramira", "Nurassyifa", "Nurdiana", "Nurfadilah", "Nurfaizah", "Nurfarahana", "Nurhafizah", "Nurhaliza", "Nurhayati", "Nurhidayah", "Nurin",
        "Nurliyana", "Nurmala", "Nurshahira", "Nursyafiqah", "Nursyahirah", "Nurul", "Puteri", "Qistina", "Rabiatul", "Rabiatuladawiyah", "Radin", "Rahmah", "Raja", "Rashidah", "Rosmah", "Rossita", "Rozita", "Safiah", "Safinah", "Safiyyah",
        "Saleha", "Salina", "Salma", "Saodah", "Sarah", "Shafiqah", "Sharifah", "Siti", "Sofia", "Sofiah", "Sofiyah", "Sumaiyah", "Suraya", "Syafiqah", "Syahirah", "Syairah", "Syakila", "Syamimi", "Syaza", "Syazwani",
        "Tengku", "Ummi", "Umi", "Wan", "Yasmin", "Yusrina", "Zainab", "Zainal", "Zainun", "Zakiah", "Zaleha", "Zalina", "Zanariah", "Zarina", "Zulaika", "Zulaikha", "Zulaikhah", "Zulfah"
    ]

    malay_surnames = ["bin", "binti"]
    
    first_name = random.choice(malay_given_names)
    middle_name = random.choice(malay_given_names)
    surname = random.choice(malay_surnames)
    
    # Sometimes add a title or honorific
    titles = ["Haji", "Hajjah", "Tan Sri", "Puan Sri", "Datuk", "Datin", "Tun", "Toh Puan"]
    if random.random() < 0.1:  # 10% chance to add a title
        title = random.choice(titles)
        return f"{title} {first_name} {surname} {middle_name}"
    
    return f"{first_name} {surname} {middle_name}"

def generate_indian_name():
    indian_given_names = [
        "Aadhav", "Aadit", "Aaditya", "Aakash", "Aalam", "Aalok", "Aamir", "Aanjaneya", "Aarav", "Aarnav", "Aarush", "Aayush", "Abha", "Abhai", "Abhay", "Abhijat", "Abhijeet", "Abhimanyu", "Abhinav", "Abhishek",
        "Abishek", "Aditi", "Aditya", "Advik", "Agastya", "Agni", "Aishwarya", "Ajay", "Ajeet", "Akash", "Akhil", "Akshay", "Akshita", "Alok", "Amal", "Aman", "Amar", "Amarnath", "Amey", "Amish",
        "Amit", "Amita", "Amitabh", "Amolak", "Amrita", "Anand", "Anandi", "Anamika", "Ananth", "Ananya", "Anarya", "Anay", "Anaya", "Aniket", "Anil", "Aniruddha", "Anish", "Anit", "Anita", "Anjali",
        "Anjana", "Anjan", "Anjney", "Ankit", "Ankita", "Ankur", "Anmol", "Ansh", "Anshika", "Anshul", "Anuj", "Anupam", "Anushka", "Anurag", "Aparna", "Apoorva", "Arav", "Arjun", "Arka", "Arnav",
        "Arohi", "Arpit", "Artha", "Arun", "Aruna", "Arundhati", "Arushi", "Arya", "Asha", "Ashok", "Ashwin", "Asim", "Astha", "Atharv", "Ati", "Atiksh", "Atishay", "Atul", "Aum", "Avani",
        "Avantika", "Avichal", "Avinash", "Ayaan", "Ayush", "Ayushi", "Bala", "Balaji", "Bharat", "Bharath", "Bhargav", "Bhargavi", "Bhaskar", "Bhavana", "Bhavesh", "Bhavya", "Bhoomi", "Bijay", "Bina", "Bindu",
        "Chandan", "Chandra", "Chandran", "Charu", "Chetan", "Chetana", "Chirag", "Chitrangada", "Darshan", "Daya", "Deepa", "Deepak", "Deepika", "Dev", "Deva", "Devdan", "Devendra", "Devi", "Devika", "Dhairya",
        "Dhananjay", "Dharma", "Dharmendra", "Dhruv", "Dilip", "Disha", "Divya", "Diya", "Durga", "Esha", "Ekta", "Gauri", "Gautam", "Gayathri", "Geeta", "Girish", "Gita", "Gitanjali", "Gopal", "Gopinath",
        "Govind", "Gowri", "Gulshan", "Gunjan", "Guru", "Harsh", "Harsha", "Harshad", "Harshita", "Hema", "Hemant", "Himani", "Hira", "Hiren", "Indira", "Indra", "Indu", "Ira", "Ishan", "Isha",
        "Ishaan", "Ishani", "Ishita", "Jai", "Jatin", "Jaya", "Jayant", "Jayanti", "Jayin", "Jhanvi", "Jitendra", "Jiya", "Jyoti", "Kabir", "Kalindi", "Kalpana", "Kalyani", "Kanak", "Karan", "Karthik",
        "Kartik", "Karuna", "Kaustubh", "Kavita", "Kavya", "Keerthi", "Keshav", "Ketan", "Khushi", "Kiara", "Kiran", "Kirti", "Krishna", "Krish", "Kriti", "Kritika", "Kshitij", "Kunal", "Kushal", "Lakshmi",
        "Lalit", "Lalita", "Lavanya", "Laxmi", "Leela", "Madhav", "Madhavi", "Madhur", "Mahendra", "Mahesh", "Mahima", "Mahi", "Mallika", "Manasi", "Manish", "Manju", "Manjula", "Manoj", "Manohar", "Maya",
        "Mayank", "Meena", "Meera", "Megha", "Mehul", "Mira", "Mitali", "Mohit", "Mridula", "Mukesh", "Mukta", "Muskaan", "Nachiket", "Naman", "Namita", "Nandini", "Narayan", "Naren", "Naveen", "Navin",
        "Neela", "Neelam", "Neeti", "Neha", "Nidhi", "Nikhil", "Nikita", "Nilam", "Nilesh", "Nilima", "Nimesh", "Nirmal", "Nirmala", "Nirupama", "Nisha", "Nishant", "Nishtha", "Nitesh", "Niti", "Nitya",
        "Om", "Ojas", "Omkar", "Pankaj", "Parag", "Paras", "Parth", "Parvati", "Pooja", "Prabhat", "Prachi", "Pradip", "Pragya", "Prakash", "Pramod", "Pranav", "Praney", "Pranita", "Prasad", "Pratap",
        "Pratibha", "Pratik", "Praveen", "Prem", "Prerna", "Preeti", "Priya", "Priyanka", "Puja", "Puneet", "Purvi", "Pushpa", "Rachana", "Radha", "Radhika", "Raghu", "Rahul", "Raj", "Raja", "Rajat",
        "Rajeev", "Rajendra", "Rajesh", "Raju", "Rakesh", "Ram", "Rama", "Ramesh", "Rani", "Ranjana", "Ranjit", "Rashmi", "Ravi", "Ravindra", "Rekha", "Renuka", "Reva", "Richa", "Riddhi", "Riddhima",
        "Rishabh", "Rishi", "Rita", "Ritesh", "Ritika", "Rohan", "Rohit", "Roopa", "Ruchi", "Rudra", "Rupal", "Rupali", "Rushil", "Sachin", "Sahil", "Sakshi", "Sameer", "Samir", "Sandeep", "Sandya",
        "Sanjay", "Sanjiv", "Sankar", "Santosh", "Saras", "Sarika", "Sarthak", "Satish", "Satyam", "Saurabh", "Savar", "Seema", "Shailesh", "Shalu", "Shanta", "Shantanu", "Sharad", "Sharmila", "Shashi", "Shekhar",
        "Shilpa", "Shiva", "Shivani", "Shraddha", "Shreeya", "Shreya", "Shri", "Shriram", "Shubha", "Shubham", "Shweta", "Siddharth", "Simar", "Simran", "Smita", "Smriti", "Sneha", "Soham", "Sohini", "Sonam",
        "Sonia", "Srijan", "Srinivas", "Subhash", "Suchitra", "Sudhir", "Sujata", "Sukanya", "Suman", "Sumati", "Sumit", "Sundar", "Sundari", "Sunil", "Sunita", "Supriya", "Suraj", "Suresh", "Surya", "Sushil",
        "Sushma", "Swapna", "Swapnil", "Swati", "Tanisha", "Tanmay", "Tanuj", "Tanvi", "Tanya", "Tarun", "Tej", "Tejas", "Tejashri", "Tina", "Trisha", "Triveni", "Tuhina", "Tushar", "Udai", "Uday",
        "Ujjwal", "Uma", "Umang", "Upasana", "Urvi", "Usha", "Uttam", "Vaibhav", "Vaishnavi", "Varun", "Varsha", "Vasant", "Vasudha", "Vedant", "Vidhi", "Vidya", "Vijay", "Vimal", "Vinay", "Vineet",
        "Vinod", "Vipul", "Viraj", "Vishal", "Vishnu", "Vivek", "Yash", "Yashoda", "Yogesh", "Yuvraj"
    ]

    indian_surnames = [
        "Acharya", "Agarwal", "Aggarwal", "Ahluwalia", "Ahuja", "Arora", "Anand", "Awasthi", "Babu", "Badal", "Bajaj", "Bajwa", "Bakshi", "Balakrishnan", "Balan", "Balasubramanian", "Banerjee", "Banik", "Bansal", "Basu",
        "Batra", "Bhagat", "Bhalla", "Bhandari", "Bhardwaj", "Bhargava", "Bhasin", "Bhat", "Bhatia", "Bhatt", "Bhattacharya", "Bhavsar", "Bedi", "Bhojwani", "Bose", "Buch", "Chauhan", "Chadha", "Chakrabarti", "Chakraborty",
        "Chandra", "Chatterjee", "Chaturvedi", "Chauhan", "Chawla", "Cherian", "Chokshi", "Chopra", "Choudhary", "Choudhury", "D'Souza", "Dalmia", "Das", "Dasgupta", "Datta", "Dave", "Dayal", "Desai", "Deshmukh", "Deshpande",
        "Devan", "Dewan", "Dhar", "Dhawan", "Dhillon", "Dixit", "Doshi", "Dua", "Dube", "Dubey", "Dugar", "Dutt", "Dutta", "Dwivedi", "Fernandes", "Gandhi", "Ganesh", "Ganguly", "Garg", "George", "Ghosh", "Gokhale", "Goel",
        "Goswami", "Gour", "Goyal", "Guha", "Gulati", "Gupta", "Halder", "Handa", "Hans", "Hegde", "Hora", "Iyengar", "Iyer", "Jain", "Jaiswal", "Jani", "Jayaraman", "Jha", "Jhaveri", "Johar", "Joshi", "Kakkar", "Kala",
        "Kale", "Kalra", "Kanda", "Kannan", "Kapoor", "Kapur", "Kar", "Karnik", "Kashyap", "Kaul", "Kaur", "Khatri", "Khanna", "Khandelwal", "Kher", "Khosla", "Khurana", "Kohli", "Kochhar", "Kothari", "Krishna", "Krishnamurthy",
        "Krishnan", "Kulkarni", "Kumar", "Kumari", "Kurian", "Kuruvilla", "Lal", "Lalla", "Lamba", "Lobo", "Madhavan", "Mahajan", "Mahalingam", "Maheshwari", "Majumdar", "Malhotra", "Malik", "Manikandan", "Mani", "Manna",
        "Mathew", "Mathur", "Mehra", "Mehrotra", "Mehta", "Menon", "Mirchandani", "Mishra", "Misra", "Mistry", "Mitra", "Modi", "Mohan", "Mohanty", "Mukherjee", "Mukhopadhyay", "Nagar", "Nagarajan", "Nair", "Nambiar",
        "Nambudiripad", "Nanda", "Narang", "Narayan", "Narayanan", "Nath", "Nayak", "Nayar", "Nazareth", "Nigam", "Nimbkar", "Oak", "Om", "Padmanabhan", "Pai", "Pal", "Palan", "Pande", "Pandey", "Pandit", "Pant", "Parekh",
        "Parikh", "Patel", "Pathak", "Patil", "Patnaik", "Patra", "Pillai", "Prabhakar", "Prabhu", "Pradhan", "Prakash", "Prasad", "Prashad", "Puri", "Purohit", "Radhakrishnan", "Ragavan", "Raghavan", "Rai", "Raj", "Raja",
        "Rajan", "Rajagopalan", "Raju", "Ram", "Rama", "Raman", "Ramanathan", "Ramaswamy", "Ramachandran", "Ramakrishnan", "Rangan", "Ranganathan", "Rao", "Rastogi", "Ratta", "Rattan", "Ratti", "Rau", "Raval", "Ravindran",
        "Ray", "Reddy", "Roy", "Sabharwal", "Sachdev", "Sachdeva", "Sagar", "Saha", "Sahni", "Saini", "Salvi", "Samarth", "Sampath", "Sampat", "Samuel", "Sandhu", "Sane", "Sanghi", "Sanghvi", "Sankar", "Sankaran", "Sant",
        "Saraf", "Sarin", "Sarkar", "Sarma", "Sarna", "Sastry", "Sathe", "Savant", "Sawhney", "Saxena", "Sebastian", "Sehgal", "Sen", "Sengupta", "Sequeira", "Seth", "Sethi", "Setty", "Shah", "Shankar", "Sharma", "Shenoy",
        "Sheth", "Shetty", "Shroff", "Shukla", "Sinha", "Sodhi", "Solanki", "Som", "Soman", "Somani", "Soni", "Sood", "Sridhar", "Srinivas", "Srinivasan", "Srivastava", "Subramaniam", "Subramanian", "Sundaram", "Sur", "Suri",
        "Swaminathan", "Swamy", "Tagore", "Talwar", "Tandon", "Tata", "Tella", "Thakkar", "Thakur", "Thomas", "Tiwari", "Trivedi", "Upadhyay", "Upadhyaya", "Vaidya", "Varghese", "Varkey", "Varma", "Varman", "Vasa", "Venkataraman",
        "Venkatesh", "Verma", "Vijayakumar", "Virk", "Viswanathan", "Vohra", "Vora", "Vyas", "Wable", "Wadhwa", "Wagle", "Wahi", "Walia", "Walla", "Warrior", "Wason", "Yadav", "Yogi", "Zaveri", "Zachariah"
    ]
    
    given_name = random.choice(indian_given_names)
    surname = random.choice(indian_surnames)
    
    return f"{given_name} {surname}"

def generate_name():
    name_generators = [generate_chinese_name, generate_malay_name, generate_indian_name]
    chosen_generator = random.choice(name_generators)
    return chosen_generator(), chosen_generator



def generate_nric():
    digits = ''.join(random.choices(string.digits, k=8))
    checksum = random.choice(string.ascii_uppercase)
    return f"S{digits}{checksum}"

def generate_phone_number():
    return f"+65 {random.randint(8000, 9999)} {random.randint(1000, 9999)}"

def generate_drivers_license():
    return f"S{random.randint(1000000, 9999999):07d}{random.choice(string.ascii_uppercase)}"

def generate_cpf_number():
    return f"S{random.randint(1000000, 9999999):07d}{random.choice(string.ascii_uppercase)}"

def calculate_age(dob):
    today = datetime.now()
    return today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))

def generate_passport_number():
    return f"K{random.randint(1000000, 9999999)}{random.choice(string.ascii_uppercase)}"



def generate_date(start_date, end_date):
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    return start_date + timedelta(days=random_number_of_days)

def generate_profile():
    countries = ["Singapore", "Malaysia", "China", "India", "Indonesia", "Philippines", "Vietnam", "Thailand", "Myanmar", "Cambodia", "Laos", "Brunei", "Japan", "South Korea", "North Korea", "Taiwan", "Hong Kong", "Macau", "Bangladesh", "Sri Lanka", "Nepal", "Bhutan", "Pakistan", "Afghanistan", "Iran", "Iraq", "Saudi Arabia", "UAE", "Oman", "Yemen", "Qatar", "Kuwait", "Bahrain", "Jordan", "Lebanon", "Syria", "Israel", "Palestine", "Turkey", "Cyprus", "Greece", "Italy", "Spain", "Portugal", "France", "Germany", "United Kingdom", "Ireland", "Netherlands", "Belgium", "Luxembourg", "Switzerland", "Austria", "Czech Republic", "Slovakia", "Hungary", "Poland", "Romania", "Bulgaria", "Serbia", "Croatia", "Bosnia and Herzegovina", "Montenegro", "North Macedonia", "Albania", "Kosovo", "Slovenia", "United States", "Canada", "Mexico", "Brazil", "Argentina", "Chile", "Peru", "Colombia", "Venezuela", "Ecuador", "Bolivia", "Paraguay", "Uruguay", "Guyana", "Suriname", "French Guiana", "Australia", "New Zealand", "Papua New Guinea", "Fiji", "Solomon Islands", "Vanuatu", "New Caledonia", "Egypt", "Libya", "Tunisia", "Algeria", "Morocco", "Sudan", "South Sudan", "Ethiopia", "Eritrea", "Djibouti", "Somalia", "Kenya", "Uganda", "Tanzania", "Rwanda", "Burundi", "Congo", "Democratic Republic of the Congo", "Angola", "Zambia", "Zimbabwe", "Mozambique", "Malawi", "South Africa", "Namibia", "Botswana", "Lesotho", "Eswatini"]

    occupations = ["Teacher", "Engineer", "Doctor", "Lawyer", "Accountant", "Nurse", "Salesperson", "Manager", "Chef", "Artist", "Software Developer", "Data Scientist", "Architect", "Pharmacist", "Dentist", "Veterinarian", "Police Officer", "Firefighter", "Paramedic", "Pilot", "Flight Attendant", "Electrician", "Plumber", "Carpenter", "Mechanic", "Hairdresser", "Beautician", "Photographer", "Journalist", "Writer", "Editor", "Translator", "Interpreter", "Psychologist", "Counselor", "Social Worker", "Financial Advisor", "Insurance Agent", "Real Estate Agent", "Marketing Specialist", "Public Relations Specialist", "Human Resources Manager", "Graphic Designer", "Web Designer", "UX Designer", "Product Manager", "Project Manager", "Business Analyst", "Systems Analyst", "Network Administrator", "Database Administrator", "Cybersecurity Specialist", "Librarian", "Museum Curator", "Zoologist", "Marine Biologist", "Environmental Scientist", "Geologist", "Meteorologist", "Astronomer", "Physicist", "Chemist", "Biologist", "Mathematician", "Statistician", "Economist", "Political Scientist", "Sociologist", "Anthropologist", "Archaeologist", "Historian", "Philosopher", "Theologian", "Actor", "Musician", "Dancer", "Choreographer", "Film Director", "Producer", "Screenwriter", "Fashion Designer", "Interior Designer", "Landscape Architect", "Urban Planner", "Civil Engineer", "Mechanical Engineer", "Electrical Engineer", "Chemical Engineer", "Aerospace Engineer", "Biomedical Engineer", "Environmental Engineer", "Nuclear Engineer", "Petroleum Engineer", "Agricultural Engineer", "Food Scientist", "Nutritionist", "Dietitian", "Personal Trainer", "Sports Coach", "Athlete", "Referee", "Umpire", "Tour Guide", "Travel Agent", "Hotel Manager", "Restaurant Manager", "Bartender", "Waiter/Waitress", "Housekeeping Staff", "Janitor", "Security Guard", "Locksmith", "Tailor", "Seamstress", "Jeweler", "Watchmaker", "Optician", "Optometrist", "Audiologist", "Speech Therapist", "Occupational Therapist", "Physical Therapist", "Massage Therapist", "Chiropractor", "Acupuncturist", "Naturopath", "Homeopath", "Midwife", "Doula", "Farmer", "Rancher", "Fisherman", "Forester", "Gardener", "Florist", "Botanist", "Zoologist", "Entomologist", "Paleontologist", "Geographer", "Cartographer", "Surveyor", "Air Traffic Controller", "Ship Captain", "Train Conductor", "Bus Driver", "Taxi Driver", "Truck Driver", "Courier", "Postal Worker", "Librarian", "Archivist", "Curator", "Conservator", "Restorer", "Printer", "Bookbinder", "Engraver", "Sculptor", "Painter", "Illustrator", "Animator", "Game Designer", "Voice Actor", "Stunt Performer", "Magician", "Circus Performer", "Street Performer", "Busker", "Tattoo Artist", "Piercer", "Makeup Artist", "Special Effects Artist", "Prosthetics Technician", "Orthodontist", "Endodontist", "Periodontist", "Oral Surgeon", "Radiologist", "Anesthesiologist", "Surgeon", "Cardiologist", "Neurologist", "Oncologist", "Pediatrician", "Geriatrician", "Psychiatrist", "Dermatologist", "Gynecologist", "Urologist", "Ophthalmologist", "Otolaryngologist", "Podiatrist", "Nephrologist", "Pulmonologist", "Rheumatologist", "Gastroenterologist", "Endocrinologist", "Hematologist", "Immunologist", "Pathologist", "Forensic Scientist", "Toxicologist", "Biochemist", "Microbiologist", "Virologist", "Geneticist", "Embryologist", "Ecologist", "Oceanographer", "Seismologist", "Volcanologist", "Hydrologist", "Glaciologist", "Climatologist", "Astronaut", "Cosmonaut", "Taikonaut"]

    languages = ["English", "Mandarin", "Malay", "Tamil", "Hokkien", "Cantonese", "Teochew", "Hakka", "Hainanese", "Hindi", "Bengali", "Urdu", "Punjabi", "Gujarati", "Malayalam", "Telugu", "Kannada", "Marathi", "Tagalog", "Indonesian", "Vietnamese", "Thai", "Burmese", "Khmer", "Lao", "Japanese", "Korean", "Arabic", "Persian", "Turkish", "Russian", "French", "German", "Spanish", "Portuguese", "Italian", "Greek", "Dutch", "Swedish", "Norwegian", "Danish", "Finnish", "Polish", "Czech", "Slovak", "Hungarian", "Romanian", "Bulgarian", "Serbian", "Croatian", "Bosnian", "Albanian", "Macedonian", "Slovenian", "Ukrainian", "Belarusian", "Lithuanian", "Latvian", "Estonian", "Georgian", "Armenian", "Azerbaijani", "Kazakh", "Uzbek", "Turkmen", "Kyrgyz", "Tajik", "Mongolian", "Tibetan", "Nepali", "Sinhala", "Dzongkha", "Tetum", "Fijian", "Samoan", "Tongan", "Maori", "Hawaiian", "Swahili", "Zulu", "Xhosa", "Afrikaans", "Amharic", "Somali", "Yoruba", "Igbo", "Hausa", "Wolof", "Fulani", "Oromo", "Hebrew", "Yiddish", "Latin", "Ancient Greek", "Sanskrit", "Classical Chinese", "Esperanto"]

    blood_types = ["A+", "A-", "B+", "B-", "O+", "O-", "AB+", "AB-"]

    ns_ranks = ["Private", "Lance Corporal", "Corporal", "Sergeant", "Staff Sergeant", "2nd Lieutenant", "Lieutenant", "Captain"]

    marital_statuses = ["Single", "Married", "Divorced", "Separated", "Widowed", "Civil Partnership", "Domestic Partnership", "Engaged", "Annulled"]

    religions = ["Buddhism", "Christianity", "Islam", "Hinduism", "Taoism", "No Religion"]
    
    dob = generate_date(datetime(1950, 1, 1), datetime(2024, 12, 31))

    towns = [
    "Ang Mo Kio", "Bedok", "Tampines", "Woodlands", "Jurong West", "Sengkang", "Punggol",
    "Yishun", "Hougang", "Jurong East", "Choa Chu Kang", "Bukit Batok", "Toa Payoh",
    "Serangoon", "Bukit Merah", "Pasir Ris", "Clementi", "Bishan", "Queenstown",
    "Bukit Panjang", "Kallang", "Geylang", "Marine Parade", "Novena", "Tanjong Pagar",
    "Bukit Timah", "Sembawang", "Central Area", "Rochor", "Orchard", "Newton",
    "Outram", "River Valley", "Downtown Core", "Marina South", "Straits View"
    ]

    streets = [
        "Orchard Road", "Serangoon Road", "Shenton Way", "Raffles Place", "Boon Lay Way",
        "Jurong Gateway Road", "Ang Mo Kio Avenue 3", "Tampines Avenue 5", "Woodlands Avenue 1",
        "Bedok North Street 1", "Punggol Central", "Sengkang East Way", "Yishun Ring Road",
        "Hougang Avenue 7", "Choa Chu Kang Avenue 4", "Bukit Batok East Avenue 3",
        "Toa Payoh Lorong 1", "Bishan Street 22", "Clementi Avenue 3", "Marine Parade Road",
        "Pasir Ris Drive 1", "Upper Serangoon Road", "Upper Thomson Road", "Bukit Timah Road",
        "Jalan Besar", "Victoria Street", "North Bridge Road", "South Bridge Road",
        "New Upper Changi Road", "Eu Tong Sen Street", "Telok Blangah Road", "Alexandra Road",
        "Joo Chiat Road", "Geylang Road", "Kallang Way", "Lavender Street", "Beach Road",
        "Balestier Road", "Bartley Road", "Braddell Road", "Bukit Panjang Ring Road",
        "Commonwealth Avenue", "Holland Road", "East Coast Road", "Sembawang Road",
        "Mandai Road", "Changi Road", "Upper East Coast Road", "Loyang Avenue",
        "Admiralty Drive", "Yio Chu Kang Road", "Lorong Chuan", "Kovan Road", "Simei Street 1"
    ]
    work_pass_types = ["Employment Pass", "S Pass", "Work Permit", "Dependent's Pass", "Long Term Visit Pass"]
    language_proficiencies = ["Basic", "Conversational", "Fluent", "Native"]

    races = ["Chinese", "Malay", "Indian", "Others"]
    religions = ["Buddhism", "Christianity", "Islam", "Hinduism", "Taoism", "No Religion"]
    towns = ["Ang Mo Kio", "Bedok", "Tampines", "Woodlands", "Jurong West", "Sengkang", "Punggol"]
    email_providers = ["gmail.com", "hotmail.com", "yahoo.com", "outlook.com"]
    sg_qualifications = ["PSLE", "N-Levels", "O-Levels", "A-Levels", "Diploma", "Bachelor's", "Master's", "PhD"]
    language_proficiencies = ["Basic", "Conversational", "Fluent", "Native"]
    ns_statuses = ["Pre-enlistee", "NSF", "NSman", "Exempted"]
    ns_ranks = ["Private", "Lance Corporal", "Corporal", "Sergeant", "Staff Sergeant", "2nd Lieutenant", "Lieutenant", "Captain"]
    work_pass_types = ["Employment Pass", "S Pass", "Work Permit", "Dependent's Pass", "Long Term Visit Pass"]

    citizenship=random.choice(["Singapore Citizen", "Singapore PR", "Foreigner"])
    name, name_generator=generate_name()
    profile = {
        "nric": generate_nric(),
        "name": name_generator(),  # Assume this function generates culturally appropriate names
        "race": random.choice(races),
        "gender": random.choice(["Male", "Female"]),
        "date_of_birth": (datetime.now() - timedelta(days=random.randint(6570, 36500))).strftime("%Y-%m-%d"),
        "age": 0,  # Will be calculated later
        "country_of_birth": random.choice(countries),
        "citizenship": citizenship,
        "religion": random.choice(religions),
        "marital_status": random.choice(marital_statuses),
        "address": {
            "block": f"{random.randint(1, 999)}",
            "street No.": f"{random.randint(1, 999)}",
            "street": f"{random.choice(streets)}",
            "unit": f"#{random.randint(1, 30)}-{random.randint(1, 999):03d}",
            "town": random.choice(towns),
            "postal_code": f"{random.randint(100000, 999999)}"
        },
        "phone_number": generate_phone_number(),
        "email": generate_random_email(),
        "occupation": random.choice(occupations),
        "cpf_number": generate_cpf_number(),
        "education": {
            "highest_qualification": random.choice(sg_qualifications),
            "institution": random.choice(["NUS", "NTU", "SMU", "SUTD", "Local Polytechnic", "Local JC", "Others"])
        },
        "languages": {
            "spoken": {lang: random.choice(language_proficiencies) for lang in random.sample(languages, random.randint(1, 3))},
            "written": {lang: random.choice(language_proficiencies) for lang in random.sample(languages, random.randint(1, 3))}
        },
        "height_cm": random.randint(150, 190),
        "weight_kg": random.randint(45, 100),
        "blood_type": random.choice(blood_types),
        "passport_number": generate_passport_number(),
        "drivers_license_number": generate_drivers_license(),
        "national_service": {
            "status": None,
            "rank": None
        },
        "immigration_status": None,  # Will be filled for non-citizens
        "emergency_contact": {
            "name": name_generator(),
            "relationship": random.choice(["Parent", "Sibling", "Spouse", "Friend"]),
            "phone_number": generate_phone_number()
        },
        "deceased": random.choice([True, False])
    }

    # Calculate age
    profile["age"] = calculate_age(datetime.strptime(profile["date_of_birth"], "%Y-%m-%d"))

    # Set NS status for males
    if profile["gender"] == "Male" and profile["citizenship"] in ["Singapore Citizen", "Singapore PR"]:
        if profile["age"] < 18:
            profile["national_service"]["status"] = "Pre-enlistee"
        elif 18 <= profile["age"] <= 20:
            profile["national_service"]["status"] = "NSF"
            profile["national_service"]["rank"] = random.choice(ns_ranks[:5])  # Lower ranks for NSF
        elif profile["age"] > 20:
            profile["national_service"]["status"] = "NSman"
            profile["national_service"]["rank"] = random.choice(ns_ranks)

    # Set immigration status for non-citizens
    if profile["citizenship"] == "Foreigner":
        profile["immigration_status"] = random.choice(work_pass_types)

    # Set date of death if deceased
    if profile["deceased"]:
        # max_days = (datetime.now() - datetime.strptime(profile["date_of_birth"], "%Y-%m-%d")).days
        # profile["date_of_death"] = (datetime.now() - timedelta(days=random.randint(0, max_days))).strftime("%Y-%m-%d")
        dob = datetime.strptime(profile["date_of_birth"], "%Y-%m-%d")
        age_in_days = profile["age"] * 365  # This is an approximation, not accounting for leap years
        date_of_death = dob + timedelta(days=age_in_days)
        profile["date_of_death"] = date_of_death.strftime("%Y-%m-%d")
    return profile

def generate_profiles(n):
    return [generate_profile() for _ in range(n)]

# UPLOAD DATA TO ELASTIC

In [10]:
from collections import defaultdict

def bulk_upload_pickle_to_elasticsearch(file_path, index_name, es, batch_size=1000):
    
    total_uploaded = 0
    total_failed = 0
    
    def create_action(doc):
        # doc=merge_nested_dictionaries(doc, default_template)
        return {
            "_index": index_name,
            "_id": uuid.uuid4(),
            "_source": doc
        }

    def read_and_upload_batch(data):
        batch = []
        for doc in data:
            batch.append(create_action(doc))
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    with open(file_path, 'rb') as f:
        data=pickle.load(f)
        for batch in read_and_upload_batch(data):
            try:
                success, failed = helpers.bulk(es, batch, raise_on_error=False)
                if type(failed) is list: 
                    failed=len(failed)
                total_uploaded += success
                total_failed += failed
                print(f"Uploaded {success} documents, Failed {failed} documents")
            except Exception as e:
                print(f"Error during bulk upload: {str(e)}")
                total_failed += len(batch)

    return total_uploaded, total_failed

In [11]:
from elasticsearch import Elasticsearch, helpers



In [12]:
try:
    es_endpoint ="http://127.0.0.1:9200"
    es_client = Elasticsearch(
        es_endpoint,
        #api_key=os.environ.get("ELASTIC_API_KEY")
    )
except Exception as e:
    print("No Client")
    es_client=None

In [13]:
index = "langchain-demo"

In [14]:
runs=200
profile_batch_size=1000

for i in range(runs):
    profiles = generate_profiles(profile_batch_size)
    filename='./data/personal_info.pkl'
    with open(filename, 'wb') as file:
        pickle.dump(profiles, file)
    try:
        bulk_upload_pickle_to_elasticsearch(filename, "langchain-demo", es_client)
    except Exception as e:
        print(traceback.print_exc())

Error during bulk upload: Connection timed out
Error during bulk upload: Connection timed out
Error during bulk upload: Connection timed out
Error during bulk upload: Connection timed out
Error during bulk upload: Connection timed out
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 documents, Failed 0 documents
Uploaded 1000 doc

# GET ELASTIC SCHEMA

In [15]:
import requests
import json

# Replace with your Elasticsearch host and index name
es_host = "http://localhost:9200"
index_name = "langchain-demo"

# Make the request to get the mapping
response = requests.get(f"{es_host}/{index_name}/_mapping")

# Check if the request was successful
if response.status_code == 200:
    mapping = response.json()
    with open("langchain-demo-mapping.json", "w") as f:
        json.dump(mapping, f, indent=4)

else:
    print(f"Error: {response.status_code}")

In [16]:
mapping.keys()

dict_keys(['langchain-demo'])

In [21]:
mapping['langchain-demo']['mappings']['properties'].keys()

dict_keys(['address', 'age', 'blood_type', 'citizenship', 'country_of_birth', 'cpf_number', 'date_of_birth', 'date_of_death', 'deceased', 'drivers_license_number', 'education', 'email', 'emergency_contact', 'gender', 'height_cm', 'immigration_status', 'languages', 'marital_status', 'name', 'national_service', 'nric', 'occupation', 'passport_number', 'phone_number', 'race', 'religion', 'weight_kg'])

# QUERY ELASTIC

In [63]:
query = {'query': {'bool': {'minimum_should_match': 3,
                    'should': [{'match': {'gender': 'female'}},
                               {'match': {'deceased': True}},
                               {'match': {'blood_type': 'o-'}},
                               {'match': {'country_of_birth': 'singapore'}}]}}}

In [64]:
search_results = es_client.search(index=index, body=query)

total_hits = search_results['hits']['total']['value']
print(f"Total matches: {total_hits}")

for hit in search_results['hits']['hits']:
    print(f"Score: {hit['_score']}")
    print(f"Name: {hit['_source']['name']}")
    print(f"Blood Type: {hit['_source']['blood_type']}")
    print(f"Gender: {hit['_source']['gender']}")
    print(f"Country of Birth: {hit['_source']['country_of_birth']}")
    print(f"Deceased: {hit['_source']['deceased']}")
    print("---")

Total matches: 10000
Score: 7.909445
Name: Huang Xiang Yin
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Yao Yan Jiao
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Norashikin bin Madiha
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Eliana bin Rashidah
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Natasha binti Raja
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Lim Zong Xuan
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Zheng Guirong Jinlan
Blood Type: O+
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Sarah binti Mohd
Blood Type: O+
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Deepika Palan
Blood Typ

# BUILD RAG COMPONENT AND TEST

In [65]:
prompt='''
You are an AI assistant specialized in converting natural language queries into Elasticsearch queries. Your task is to interpret user questions about personal profiles and generate the appropriate Elasticsearch query in JSON format.

The document schema for the profiles is as follows:

{
  "nric": "string",
  "name": "string",
  "race": "string",
  "gender": "string",
  "date_of_birth": "date",
  "age": "integer",
  "country_of_birth": "string",
  "citizenship": "string",
  "religion": "string" ["Buddhism", "Christianity", "Islam", "Hinduism", "Taoism", "No Religion"],
  "marital_status": "string" ["Single", "Married", "Divorced", "Separated", "Widowed", "Civil Partnership", "Domestic Partnership", "Engaged", "Annulled"],
  "address": {
    "block": "string",
    "street_no": "string",
    "street": "string",
    "unit": "string",
    "town": "string",
    "postal_code": "string"
  },
  "phone_number": "string",
  "email": "string",
  "occupation": "string",
  "cpf_number": "string",
  "education": {
    "highest_qualification": "string",
    "institution": "string"
  },
  "languages": {
    "spoken": {"language":"fluency" ["Basic", "Conversational", "Fluent", "Native"]},
    "written": {"language":"fluency" ["Basic", "Conversational", "Fluent", "Native"]},
  },
  "height_cm": "integer",
  "weight_kg": "integer",
  "blood_type": "string" ["A+", "A-", "B+", "B-", "O+", "O-", "AB+", "AB-"],
  "passport_number": "string",
  "drivers_license_number": "string",
  "national_service": {
    "status": "string",
    "rank": "string"
  },
  "immigration_status": "string",
  "emergency_contact": {
    "name": "string",
    "relationship": "string",
    "phone_number": "string"
  },
  "deceased": "boolean",
  "date_of_death": "date"
}
-----------------------------------------------------------------------------------
Example query 1:
User: Find all male Singapore citizens between 25 and 30 years old who work as software developers and speak fluent English.

Your response should be:

{
  "query": {
    "bool": {
      "should": [
        { "match": { "gender": "Male" } },
        { "match": { "citizenship": "Singapore Citizen" } },
        { "range": { "age": { "gte": 25, "lte": 30 } } },
        { "match": { "occupation": "Software Developer" } },
        {
          "match": {
            "languages.spoken.English": {
              "query": "Fluent",
              "fuzziness": "AUTO"
            }
          }
        }
      ],
      "minimum_should_match": 2
    }
  }
}
-------------------------------
Example query 2:
User: All non-Singaporean men over the age of 25 who are software people living in woodlands 

Your response should be:
{
  "query": {
    "bool": {
            "minimum_should_match": 4,
            "should": [{
                "bool": {
                  "must_not": [
                  {"match": {"citizenship": "singapore citizen"}}
                             ]
                            }
                        },
                               {"match": {"gender": "male"}},
                               {"range": {"age": {"gte": 25}}},
                               {"multi_match": {"fields": ["occupation",
                                                           "job_title",
                                                           "role"],
                                                "fuzziness": "AUTO",
                                                "query": "software",
                                                "type": "best_fields"}},
                               {"match": {"address.town": {"fuzziness": "AUTO",
                                                           "query": "woodlands"}}}]}}}

----------------------------------------------

Consider using multi_match for fields that might contain the value in different subfields:
{
  "multi_match": {
    "query": "Software Developer",
    "fields": ["occupation", "job_title", "role"],
    "type": "best_fields",
    "fuzziness": "AUTO"
  }
}

For names or other fields where word order matters, you might want to use match_phrase with slop:
{
  "match_phrase": {
    "full_name": {
      "query": "John Doe",
      "slop": 1
    }
  }
}

when using minimum_should_match, try to guaranty that the minumum number of clauses satisface the user query in Natural Language

When dealing with queries that involve categories, groups, or regions (such as language families, geographical areas, or professional fields), expand the search to include all relevant specific instances. 
For example, if asked about Slavic languages, include searches for Russian, Polish, Czech, etc. If asked about people from Europe, include searches for various European countries.


Generate a JSON query for Elasticsearch. Provide only the raw JSON without any surrounding tags or markdown formatting, because we need to convert your response to an object. 
Use a lenient approach with 'should' clauses instead of strict 'must' clauses. Include a 'minimum_should_match' parameter to ensure some relevance while allowing flexibility. Avoid using 'must' clauses entirely.
All queries must be lowercase.

Use 'match' queries instead of 'term' queries to allow for partial matches and spelling variations. Where appropriate, include fuzziness parameters to further increase tolerance for spelling differences. 
For name fields or other phrases where word order matters, consider using 'match_phrase' with a slop parameter. Use 'multi_match' for fields that might contain the value in different subfields.

Try to create a query which satisfaces most closely what the user is requesting.
let's think step by step

'''

In [27]:
# ELastic Query Model

from langchain_ollama import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# LLM
llm = ChatOllama(model="llama3.2:latest", format="json", temperature=0.1, top_p=.9, tok_k=30, num_ctx=8192)

prompt2 = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> {prompt}
     <|eot_id|><|start_header_id|>user<|end_header_id|>

    Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "prompt"],
)

elastic_llm = prompt2 | llm 


# LLama 3.2 3Billions

In [66]:
question = "All men over the age of 35, who  are living in Tanjong Pagar"

response =elastic_llm.invoke({"question": question, "prompt": prompt})

es_query=json.loads(response.content)
pprint(es_query)

{'query': {'bool': {'minimum_should_match': 2,
                    'should': [{'match': {'gender': 'male'}},
                               {'range': {'age': {'gte': 35}}},
                               {'match': {'address.town': {'fuzziness': 'AUTO',
                                                           'query': 'Tanjong '
                                                                    'Pagar'}}}]}}}


In [67]:
search_results = es_client.search(index=index, body=es_query)

total_hits = search_results['hits']['total']['value']
print(f"Total matches: {total_hits}")

for hit in search_results['hits']['hits']:
    print(f"Score: {hit['_score']}")
    print(f"Name: {hit['_source']['name']}")
    print(f"Age: {hit['_source']['age']}")
    print(f"Gender: {hit['_source']['gender']}")
    print(f"Citizenship: {hit['_source']['citizenship']}")
    print(f"Occupation: {hit['_source']['occupation']}")
    print(f"Address: {hit['_source']['address']}")
    print("---")

Total matches: 10000
Score: 1.6929345
Name: Borhan bin Farhanah
Age: 63
Gender: Male
Citizenship: Foreigner
Occupation: Agricultural Engineer
Address: {'block': '224', 'street No.': '716', 'street': 'Toa Payoh Lorong 1', 'unit': '#24-342', 'town': 'Jurong West', 'postal_code': '312210'}
---
Score: 1.6929345
Name: Chua Shulan Liang
Age: 66
Gender: Male
Citizenship: Singapore PR
Occupation: Photographer
Address: {'block': '734', 'street No.': '571', 'street': 'Tampines Avenue 5', 'unit': '#7-250', 'town': 'Punggol', 'postal_code': '105993'}
---
Score: 1.6929345
Name: Zu Zhen
Age: 64
Gender: Male
Citizenship: Singapore PR
Occupation: Microbiologist
Address: {'block': '649', 'street No.': '621', 'street': 'Commonwealth Avenue', 'unit': '#30-291', 'town': 'Punggol', 'postal_code': '317005'}
---
Score: 1.6929345
Name: Zhou Shu Ti
Age: 55
Gender: Male
Citizenship: Singapore PR
Occupation: Environmental Scientist
Address: {'block': '560', 'street No.': '486', 'street': 'Serangoon Road', 'unit'

# Gemini

In [68]:
query=  "All men over the age of 35, who  are living in Tanjong Pagar"
response=LLM.generate_non_streaming_response(query, system_prompt=prompt)


In [70]:
json.loads(response)

{'query': {'bool': {'minimum_should_match': 2,
   'should': [{'match': {'gender': 'male'}},
    {'range': {'age': {'gte': 35}}},
    {'match': {'address.town': {'query': 'tanjong pagar',
       'fuzziness': 'AUTO'}}}]}}}

In [71]:
es_query=json.loads(response)
pprint(es_query)

{'query': {'bool': {'minimum_should_match': 2,
                    'should': [{'match': {'gender': 'male'}},
                               {'range': {'age': {'gte': 35}}},
                               {'match': {'address.town': {'fuzziness': 'AUTO',
                                                           'query': 'tanjong '
                                                                    'pagar'}}}]}}}


In [72]:
search_results = es_client.search(index=index, body=es_query)

total_hits = search_results['hits']['total']['value']
print(f"Total matches: {total_hits}")

for hit in search_results['hits']['hits']:
    print(f"Score: {hit['_score']}")
    print(f"Name: {hit['_source']['name']}")
    print(f"Age: {hit['_source']['age']}")
    print(f"Gender: {hit['_source']['gender']}")
    print(f"Citizenship: {hit['_source']['citizenship']}")
    print(f"Occupation: {hit['_source']['occupation']}")
    print(f"Address: {hit['_source']['address']}")
    print("---")

Total matches: 10000
Score: 1.6929345
Name: Borhan bin Farhanah
Age: 63
Gender: Male
Citizenship: Foreigner
Occupation: Agricultural Engineer
Address: {'block': '224', 'street No.': '716', 'street': 'Toa Payoh Lorong 1', 'unit': '#24-342', 'town': 'Jurong West', 'postal_code': '312210'}
---
Score: 1.6929345
Name: Chua Shulan Liang
Age: 66
Gender: Male
Citizenship: Singapore PR
Occupation: Photographer
Address: {'block': '734', 'street No.': '571', 'street': 'Tampines Avenue 5', 'unit': '#7-250', 'town': 'Punggol', 'postal_code': '105993'}
---
Score: 1.6929345
Name: Zu Zhen
Age: 64
Gender: Male
Citizenship: Singapore PR
Occupation: Microbiologist
Address: {'block': '649', 'street No.': '621', 'street': 'Commonwealth Avenue', 'unit': '#30-291', 'town': 'Punggol', 'postal_code': '317005'}
---
Score: 1.6929345
Name: Zhou Shu Ti
Age: 55
Gender: Male
Citizenship: Singapore PR
Occupation: Environmental Scientist
Address: {'block': '560', 'street No.': '486', 'street': 'Serangoon Road', 'unit'

# LLama 3.2 3B

In [75]:
question = "Women who are not alive currently, who are universal blood donors born in singapore"

response =elastic_llm.invoke({"question": question, "prompt": prompt})

es_query=json.loads(response.content)
pprint(es_query)

{'query': {'bool': {'minimum_should_match': 2,
                    'should': [{'match': {'gender': 'Female'}},
                               {'match': {'deceased': 'true'}},
                               {'match': {'citizenship': 'Singapore Citizen'}},
                               {'range': {'date_of_birth': {'gte': None,
                                                            'lte': 'now/days '
                                                                   'ago'}}},
                               {'match': {'blood_type': 'Universal Donor'}}]}}}


In [76]:
search_results = es_client.search(index=index, body=es_query)

total_hits = search_results['hits']['total']['value']
print(f"Total matches: {total_hits}")

for hit in search_results['hits']['hits']:
    print(f"Score: {hit['_score']}")
    print(f"Name: {hit['_source']['name']}")
    print(f"Blood Type: {hit['_source']['blood_type']}")
    print(f"Gender: {hit['_source']['gender']}")
    print(f"Country of Birth: {hit['_source']['country_of_birth']}")
    print(f"Deceased: {hit['_source']['deceased']}")
    print("---")

BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'operator not supported for date math [/days ago]')

In [77]:
query="Women who are not alive currently, who are universal blood donors born in singapore" 

response=LLM.generate_non_streaming_response(query, system_prompt=prompt)
es_query=json.loads(response)
pprint(es_query)

{'query': {'bool': {'minimum_should_match': 3,
                    'should': [{'match': {'gender': 'female'}},
                               {'match': {'deceased': True}},
                               {'match': {'blood_type': 'o-'}},
                               {'match': {'country_of_birth': 'singapore'}}]}}}


In [78]:
search_results = es_client.search(index=index, body=es_query)

total_hits = search_results['hits']['total']['value']
print(f"Total matches: {total_hits}")

for hit in search_results['hits']['hits']:
    print(f"Score: {hit['_score']}")
    print(f"Name: {hit['_source']['name']}")
    print(f"Blood Type: {hit['_source']['blood_type']}")
    print(f"Gender: {hit['_source']['gender']}")
    print(f"Country of Birth: {hit['_source']['country_of_birth']}")
    print(f"Deceased: {hit['_source']['deceased']}")
    print("---")

Total matches: 10000
Score: 7.909445
Name: Huang Xiang Yin
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Yao Yan Jiao
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Norashikin bin Madiha
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Eliana bin Rashidah
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Natasha binti Raja
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Lim Zong Xuan
Blood Type: O-
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Zheng Guirong Jinlan
Blood Type: O+
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Sarah binti Mohd
Blood Type: O+
Gender: Female
Country of Birth: Singapore
Deceased: True
---
Score: 7.909445
Name: Deepika Palan
Blood Typ

# LLama 3.2 3B

In [79]:
question = "People with height equal to 175 centimeters" 

response =elastic_llm.invoke({"question": question, "prompt": prompt})

es_query=json.loads(response.content)
pprint(es_query)

{'query': {'bool': {'minimum_should_match': 1,
                    'should': [{'match': {'height_cm': '175'}}]}}}


In [80]:
search_results = es_client.search(index=index, body=es_query)

total_hits = search_results['hits']['total']['value']
print(f"Total matches: {total_hits}")

for hit in search_results['hits']['hits']:
    print(f"Score: {hit['_score']}")
    print(f"Name: {hit['_source']['name']}")
    print(f"languages: {hit['_source']}")
    print("---")

Total matches: 4798
Score: 1.0
Name: Lim Chao
languages: {'nric': 'S71077031I', 'name': 'Lim Chao', 'race': 'Indian', 'gender': 'Female', 'date_of_birth': '1997-04-26', 'age': 27, 'country_of_birth': 'Botswana', 'citizenship': 'Singapore Citizen', 'religion': 'No Religion', 'marital_status': 'Civil Partnership', 'address': {'block': '635', 'street No.': '436', 'street': 'Eu Tong Sen Street', 'unit': '#9-184', 'town': 'Ang Mo Kio', 'postal_code': '651374'}, 'phone_number': '+65 9674 3504', 'email': 'n3wkolgv@yahoo.com', 'occupation': 'Endocrinologist', 'cpf_number': 'S2336914N', 'education': {'highest_qualification': 'N-Levels', 'institution': 'NTU'}, 'languages': {'spoken': {'Portuguese': 'Native', 'Dzongkha': 'Conversational'}, 'written': {'Bosnian': 'Conversational', 'Zulu': 'Native'}}, 'height_cm': 175, 'weight_kg': 93, 'blood_type': 'A-', 'passport_number': 'K4534843L', 'drivers_license_number': 'S9379748K', 'national_service': {'status': None, 'rank': None}, 'immigration_status': 

# Gemini 1.5 Pro

In [81]:
query="People which height is equal to 175 centimeters" 

response=LLM.generate_non_streaming_response(query, system_prompt=prompt)
es_query=json.loads(response)
pprint(es_query)

{'query': {'match': {'height_cm': '175'}}}


In [82]:
search_results = es_client.search(index=index, body=es_query)

total_hits = search_results['hits']['total']['value']
print(f"Total matches: {total_hits}")

for hit in search_results['hits']['hits']:
    print(f"Score: {hit['_score']}")
    print(f"Name: {hit['_source']['name']}")
    print(f"languages: {hit['_source']}")
    print("---")

Total matches: 4798
Score: 1.0
Name: Lim Chao
languages: {'nric': 'S71077031I', 'name': 'Lim Chao', 'race': 'Indian', 'gender': 'Female', 'date_of_birth': '1997-04-26', 'age': 27, 'country_of_birth': 'Botswana', 'citizenship': 'Singapore Citizen', 'religion': 'No Religion', 'marital_status': 'Civil Partnership', 'address': {'block': '635', 'street No.': '436', 'street': 'Eu Tong Sen Street', 'unit': '#9-184', 'town': 'Ang Mo Kio', 'postal_code': '651374'}, 'phone_number': '+65 9674 3504', 'email': 'n3wkolgv@yahoo.com', 'occupation': 'Endocrinologist', 'cpf_number': 'S2336914N', 'education': {'highest_qualification': 'N-Levels', 'institution': 'NTU'}, 'languages': {'spoken': {'Portuguese': 'Native', 'Dzongkha': 'Conversational'}, 'written': {'Bosnian': 'Conversational', 'Zulu': 'Native'}}, 'height_cm': 175, 'weight_kg': 93, 'blood_type': 'A-', 'passport_number': 'K4534843L', 'drivers_license_number': 'S9379748K', 'national_service': {'status': None, 'rank': None}, 'immigration_status': 