In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [2]:
import json
from sentence_transformers import SentenceTransformer

# Load the JSON data
def load_json(file_path):
    """Read the UTF-8 encoded file at ``file_path`` and return its parsed JSON content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Extract sentences and metadata from the JSON structure
def prepare_data(data):
    """Flatten the policy JSON into a list of searchable documents.

    Args:
        data: Parsed JSON object whose "PRIVACY POLICY" key holds a list of
            section dicts. A section may carry "Title", "Content" and a
            "Subheaders" list of dicts with their own "Title" and "Content".
            Missing or empty fields are tolerated.

    Returns:
        A list of ``{"text": ..., "context": ...}`` dicts: one per section
        with non-empty content (context is the section title) and one per
        subheader with non-empty content (context is
        ``"<section title> > <subheader title>"``).

    Raises:
        KeyError: If ``data`` has no "PRIVACY POLICY" key.
    """
    documents = []
    # Access the list of sections within the "PRIVACY POLICY" key
    for section in data["PRIVACY POLICY"]:
        section_title = section.get("Title", "")

        # Sections that only act as a heading (no body text) are not indexed.
        section_content = section.get("Content")
        if section_content:
            documents.append({"text": section_content, "context": section_title})

        # "Subheaders" may be absent or null; both mean "no subheaders".
        for subheader in section.get("Subheaders") or []:
            sub_content = subheader.get("Content")
            if not sub_content:
                # Same rule as for sections: an empty text would only add a
                # meaningless candidate to the similarity search.
                continue
            documents.append({
                "text": sub_content,
                "context": f"{section_title} > {subheader.get('Title', '')}",
            })
    return documents

# Embed the data using SentenceTransformer
def embed_data(documents, model_name="all-MiniLM-L6-v2"):
    """Encode the "text" field of every document with a SentenceTransformer.

    A new ``SentenceTransformer(model_name)`` is created on each call, and the
    value returned by its ``encode`` method (one embedding per document, in
    input order) is handed back unchanged.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode([entry["text"] for entry in documents])

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# SentenceTransformer instances keyed by model name, so repeated queries reuse
# an already-loaded model instead of re-loading it on every call.
_QUERY_MODEL_CACHE = {}


# Find the most relevant document based on cosine similarity
def find_most_relevant(query, documents, embeddings, model_name="all-MiniLM-L6-v2"):
    """Return the document whose embedding is closest to ``query``.

    Args:
        query: Question text to embed and compare.
        documents: Sequence of document dicts, index-aligned with ``embeddings``.
        embeddings: 2-D array-like of document embeddings, as accepted by
            ``cosine_similarity``.
        model_name: SentenceTransformer model used to embed the query; it
            should be the same model that produced ``embeddings``.

    Returns:
        A ``(document, similarity)`` tuple for the highest cosine similarity.
    """
    model = _QUERY_MODEL_CACHE.get(model_name)
    if model is None:
        # Loading the model is the expensive step; do it once per model name.
        model = SentenceTransformer(model_name)
        _QUERY_MODEL_CACHE[model_name] = model

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings).flatten()
    most_relevant_idx = np.argmax(similarities)
    return documents[most_relevant_idx], similarities[most_relevant_idx]

# Example usage
if __name__ == "__main__":
    # Location of the policy JSON on disk.
    file_path = "privacy_policy.json"  # Replace with your JSON file path
    data = load_json(file_path)

    # Flatten the policy into documents, then embed every document text.
    documents = prepare_data(data)
    embeddings = embed_data(documents)

    # Run a single sample query and report the best match, one field per line.
    query = "How many types of data collected?"
    relevant_doc, similarity = find_most_relevant(query, documents, embeddings)
    print(
        f"Query: {query}",
        f"Most Relevant Context: {relevant_doc['context']}",
        f"Answer: {relevant_doc['text']}",
        f"Similarity: {similarity}",
        sep="\n",
    )


Query: How many types of data collected?
Most Relevant Context: Information Collection and Use
Answer: We collect several different types of information for various purposes to provide and improve our Service to you. 
Similarity: 0.45990991592407227


In [5]:
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load JSON data
def load_json(file_path):
    """Parse the JSON document stored at ``file_path`` (opened as UTF-8 text) and return it."""
    with open(file_path, "r", encoding="utf-8") as stream:
        payload = json.load(stream)
    return payload

# Prepare the data with weighted embeddings
def prepare_and_embed_data(data, model, title_weight=2.0):
    """Build documents and title-weighted embeddings from policy sections.

    Every embedding is ``model.encode(title) * title_weight + model.encode(text)``,
    so the title contributes ``title_weight`` times as much as the body text.

    Args:
        data: Iterable of section dicts. Each may carry "Title", "Content" and
            a "Subheaders" list of dicts with "Title" and "Content". Missing or
            null fields are treated as empty.
        model: Object exposing ``encode(text)`` that returns a vector
            supporting scalar multiplication and addition (a
            SentenceTransformer in this notebook).
        title_weight: Multiplier applied to the title embedding.

    Returns:
        ``(documents, embeddings)`` where ``documents`` is a list of
        ``{"text": ..., "context": ...}`` dicts and ``embeddings`` is a numpy
        array with one row per document, in the same order. A section without
        subheaders yields one document; a section with subheaders yields one
        document per subheader followed by one summary document for the header.
    """
    documents = []
    embeddings = []

    def _weighted_embedding(title_text, body_text):
        # Single place for the weighting formula used by every document kind.
        return model.encode(title_text) * title_weight + model.encode(body_text)

    # Process each main section
    for section in data:
        title = section.get("Title") or ""
        content = section.get("Content") or ""
        subheaders = section.get("Subheaders") or []

        # If there are no subheaders, the section itself is the only document.
        if not subheaders:
            documents.append({"text": f"{title} {content}", "context": title})
            embeddings.append(_weighted_embedding(title, content))
            continue

        # One document per subheader; collect "Title: Content" pieces on the
        # way so the header summary below reads the fields the same safe way.
        labelled_parts = []
        for subheader in subheaders:
            sub_title = subheader.get("Title") or ""
            sub_content = subheader.get("Content") or ""

            documents.append({"text": sub_content, "context": sub_title})
            embeddings.append(_weighted_embedding(sub_title, sub_content))
            labelled_parts.append(f"{sub_title}: {sub_content}")

        # Add a weighted embedding for the header with combined subheaders' content
        subheader_contents = " ".join(labelled_parts)
        header_combined_text = f"{title} {content} {subheader_contents}"
        documents.append({"text": header_combined_text, "context": title})
        embeddings.append(_weighted_embedding(title, header_combined_text))

    # Convert embeddings to a numpy array for similarity calculations
    embeddings = np.array(embeddings)
    return documents, embeddings

# Find the most relevant response
def find_most_relevant(query, documents, embeddings, model):
    """Pick the document whose embedding has the highest cosine similarity to ``query``.

    ``model.encode(query)`` produces the query vector, which is compared
    against every row of ``embeddings``. Returns ``(document, similarity)``
    for the best-scoring row; ``documents`` must be index-aligned with
    ``embeddings``.
    """
    query_vector = model.encode(query)
    # cosine_similarity returns a (1, n_documents) matrix; keep its only row.
    scores = cosine_similarity([query_vector], embeddings)[0]
    best = np.argmax(scores)
    return documents[best], scores[best]

# Main function
if __name__ == "__main__":
    # Load data
    file_path = "privacy_policy.json"  # Replace with your JSON file path
    json_data = load_json(file_path)["PRIVACY POLICY"]

    # Load the model once; the same instance embeds the documents and every query.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Prepare and embed data
    documents, embeddings = prepare_and_embed_data(json_data, model)

    # Example queries
    queries = [
        "How many types of data collected?",
        "What is Personal Data?",
        "What are cookies?",
        "What is privacy policy?",
        "What is the latest update?",
        "Personal Data included email address and what more?"
    ]

    # A match only counts as an answer when its cosine similarity exceeds this value.
    thresh_hold = 0.5
    for query in queries:
        result, similarity = find_most_relevant(query, documents, embeddings, model)
        print(f"Query: {query}")
        # Both branches report the matched context; the low-similarity branch
        # used to print result['text'] under this label by mistake.
        print(f"Most Relevant Context: {result['context']}")
        if similarity > thresh_hold:
            print(f"Answer: {result['text']}")
            print(f"Similarity: {similarity}")
        else:
            print(f"Similarity: {similarity}")
            print("No suitable answer")
        print("\n" + "-" * 50 + "\n")


Query: How many types of data collected?
Most Relevant Context: Types of Data Collected
Answer: Types of Data Collected  Personal Data: While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to: Email address; First name and last name; Phone number; Address, State, Province, ZIP/Postal code, City; Cookies and Usage Data;  Usage Data: We may also collect information that your browser sends whenever you visit our Service or when you access the Service by or through a mobile device ("Usage Data"). This Usage Data may include information such as your computer's Internet Protocol address (e.g. IP address), browser type, browser version, the pages of our Service that you visit, the time and date of your visit, the time spent on those pages, unique device identifiers, and other diagnostic data. 
Similarity: 0.8671891

In [6]:
import google.generativeai as genai


In [7]:
def make_request(query, thresh_hold=0.5):
    """Answer ``query`` with Gemini, grounded in the best-matching policy document.

    The best match is looked up with ``find_most_relevant`` using the
    notebook-level ``documents``, ``embeddings`` and ``model``. When its
    similarity exceeds ``thresh_hold`` the match is passed to Gemini as the
    basis for the answer; otherwise Gemini is asked to answer and to refer the
    user to the privacy-policy web page. The generated text is printed.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable
    instead of being stored in the notebook. Any key that was previously
    committed in source should be revoked.

    Args:
        query: Question to answer.
        thresh_hold: Minimum cosine similarity for the retrieved document to
            be used as grounding.

    Raises:
        RuntimeError: If ``GOOGLE_API_KEY`` is not set.
    """
    import os  # local import keeps this cell self-contained

    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        # Fail before doing any retrieval or network work.
        raise RuntimeError(
            "Set the GOOGLE_API_KEY environment variable before calling make_request()."
        )

    result, similarity = find_most_relevant(query, documents, embeddings, model)

    genai.configure(api_key=api_key)
    model_2 = genai.GenerativeModel("gemini-1.5-flash")
    if similarity > thresh_hold:
        # Confident match: hand the retrieved document to the model as grounding.
        prompt = f"You are an assitant about company privacy policy. Your task is to answer {query} base on this mock answer {result}. Please save as much infomation of {result} as possible and also answer reasonably"
    else:
        # Weak match: no grounding document, ask for an answer plus a website referral.
        prompt = f"You are an assitant about company privacy policy. Your task is to answer {query} and then told \"If you want to get further information about {query}, please visit our website https://www.presight.io/privacy-policy.html\""
    response = model_2.generate_content(prompt)
    print(response.text)


In [8]:
# Ask a question and let make_request() print the generated reply.
# The reply contains no website referral, so this presumably took the
# high-similarity (grounded) branch — confirm by inspecting the similarity.
query="What is Personal data?"
make_request(query)


Personal data is any information that can be used to contact or identify you.  This includes, but is not limited to, your email address, first and last name, phone number, address (including state, province, ZIP/postal code, and city), cookies, and usage data.  The company may request this information while you are using its service.



In [9]:
# Same question as the earlier retrieval-only cells, now routed through
# make_request() so the generated reply is printed instead of the raw match.
query = "How many Types of Data Collected?"
make_request(query)

Based on the provided text, there are two types of data collected:

1. **Personal Data:** This includes information that can be used to identify or contact a specific individual.  Examples given include email address, name, phone number, address, and cookies.  Note that "Cookies and Usage Data" are listed here, implying that cookies are considered a subset of Personal Data in this context.

2. **Usage Data:** This encompasses information automatically collected when a user interacts with the service. Examples include IP address, browser type, pages visited, timestamps of visits, and device identifiers.


Therefore, the answer is $\boxed{2}$



In [10]:
# The printed reply ends with the website referral, which only the
# low-similarity prompt in make_request() asks for.
query = "What is policy?"
make_request(query)

A policy is a set of rules and guidelines that a company creates to govern its operations and interactions with its users, customers, and employees.  It outlines the company's approach to various aspects of its business, such as data handling, intellectual property, and employee conduct.  Our privacy policy, for example, details how we collect, use, and protect your personal information.

If you want to get further information about what a policy is, specifically regarding our company's practices, please visit our website: https://www.presight.io/privacy-policy.html



In [11]:
# Ask how collected data is used; make_request() prints the generated reply.
# No website referral appears in the reply, so this presumably took the
# high-similarity (grounded) branch — confirm by inspecting the similarity.
query="What the company uses the collected data for?"
make_request(query)

Presight uses collected data to:

* **Maintain and improve its service:** This includes providing the service itself, monitoring its usage, and gathering analysis to identify areas for improvement.  They also use data to detect, prevent, and address technical issues.

* **Communicate with users:** This covers notifying users about service changes and providing customer support.

* **Enable user interaction:**  Data is used to allow users to participate in interactive features of the service, but only when they choose to do so.



In [12]:
# Off-topic probe. The printed reply ends with the website referral, which
# only the low-similarity prompt in make_request() asks for.
# NOTE(review): the fallback prompt still has the model answer the unrelated
# question before adding the referral — consider declining instead.
query = "What is sausage?"
make_request(query)

Sausage is a food typically made from ground meat, often pork, beef, or poultry, seasoned and stuffed into a casing.  It can be cured, smoked, or cooked in various ways.

If you want to get further information about *What is sausage?*, please visit our website https://www.presight.io/privacy-policy.html

