1. Process data

    1. Read documents
    2. Chunk documents
    3. Store and index documents

In [12]:
import os
import json
import openai
import pandas as pd
from dotenv import load_dotenv
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from demo_utils import demo_utils
from openai.embeddings_utils import cosine_similarity
from IPython.display import Markdown

# demo_utils is a project-local helper (not shown in this notebook); later cells
# call utils.similarity() and utils.run() on this instance.
utils = demo_utils()
# Handle to the ChromaDB collection that docx2txt() adds document chunks to.
# NOTE(review): the method name suggests the collection is created when it does
# not exist yet — confirm in demo_utils.
collection = utils.getorcreatecollection()

1.1. Read document

In [8]:
CHUNK_SIZE = 5000
# The overlap is a character count, so keep it an int (CHUNK_SIZE*0.1 would be the float 500.0).
CHUNK_OVERLAP = CHUNK_SIZE // 10


def docx2txt(filepath, extract_page: bool=False, target_path: str="./chunks-txt", collection=None):
    """Convert a .docx file into numbered .txt files and optionally index them.

    Args:
        filepath: Path of the .docx file to load with Docx2txtLoader.
        extract_page: If True, write each loaded document as-is (no splitting,
            no indexing). If False, split the text into chunks of at most
            CHUNK_SIZE characters on blank lines before writing.
        target_path: Directory that receives ``<docx name>_<n>.txt`` files.
            It is created when missing.
        collection: Optional collection object exposing ``add(documents=, ids=)``
            (a ChromaDB collection in this notebook). When given, every chunk
            is added under the ID ``<docx name>_<n>``.

    Returns:
        None. The function only writes files and adds to ``collection``.
    """
    loader = Docx2txtLoader(filepath)
    data = loader.load()
    prefixfilename = os.path.splitext(os.path.basename(filepath))[0]

    # os.path.join keeps relative targets relative to the working directory and
    # also accepts absolute targets, unlike the previous f"./{target_path}/..." form.
    os.makedirs(target_path, exist_ok=True)

    if extract_page:
        print(data)
        for i, doc in enumerate(data):
            with open(os.path.join(target_path, f"{prefixfilename}_{i}.txt"), "w") as f:
                f.write(doc.page_content)
        return

    text_splitter = CharacterTextSplitter(separator="\n\n",
                                          chunk_size=CHUNK_SIZE,
                                          chunk_overlap=CHUNK_OVERLAP,
                                          length_function=len)
    texts = text_splitter.split_documents(data)

    # enumerate() gives every chunk its own index. The counter was previously
    # never advanced, so all chunks were written to <name>_0.txt and added to
    # the collection under the same ID.
    for i, t in enumerate(texts):
        chunk_id = f"{prefixfilename}_{i}"
        # save text as txt file
        with open(os.path.join(target_path, f"{chunk_id}.txt"), "w") as f:
            f.write(t.page_content)
        if collection is not None:
            # save text to chromadb
            collection.add(
                documents=[t.page_content],
                ids=[chunk_id],
            )


1.2. Chunk document 

Set the source and target paths, then chunk every `.docx` document found under the source folder.

In [9]:
# Source layout: doc=docx/version=<n>/<document>.docx
source_path = "doc=docx"
source_files = []

# sorted() makes the discovery order deterministic; os.listdir() order is arbitrary.
for subfolder in sorted(os.listdir(source_path)):
    subfolder_path = f"{source_path}/{subfolder}"
    # Skip stray files (e.g. .DS_Store) and folders that carry no version label;
    # both used to abort the cell with NotADirectoryError / IndexError.
    if not os.path.isdir(subfolder_path) or "version=" not in subfolder:
        continue
    version = subfolder.split("version=")[1]
    for file in sorted(os.listdir(subfolder_path)):
        # "~$..." files are lock files Word leaves next to an open document.
        if file.endswith(".docx") and not file.startswith("~$"):
            source_file_path = f"{subfolder_path}/{file}"
            source_files.append({"source_file_path": source_file_path, "version": version})

print(source_files)

# create a folder named contents=chunks/doc=txt
target_path = "contents=chunks/doc=txt"
os.makedirs(f"./{target_path}", exist_ok=True)

# Each version folder is emptied once, before its first document is processed.
# Emptying it per document would delete the chunks of earlier documents that
# belong to the same version.
cleaned_version_dirs = set()

for item in source_files:
    version_dir = f"./{target_path}/version={item['version']}"
    os.makedirs(version_dir, exist_ok=True)
    if version_dir not in cleaned_version_dirs:
        # delete stale chunk files left over from a previous run
        for file in os.listdir(version_dir):
            stale_path = f"{version_dir}/{file}"
            if os.path.isfile(stale_path):
                os.remove(stale_path)
        cleaned_version_dirs.add(version_dir)
    # docx2txt() already treats collection=None as "do not index", so one call
    # covers both the indexed and the file-only case.
    docx2txt(item['source_file_path'], extract_page=False, target_path=version_dir, collection=collection)


[{'source_file_path': 'doc=docx/version=1/Data Protection in Relational Databases v1.docx', 'version': '1'}, {'source_file_path': 'doc=docx/version=2/Data Protection in Relational Databases v2.docx', 'version': '2'}]


Insert of existing embedding ID: Data Protection in Relational Databases v1_0
Add of existing embedding ID: Data Protection in Relational Databases v1_0
Insert of existing embedding ID: Data Protection in Relational Databases v2_0
Add of existing embedding ID: Data Protection in Relational Databases v2_0


In [None]:
# if collection is not None:
#     # print("collection is not None")
#     # save collection to chromadb
#     # collection.get(ids=["Data Protection in Relational Databases v1_0"])
#     print(collection.query(
#         query_texts=["encryption"],
#         include=["metadatas", "documents", "distances"]
#     ))

Check versions

In [11]:
# Version label of every discovered source document, in discovery order.
versions = [entry['version'] for entry in source_files]
print(versions)

['1', '2']


Get the files for both version 1 and version 2

In [None]:
def _chunk_sort_key(path):
    """Sort key that orders chunk files by document name, then numeric chunk index.

    A plain string sort would place ``..._10.txt`` before ``..._2.txt``.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    prefix, _, suffix = stem.rpartition("_")
    if suffix.isdecimal():
        return (prefix, int(suffix))
    return (stem, -1)


def _list_chunk_files(version):
    """Return the chunk file paths of one version folder in chunk order."""
    folder = f"./{target_path}/version={version}"
    paths = [f"{folder}/{name}" for name in os.listdir(folder)]
    return sorted((p for p in paths if os.path.isfile(p)), key=_chunk_sort_key)


# Get the list of chunk files for both versions.
# The analysis step pairs version1_files[i] with version2_files[i], so both
# lists need a well-defined order; os.listdir() alone does not guarantee one.
# Each version folder is scanned once even if `versions` lists it several times.
version1_files = _list_chunk_files("1") if "1" in versions else []
version2_files = _list_chunk_files("2") if "2" in versions else []

# print the number of files in each version
print("Version 1 has", len(version1_files), "files")
print("Version 2 has", len(version2_files), "files")

2. Analysis

In [None]:
# system_message = """
# You are a compliance reviewer in a security team. You are responsible for reviewing the compliance policies and providing insights. 

# ## Review process
# There are two versions of the documents with cosine similarity. 
# If the cosine similarity is less than 0.9, then the reviewer needs to review the differences.
# Version 1 has base line. 
# Version 2 has updated document. 

# ## Review guideline
# Make a markdown table to show the difference that is found during the process.
# The table includes Line, Versions, and the exact differences in bold.
# If there is no difference, then do not return a table and say "no difference is found"
# If two documents are completely different then return "Not able to compare"
# For example, if version 2 has completely different topic or topics than version 1, then return "Not able to compare"

# ## Response Example
# [Use a table to summarize the answers]
# |Item Number|Line|Version 1|Version 2|  
# |-|-|-|-|  
# |1|5|Access control can be implemented at different levels of the database, such as the schema, table, column, row, or cell level.|Access control can be implemented at different levels of the database, such as the schema, table, column, row, or cell level. **The view and the stored procedures also need to have controlled access.**|  
# """

In [None]:
# System prompt for the per-chunk comparison step. It asks the model to answer
# with a JSON object wrapped in a ``` fence and shows three worked examples
# (similar, identical, unrelated). The analysis loop later parses the reply with
# response.split("```")[1], so the fenced-JSON contract matters.
# NOTE(review): rule 2 says not to return 'N/A' in version1/version2, yet the
# "identical" example below returns "N/A" for both fields — confirm which
# behaviour is intended and align the rule and the example.
# NOTE(review): "wraped" and "retuen" are typos; the text is left untouched here
# because editing the prompt changes what the model receives.
system_message = """
You are a compliance reviewer in a security team. You are responsible for reviewing the compliance policies and providing insights. 

## Review process
There are two versions of the documents with cosine similarity. 
If the cosine similarity is less than 0.9, then the reviewer needs to review the differences.
Version 1 is baseline. 
Version 2 is new document. 

## Review guideline
### 1. Rule
Response in json format that wraped with '```'
In the 'note' field, you can add additional information about the difference. added, removed, or modified.

### 2. When the two versions are identical
Use 'No difference is found' in the note field. 
Do not retuen 'N/A' in fields, version1 and version2.

### 3. When the two versions are off the topic or completely different
Use "Not able to compare" in the note field.

### 4. When the two versions are similar but not identical
Return the difference in the field, version1 and version2.

## Examples
### Example When the two versions are similar but not identical
The cosine similarity between the two documents is 0.993.

---version 1 --- version1 file name Databases v1_0.txt
Access control can be implemented at different levels of the database, such as the schema, table, column, row, or cell level.
------

---version 2 --- version2 file name Databases v2_0.txt
Access control can be implemented at different levels of the database, such as the schema, table, column, row, or cell level. The view and the stored procedures also need to have controlled access.
------

Your Answer:
```
{
    "similarity": 0.993,
    "version1_file_name":"Databases v1_0.txt",
    "version2_file_name":"Databases v2_0.txt",
    "version1":"Access control can be implemented at different levels of the database, such as the schema, table, column, row, or cell level.",
    "version2":"Access control can be implemented at different levels of the database, such as the schema, table, column, row, or cell level. **The view and the stored procedures also need to have controlled access.**",
    "note":"New content is added"
}
```
### Example When the two versions are identical
The cosine similarity between the two documents is 0.992.

---version 1 --- version1 file name Databases v1_0.txt
Access control can be implemented at different levels of the database.
------

---version 2 --- version2 file name Databases v2_0.txt
Access control can be implemented at different levels of the database.
------

Your Answer:
```
{
    "similarity": 0.992,
    "version1_file_name":"Databases v1_0.txt",
    "version2_file_name":"Databases v2_0.txt",
    "version1":"N/A",
    "version2":"N/A",
    "note":"No difference is found"
}
```
### Example When the two versions are off the topic or completely different
The cosine similarity between the two documents is 0.990.

---version 1 --- version1 file name Databases v1_0.txt
Access control can be implemented at different levels of the database.
------

---version 2 --- version2 file name Databases v2_0.txt
Americano is made with espresso and hot water.
------

Your Answer:
```
{
    "similarity": 0.990,
    "version1_file_name":"Databases v1_0.txt",
    "version2_file_name":"Databases v2_0.txt",
    "version1":"N/A",
    "version2":"N/A",
    "note":"Not able to compare"
}
```
"""

In [None]:
# User-message template for one pair of chunks. The {{...}} placeholders are
# filled in with str.replace() by the analysis loop (not with str.format), and
# the layout mirrors the examples in system_message.
user_prompt = """
The cosine similarity between the two documents is {{similarity}}.

---version 1 --- version1 file name {{version1_file_name}}
{{version1}}
------

---version 2 --- version2 file name {{version2_file_name}}
{{version2}}
------

Your Answer:
"""

In [None]:
def _extract_json(response):
    """Parse the JSON object contained in a model reply.

    The reply is expected to wrap the object in a ``` fence. A language tag
    after the opening fence (e.g. "```json") and replies without any fence are
    tolerated by parsing only the span from the first "{" to the last "}".

    Raises:
        ValueError: if no JSON object can be located or decoded
            (json.JSONDecodeError is a subclass of ValueError).
    """
    parts = response.split("```")
    payload = parts[1] if len(parts) > 1 else response
    start, end = payload.find("{"), payload.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in model response: {response!r}")
    return json.loads(payload[start:end + 1])


# Chunks are compared pairwise by position. Surface a count mismatch instead of
# failing with IndexError or silently dropping the surplus files.
if len(version1_files) != len(version2_files):
    print("Warning: version 1 has", len(version1_files), "files but version 2 has",
          len(version2_files), "- only the first",
          min(len(version1_files), len(version2_files)), "pairs are compared")

analysis_results = []
# The system message is the same for every pair, so build it once.
system_msg = {"role":"system","content":system_message}

for version1_file, version2_file in zip(version1_files, version2_files):
    # get the similarity score
    similarity = utils.similarity(version1_file, version2_file)
    # get the text
    with open(version1_file, 'r') as f:
        version1 = f.read()
    with open(version2_file, 'r') as f:
        version2 = f.read()
    # generate the prompt; the short metadata placeholders are filled before the
    # document texts are pasted in
    updated_user_prompt = (
        user_prompt
        .replace("{{similarity}}", str(similarity))
        .replace("{{version1_file_name}}", os.path.basename(version1_file))
        .replace("{{version2_file_name}}", os.path.basename(version2_file))
        .replace("{{version1}}", version1)
        .replace("{{version2}}", version2)
    )
    # generate the response
    user_msg = {"role":"user","content":updated_user_prompt}
    prompt = [system_msg, user_msg]
    response = utils.run(prompt, temperature=0.0, max_tokens=2500, top_p=0.0)
    analysis_results.append(_extract_json(response))

In [None]:
# One row per compared chunk pair; the columns are the keys of the JSON objects
# returned by the model.
results = pd.DataFrame(analysis_results)
results

In [None]:
# Show the full text of both columns. option_context() lifts the column-width
# limit only for this display call and then restores whatever value was set
# before, even if display() raises, instead of resetting to a hard-coded 50.
with pd.option_context("display.max_colwidth", None):
    display(results[["version1","version2"]])

In [None]:
# Side-by-side view of the differing text reported for version 1 and version 2
# (rendered with pandas' default, truncated column width).
results[["version1","version2"]]

In [None]:
# System prompt for the insights step: the model receives the collected
# version 1 / version 2 differences and is asked for a Markdown report grouped
# by category. The category labels are quoted literally in the prompt and end up
# as headings in the generated report, so their spelling matters
# ('Entryption' corrected to 'Encryption'; "addedd", "expalination" and
# "complice" corrected as well).
system_message_insights="""
You are a compliance reviewer in a security team. You are responsible for reviewing the compliance policies and providing insights.

## Review process
### 1. Find differences
Identify differences. Show what is added, removed or modified as a Markdown table.

### 2. Extract differences
If there are changes regarding specification, business or technical requirements, then extract them as key phrases.
Extract key phrases of changes and rephrase them as clear terms to describe the changes.

### 3. Provide insights
Provide additional explanation around the changes
What should be done to comply with the changes? 
What feature should be on or off


## Response
Follow the 'Review process' and reorganize into categories 'Access control', 'Encryption' and 'Audit', summarize using bullet points.
Use Markdown to format the response.

[Replace with Category,'Access control', 'Encryption' and 'Audit']

Differences
* Version1:
 - Focuses on recording access, modification, deletion of data, and execution of queries or commands.
 - Does not specify the duration for which logs should be kept.
* Version2:
 - Includes authentication and authorization in the list of activities to be logged.
 - Specifies that logs must be kept for 24 months.

Key Phrases

Insights
* Actionable items 
 - Step by step guide to apply the changes or rules
"""

In [None]:
# User-message template for the insights step. {{version1}} and {{version2}} are
# filled in with str.replace() using the concatenated per-chunk differences.
user_prompt_insights="""
--- version 1 ---
{{version1}}
------

--- version 2 ---
{{version2}}
------
"""

In [None]:
# Gather the reported text of every compared chunk pair, one newline-terminated
# entry per row, separately for each version.
ver1_entries = []
ver2_entries = []
for _, row in results.iterrows():
    ver1_entries.append(row['version1'] + "\n")
    ver2_entries.append(row['version2'] + "\n")
diff_ver1 = "".join(ver1_entries)
diff_ver2 = "".join(ver2_entries)

# Fill the template, send system + user message to the model and render the
# Markdown reply.
insights_request = user_prompt_insights.replace("{{version1}}", diff_ver1).replace("{{version2}}", diff_ver2)
system_msg_insights = dict(role="system", content=system_message_insights)
user_msg_insights = dict(role="user", content=insights_request)
prompt_insights = [system_msg_insights, user_msg_insights]
res_insights = utils.run(prompt_insights, temperature=0.0, max_tokens=2500, top_p=0.0)
display(Markdown(res_insights))

In [None]:
# for item in results.iterrows():    
#     system_msg_insights = {"role":"system","content":system_message_insights}
#     user_msg_insights = {"role":"user","content":user_prompt_insights.replace("{{version1}}", item[1]['version1']).replace("{{version2}}", item[1]['version2'])}
#     prompt_insights = [system_msg_insights, user_msg_insights]
#     res_insights = utils.run(prompt_insights, temperature=0.0, max_tokens=2500, top_p=0.0)
#     display(Markdown(res_insights))

3. Automate

In [None]:
# System prompt for the final step: the model is given the insights report and
# asked, in the role of a policy maker, to derive updated rules as Markdown.
# NOTE(review): "Indentify", "addedd", "expalination" and "complice" are typos;
# the text is left untouched here because editing the prompt changes what the
# model receives.
# NOTE(review): section 1 is titled "goal of changes" but its instruction asks
# for a table of differences — confirm the intended instruction.
system_message_automate="""
You are a compliance policy maker in a security team.

## Review insights
### 1. Indentify goal of changes
Identify differences. Show what is addedd, removed or modified as a Markdown table.

### 2. Use the insights to share updated rules
Provide additional expalination around the changes
What should be done to complice with the changes? 
What feature should be on or off


## Response
Use Markdown to format the response.

"""

In [None]:
# User-message template for the final step. {{results}} is filled in with
# str.replace() using the Markdown text produced by the insights step.
user_prompt_automate="""
--- insights ---
{{results}}
------
"""

In [None]:
# Paste the insights report into the template, then send system + user message
# to the model and render the Markdown reply.
# NOTE(review): this call uses temperature=1.0 / top_p=1.0 while the earlier
# steps use 0.0; if utils.run forwards them as sampling settings, the output of
# this cell will differ between runs — confirm this is intended.
automate_request = user_prompt_automate.replace("{{results}}", res_insights)
system_msg_automate = dict(role="system", content=system_message_automate)
user_msg_automate = dict(role="user", content=automate_request)
prompt_automate = [system_msg_automate, user_msg_automate]
res_automate = utils.run(prompt_automate, temperature=1.0, max_tokens=2500, top_p=1.0)
display(Markdown(res_automate))