In [19]:
import openai
import pandas as pd
import numpy as np

SEP = '=================='

# setting openai key
OPENAI_API = open('OPENAI_API.txt', "r").read()
openai.api_key = OPENAI_API

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [4]:
def get_embedding(text: str, 
                  model: str=EMBEDDING_MODEL) -> list[float]:
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    return result['data'][0]['embedding']

In [5]:
def vector_similarity(x: list[float], 
                      y: list[float]) -> float:
    """
    Returns the similarity between two vectors.
    
    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))


def order_document_sections_by_query_similarity(query: str, 
                                                knowledge_base: pd.DataFrame) -> pd.DataFrame:
    query_vector = get_embedding(query)

    knowledge_base["similarities"] = knowledge_base['embedding'].apply(lambda x: vector_similarity(x, query_vector))

    return knowledge_base.sort_values("similarities", ascending=False).head()

In [6]:
GUIDE = "Suppose you are a person from a company filling out a survey using the given context, "\
        "Answer the question as truthfully as possible and keep the answer short."\
        "Do not mention context or original answer."\
        "And if you are unsure of the answer, say only 'N/A' without giving any reason.\n"

In [25]:
def gpt_response(question: str, context: str='') -> str:
    complete_prompt= context + 'Question:\n' + question

#     return openai.ChatCompletion.create(
#         temperature=0,
#         messages=[
#             {"role": "system", "content": GUIDE},
#             {"role": "user", "content": complete_prompt}
#             ],
#         max_tokens=1000,
#         model=COMPLETIONS_MODEL
#         )["choices"][0]["message"]['content'].strip(" \n")
    text = openai.Completion.create(
                                  model=COMPLETIONS_MODEL,
                                  prompt=GUIDE + complete_prompt,
                                  max_tokens=3000,
                                  temperature=0
                                )["choices"][0]['text'].strip(" \n")

    return text

In [21]:
def find_doc(question: str, 
             knowledge_base: pd.DataFrame) -> str:
    
    # calculate the descending similarities between a question and the knowledge base entries
    result = order_document_sections_by_query_similarity(question, knowledge_base)
    context = ''
    
    # take the five most relevant entries from knowledge base and construct context
    # use numbered list to help GPT seperate entries
    # only use entries with similarities over 75% to avoid misleading GPT
    for n in range(5):
        curr = result.iloc[n]
        if curr['similarities'] > 0.75:
            context += f'{n + 1}. ' + str(curr['question']) + ' ' + str(curr['answer']) + '\n'
        
    return context


def fill_survey(question_df: pd.DataFrame, 
                question_col: str, 
                knowledge_base: pd.DataFrame) -> pd.DataFrame:
    
    result_df = question_df.copy()
    temp_kb = knowledge_base.copy()
    
    # find the context for each question
    result_df['context'] = result_df.apply(lambda x: find_doc(x[question_col], temp_kb), axis=1)
    
    # answer each question with the given context
    result_df['answer'] = result_df.apply(lambda x: gpt_response(x[question_col], x['context']), axis=1)

    return result_df
  

# Import dataset

In [9]:
kb_name = '/Users/xiaoanlu/Desktop/School/OPPOS_RA/research_files/tables/Signifi_KB.csv'
validation_name = '/Users/xiaoanlu/Desktop/School/OPPOS_RA/research_files/tables/Signifi_validation.csv'

kb_raw = pd.read_csv(kb_name)
validation_raw = pd.read_csv(validation_name)

# 1. Embed the questions and the answers


In [10]:
kb = kb_raw.copy()

# combine the question and answer into one string with a line break
kb['combined'] = kb['question'] + '\n' + kb['answer']

# embed the combined question and answer string
kb['embedding'] = kb['combined'].apply(get_embedding)

In [11]:
# store the question, answer, quesntion + answer string, and embeddings to JSON file
kb[['question', 'answer', 'combined', 'embedding']].to_json('signifi_surveys_embedding.json', orient='index')
kb.head()[['combined','embedding']]

Unnamed: 0,combined,embedding
0,Company/Business name\nSignifi Solutions Inc.,"[-0.007606054190546274, -0.009326151572167873,..."
1,How long has the company been in business?\n15...,"[0.010836029425263405, -0.012713591568171978, ..."
2,Are there any material claims or judgments aga...,"[-0.007499844301491976, -0.01380800362676382, ..."
3,Has your company suffered a data loss or secur...,"[-0.006935591343790293, -0.02605941705405712, ..."
4,Have any of your third party vendors suffered ...,"[0.005503055639564991, -0.029340874403715134, ..."


In [12]:
# load the embeddings from JSON file
kb = pd.read_json('signifi_surveys_embedding.json', orient='index')
kb

Unnamed: 0,question,answer,combined,embedding
0,Company/Business name,Signifi Solutions Inc.,Company/Business name\nSignifi Solutions Inc.,"[-0.0076060542, -0.0093261516, 0.0062319920000..."
1,How long has the company been in business?,15 Years,How long has the company been in business?\n15...,"[0.0108360294, -0.0127135916, -0.0002136688, -..."
2,Are there any material claims or judgments aga...,No,Are there any material claims or judgments aga...,"[-0.0074998443, -0.0138080036, -0.0143131744, ..."
3,Has your company suffered a data loss or secur...,No,Has your company suffered a data loss or secur...,"[-0.0069355913, -0.0260594171, 0.0175394863, -..."
4,Have any of your third party vendors suffered ...,No,Have any of your third party vendors suffered ...,"[0.0055030556, -0.0293408744, 0.017100187, -0...."
...,...,...,...,...
112,Do you have a vulnerability management program...,Yes,Do you have a vulnerability management program...,"[-0.0035102519, -0.0244408082, 0.0211445577, -..."
113,"Vulnerability assessments, scans or penetratio...",External network connections only\nThird party...,"Vulnerability assessments, scans or penetratio...","[-0.0198095068, -0.0004751207, 0.0429685712, -..."
114,"Are vulnerability assessements, scans or penet...",Yes,"Are vulnerability assessements, scans or penet...","[-0.0009909542000000001, -0.0102768075, 0.0177..."
115,Are vulnerability assessments or scans on inte...,Yes,Are vulnerability assessments or scans on inte...,"[-0.0104705654, -0.0129698208, 0.0341477208, -..."


In [13]:
validation = validation_raw.copy()

# take only the questions from the validation set as our mock survey
val_questions = validation[['question']].copy()
val_questions

Unnamed: 0,question
0,Company name
1,Responder name
2,Responder title
3,Responder email address
4,Do you have a dedicated security authority for...
...,...
111,Is data encrypted in transit from a user’s de...
112,Must the user grant the application device per...
113,Are users authenticated and authorized for the...
114,Do you store Motorola Solutions data on your e...


# 2. Answering new survey

In [27]:
# use GPT to fill the given unanswered survey
result_response = fill_survey(val_questions, 'question', kb)

In [28]:
result_response

Unnamed: 0,question,context,answer
0,Company name,1. Company/Business name Signifi Solutions Inc...,?\nSignifi Solutions Inc.
1,Responder name,1. Company/Business name Signifi Solutions Inc...,?\nN/A
2,Responder title,1. Is there a formal Incident Response Plan? Y...,? N/A
3,Responder email address,1. Company/Business name Signifi Solutions Inc...,?\nN/A
4,Do you have a dedicated security authority for...,1. Is there a information security function re...,Yes
...,...,...,...
111,Is data encrypted in transit from a user’s de...,1. How does your company use encryption to sec...,"Yes, data is encrypted in transit from a user'..."
112,Must the user grant the application device per...,1. Are mobile devices used /allowed to access ...,Answer: N/A
113,Are users authenticated and authorized for the...,1. Are mobile devices used /allowed to access ...,Yes
114,Do you store Motorola Solutions data on your e...,1. Are business information systems used to tr...,


In [29]:
result_response.to_csv('output_qa.csv', index=False)

# 3. Compare results

In [30]:
ours = pd.read_csv('output_qa.csv')
conveyor = pd.read_csv('conveyor.csv')
comparison = ours.drop('context', axis=1)
comparison['correct_answer'] = validation['answer']
comparison['conveyor_answer'] = conveyor['answer']
comparison

Unnamed: 0,question,answer,correct_answer,conveyor_answer
0,Company name,?\nSignifi Solutions Inc.,Signifi Solutions Inc,Signifi Solutions Inc
1,Responder name,?\nN/A,Razvan Anghelidi,?
2,Responder title,? N/A,Director of Technology,?
3,Responder email address,?\nN/A,ranghelidi@signifi.com,?
4,Do you have a dedicated security authority for...,Yes,"Yes\nRazvan Anghelidi, Director of Technology",?
...,...,...,...,...
111,Is data encrypted in transit from a user’s de...,"Yes, data is encrypted in transit from a user'...",Yes\nBoth the kiosks/clients and the portal ar...,?
112,Must the user grant the application device per...,Answer: N/A,"No\nThe portal is web based, no device permiss...",Yes. We carry out vulnerability scans as part ...
113,Are users authenticated and authorized for the...,Yes,"N/A\nThe portal is web based, so accessible ov...","Internet-facing and internal network systems, ..."
114,Do you store Motorola Solutions data on your e...,,Yes,?


In [31]:
comparison.to_csv('comparison_qa.csv', index=False)

# 4. Add the power of documents


In [439]:
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, PromptHelper, ServiceContext
import os

os.environ["OPENAI_API_KEY"] = OPENAI_API

In [474]:
# data loader, see https://gpt-index.readthedocs.io/en/latest/how_to/data_connectors.html 
# for connecting different data sources

documents = SimpleDirectoryReader('signifi_policies/').load_data()

# set maximum input size
max_input_size = 1000
# set number of output tokens
num_outputs = 0
# set maximum chunk overlap
max_chunk_overlap = 20
# set chunk size limit
chunk_size_limit = 2000

prompt_helper = PromptHelper(max_input_size, 
                             num_outputs, 
                             max_chunk_overlap, 
                             chunk_size_limit=chunk_size_limit)

service_context = ServiceContext.from_defaults(prompt_helper=prompt_helper, chunk_size_limit=chunk_size_limit)

In [None]:
# generate embeddings for document chunks and save the embeddings

index = GPTSimpleVectorIndex.from_documents(documents, service_context = service_context)

index.save_to_disk('signifi_policies.json')

In [440]:
# read document index

index = GPTSimpleVectorIndex.load_from_disk('signifi_policies.json')

In [441]:
def find_doc_with_pdf(question: str, 
                      target_kb: pd.DataFrame) -> str:
    
    # calculate the descending similarities between a question and the knowledge base entries
    result = order_document_sections_by_query_similarity(question, target_kb)
    context = ''
    
    # take the five most relevant entries from knowledge base and construct context
    # use numbered list to help GPT seperate entries
    # only use entries with similarities over 75% to avoid misleading GPT
    for n in range(5):
        curr = result.iloc[n]
        if curr['similarities'] > 0.75:
            context += f'{n + 1}. ' + str(curr['question']) + ' ' + str(curr['answer']) + '\n'
    
    # find the most relevant PDF document for the question
    pdf_response = index.query(question, response_mode="no_text").source_nodes[0]
    pdf_text = ''
    
    # keep the PDF as context if similarity is above 75%
    if pdf_response.similarity > 0.8:
        pdf_text = pdf_response.source_text + '\n'
        
    context += pdf_text
   
    return context


def fill_survey_with_pdf(question_df: pd.DataFrame, 
                target_col: str, 
                knowledge_base: pd.DataFrame) -> pd.DataFrame:
    
    question_df['context'] = question_df.apply(lambda x: find_doc_with_pdf(x[target_col], knowledge_base), axis=1)

    question_df['answer'] = question_df.apply(lambda x: gpt_response(x[target_col], x['context']), axis=1)

    return question_df
  

In [442]:
result_response_pdf = fill_survey_with_pdf(val_questions, 'question', kb)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 2 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 3 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 3 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 4 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 18 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_i

INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 8 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 8 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 8 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 7 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 12 tokens
INFO:l

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 13 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 15 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 13 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 13 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 18 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens
INFO:lla

In [443]:
result_response_pdf.to_csv('output_pdf.csv', index=False)

In [444]:
ours = pd.read_csv('output_pdf.csv')
conveyor = pd.read_csv('output_2.csv')
comparison = ours.drop('context', axis=1)
comparison['correct_answer'] = validation['answer']
comparison['kb_answer'] = conveyor['answer']
comparison

Unnamed: 0,question,answer,correct_answer,kb_answer
0,Company name,Signifi Solutions Inc.,Signifi Solutions Inc,The company name is Signifi Solutions Inc.
1,Responder name,,Razvan Anghelidi,N/A.
2,Responder title,,Director of Technology,N/A.
3,Responder email address,,ranghelidi@signifi.com,
4,Do you have a dedicated security authority for...,"Yes, the CISO is the Director of Technology wh...","Yes\nRazvan Anghelidi, Director of Technology",
...,...,...,...,...
111,Is data encrypted in transit from a user’s de...,"Yes, encryption in transit is used to secure d...",Yes\nBoth the kiosks/clients and the portal ar...,N/A. The given context does not provide inform...
112,Must the user grant the application device per...,,"No\nThe portal is web based, no device permiss...","Yes, we have a vulnerability management progra..."
113,Are users authenticated and authorized for the...,,"N/A\nThe portal is web based, so accessible ov...",Both internet-facing and internal network syst...
114,Do you store Motorola Solutions data on your e...,,Yes,"Yes, vulnerability assessments, scans, and pen..."


In [445]:
comparison.to_csv('comparison_pdf.csv', index=False)

In [491]:
i = 105
print('system guide:\n====================')
print(GUIDE)
print('context:\n====================')
print(result_response.iloc[i].context)
print('question:\n====================')
print(result_response.iloc[i].question)
print('\n====================')
print(f'GPT answer: {result_response.iloc[i].answer}\n====================')


system guide:
Suppose you are a person from a company filling out a survey, Answer the survey questions as truthfully as possible using only the following contextand keep the answer brief,and if you don't have enough enough information or if you are unsure of the answer, say only 'N/A' without giving any reason.

context:
1. Are privileged user access rights reviewed at least quarterly? Yes
2. Are user access rights reviewed at least annually? Yes
3. Have the information security policies been reviewed in the last 12 months? Yes
4. Does your company allow remote (i.e. VPN) access to remotely administrator your network? Yes
5. Is there a password policy for systems that transmit, process or store scoped systems data that has been approved by management, communicated to constituents, and enforced on all platforms? Yes

question:
Do you review administrator and privileged account usage?.

GPT answer: Yes.


In [479]:
i = 38
print('system guide:\n====================')
print(GUIDE)
print('context:\n====================')
print(result_response_pdf.iloc[i].context)
print('question:\n====================')
print(result_response_pdf.iloc[i].question)
print('\n====================')
print(f'GPT answer: {result_response_pdf.iloc[i].answer}\n====================')


system guide:
Suppose you are a person from a company filling out a survey, Answer the survey questions as truthfully as possible using only the following contextand keep the answer brief,and if you don't have enough enough information or if you are unsure of the answer, say only 'N/A' without giving any reason.

context:
1. Have any of your third party vendors suffered a data loss or security breach within the last 3 years? No
2. The third parties with whom you share or enable direct access to data are: (check all that apply) Subjected to a background check? yes
Bound by the same legal requirements for privacy, data protection, and information security as your company? yes
Required to obtain your consent before using the data? yes
Required to notify your company and/or client in the event of a data breach? yes
Required to assist your company in making disclosures or notifications in the event of a data breach or data subject inquiry? yes
Required to give privacy and data protection tr