## <span style="color:#ff5f27">📝 Imports </span>

In [1]:
import os
from openai import OpenAI
import getpass
import json
import pandas as pd
import json_repair
from tqdm import tqdm

## <span style="color:#ff5f27">⚙️ Settings </span>

In [2]:
# Only prompt for the key when the environment does not already provide a non-empty one
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass('🔑 Enter your OpenAI API key: ')

🔑 Enter your OpenAI API key:  ···················································


In [3]:
# Create the OpenAI API client, authenticated with the key stored in the
# OPENAI_API_KEY environment variable
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [1]:
import hopsworks

# Log in to Hopsworks; the returned project handle is the entry point to its services
project = hopsworks.login()

# Handle to the project's feature store, used by the cells below
fs = project.get_feature_store() 



Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://snurran.hops.works/p/1143
Connected. Call `.close()` to terminate connection gracefully.


In [5]:
# Retrieve version 1 of the existing 'documents' feature view,
# which is the source of the text passages read in the next cell
feature_view = fs.get_feature_view(
    name='documents',
    version=1,
)

In [6]:
# Prepare the feature view for batch reads, then pull the data it serves
feature_view.init_batch_scoring()
data = feature_view.get_batch_data()

# Keep only the long passages: rows whose 'text' holds more than 2500 characters
data_filtered = data.loc[data["text"].str.len().gt(2500)]

# Show the retained rows
data_filtered



Finished: Reading data from Hopsworks, using Hive (3.42s) 


Unnamed: 0,file_name,page_number,paragraph,text
0,NIST.SP.800-53r5.pdf,433,1,"NIST SP 800- 53, REV. 5 ..."
1,NIST.SP.800-53r5.pdf,417,1,"NIST SP 800- 53, REV. 5 ..."
2,NIST.SP.800-53r5.pdf,331,1,"NIST SP 800- 53, REV. 5 ..."
4,pub-ch-bank-supervision-process.pdf,35,1,Version 1.1 Risk- Based Supervision Approach ...
8,NIST.SP.800-53r5.pdf,349,1,"NIST SP 800- 53, REV. 5 ..."
...,...,...,...,...
2004,GDPR.pdf,77,1,(b) advise the Commission on any issue relate...
2005,NIST.SP.800-53r5.pdf,92,1,"NIST SP 800- 53, REV. 5 ..."
2006,GDPR.pdf,10,1,further use; the nature of the personal data; ...
2010,GDPR.pdf,27,1,"seized may stay its proceedings or may, on req..."


## <span style="color:#ff5f27">🪄 Dataset Generation</span>

In [7]:
def generate_questions(context: str, model: str = "gpt-3.5-turbo") -> dict:
    """Ask an OpenAI chat model for three question/answer pairs about `context`.

    The context and a fixed instruction are sent as a single user message via
    the module-level `client`. The reply text is parsed with
    `json_repair.loads`, and the original context is attached to the parsed
    object under the 'context' key.

    Args:
        context: The passage the questions should be about.
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo",
            the model this function always used before the parameter existed.

    Returns:
        The parsed reply as a dict with the added 'context' key. The prompt
        requests 'questions' and 'answers' lists, but their presence and
        lengths depend on the model's reply and are not validated here.

    Raises:
        TypeError: If the parsed reply is not a JSON object (dict).
    """
    instruction = """
    The given text is the result of the text extraction from the PDF files. 
    Generate 3 meaningful questions on the text and the respective answers.
    Reply strictly in the JSON format:
    {
      "questions": ["question1", "question2", "question3"],
      "answers": ["answer1", "answer2", "answer3"]
    }

    Ensure that the lists of questions and answers are complete and properly formatted. 
    DO NOT include any additional information or characters outside the specified JSON format. 
    The response must consist only of the requested JSON structure. 
    If the generated content does not meet the specified format, please make the necessary adjustments to ensure compliance."""

    prompt = f"\nContext: {context}\nQuestion: {instruction}"

    # Send the whole prompt as a single user message
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt},
        ],
    )

    # The message content may be missing (None); treat that as an empty reply
    # so it is reported by the type check below rather than failing in the parser.
    raw_reply = completion.choices[0].message.content or ""
    response = json_repair.loads(raw_reply)

    # A reply that is not a JSON object cannot carry the 'context' key; fail
    # with a message that shows what the model actually returned.
    if not isinstance(response, dict):
        raise TypeError(
            "Expected a JSON object from the model, got "
            f"{type(response).__name__}: {raw_reply!r}"
        )

    response['context'] = context

    return response


In [8]:
# Run the question generator over every filtered passage, with a progress bar
generated_questions = list(
    map(generate_questions, tqdm(data_filtered['text']))
)

100%|██████████| 796/796 [50:13<00:00,  3.79s/it] 


In [11]:
# Build a DataFrame from the list of generated records:
# one row per record, one column per key in the records
df = pd.DataFrame(generated_questions)

# Preview the first rows
df.head()

Unnamed: 0,questions,answers,context
0,[What is the definition of an information type...,[An information type is a specific category of...,"NIST SP 800- 53, REV. 5 ..."
1,"[What is the purpose of NIST SP 800-53, Rev. 5...","[The purpose of NIST SP 800-53, Rev. 5 is to p...","NIST SP 800- 53, REV. 5 ..."
2,[What is the purpose of boundary protection de...,[The purpose of boundary protection devices is...,"NIST SP 800- 53, REV. 5 ..."
3,[What is the OCC's risk-based supervision appr...,[The OCC's risk-based supervision approach is ...,Version 1.1 Risk- Based Supervision Approach ...
4,[What is the purpose of external malicious cod...,[The purpose of external malicious code identi...,"NIST SP 800- 53, REV. 5 ..."


In [53]:
# Turn each list-valued row into one row per (question, answer) pair, then
# expose a fresh 0..n-1 integer index as the 'record_id' column.
df_expanded = (
    df
    .explode(['questions', 'answers'])         # expand both list columns in lockstep
    .reset_index(drop=True)                    # discard the repeated source-row index
    .reset_index()                             # materialise the new index as an 'index' column
    .rename(columns={'index': 'record_id'})    # give that column a descriptive name
)

# Display the expanded DataFrame
df_expanded

Unnamed: 0,record_id,questions,answers,context
0,0,What is the definition of an information type ...,An information type is a specific category of ...,"NIST SP 800- 53, REV. 5 ..."
1,1,What is the definition of an insider threat pr...,An insider threat program is a coordinated col...,"NIST SP 800- 53, REV. 5 ..."
2,2,What is the principle of least privilege?,The principle of least privilege states that e...,"NIST SP 800- 53, REV. 5 ..."
3,3,"What is the purpose of NIST SP 800-53, Rev. 5?","The purpose of NIST SP 800-53, Rev. 5 is to pr...","NIST SP 800- 53, REV. 5 ..."
4,4,"Where can NIST SP 800-53, Rev. 5 be obtained?","NIST SP 800-53, Rev. 5 can be obtained for fre...","NIST SP 800- 53, REV. 5 ..."
...,...,...,...,...
2377,2377,Who should compensate for any damage resulting...,The controller or processor should compensate ...,"seized may stay its proceedings or may, on req..."
2378,2378,What factors should be taken into account when...,"Factors such as the nature, gravity and durati...","seized may stay its proceedings or may, on req..."
2379,2379,What is a replay attack?,A replay attack is when an attacker replays pr...,"NIST SP 800- 53, REV. 5 ..."
2380,2380,What is the definition of resilience according...,"According to OMB A-130, resilience is the abil...","NIST SP 800- 53, REV. 5 ..."


## <span style="color:#ff5f27;"> 🪄 CQA Feature Group Creation </span>

In [55]:
# Get or create version 1 of the 'cqa_fg' feature group,
# using 'record_id' as its primary key
cqa_fg = fs.get_or_create_feature_group(
    name="cqa_fg",
    version=1,
    description='Context-Question-Response Data',
    primary_key=['record_id'],
)

# Insert the expanded question/answer rows into the feature group
cqa_fg.insert(df_expanded)

Feature Group created successfully, explore it at 
https://snurran.hops.works/p/1143/fs/1091/fg/2061




Uploading Dataframe: 0.00% |          | Rows 0/2382 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: cqa_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.hops.works/p/1143/jobs/named/cqa_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7f60da265570>, None)

## <span style="color:#ff5f27;"> 🪄 CQA Feature View Creation </span>

In [25]:
# Get or create the 'cqa' feature view over the context/question/answer columns.
# The feature group is populated from df_expanded, whose columns are
# 'record_id', 'questions', 'answers' and 'context', so the answer column
# to select is 'answers' (there is no 'responses' feature).
feature_view = fs.get_or_create_feature_view(
    name="cqa",
    version=1,
    query=cqa_fg.select(["context", "questions", "answers"]),
    description='Context-Question-Response pairs for model fine-tuning',
)

Feature view created successfully, explore it at 
https://snurran.hops.works/p/1143/fs/1091/fv/cqa/version/1


---