Change Date Format

In [1]:
import json
from datetime import datetime

# Function to convert date format to full long date
def convert_to_full_long_date(date_str):
    """Reformat a ``dd/mm/YYYY`` date string as ``dd Month YYYY``.

    Args:
        date_str: Date text in day/month/year form, e.g. ``"05/03/2021"``.

    Returns:
        The same date with the month name spelled out, e.g.
        ``"05 March 2021"``. The month name is produced by ``%B`` and so
        follows the active locale.

    Raises:
        ValueError: If ``date_str`` does not match ``%d/%m/%Y``.
    """
    input_format = '%d/%m/%Y'
    output_format = "%d %B %Y"
    parsed = datetime.strptime(date_str, input_format)
    return parsed.strftime(output_format)

# Load the cleaned CVE records. The encoding is given explicitly so the read
# does not depend on the platform's default locale encoding (JSON text is
# UTF-8, and the next cell reads the output file as UTF-8 as well).
with open('cleaned_web_servers_cve_data.json', 'r', encoding='utf-8') as source_file:
    data = json.load(source_file)

# Rewrite each record's 'Published Date' in place from dd/mm/YYYY to the long
# form produced by convert_to_full_long_date (e.g. "05 March 2021").
# A record without a 'Published Date' key raises KeyError; a value in any
# other format raises ValueError.
for record in data:
    record['Published Date'] = convert_to_full_long_date(record['Published Date'])

# Serialise the updated records as pretty-printed JSON.
updated_json = json.dumps(data, indent=4)

# Save under a new file name so the original input is left untouched and this
# cell can be re-run safely.
with open('cleaned_web_servers_cve_data_dated.json', 'w', encoding='utf-8') as output_file:
    output_file.write(updated_json)


Generate the question-answer pairs

In [4]:
# DATA IS TRUNCATED FROM THE START FOR SECOND ITERATION

import json
import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import time

# Load environment variables via python-dotenv before any os.getenv() lookup
# below (get_questions_answers reads "API_KEY"). Presumably the key lives in a
# local .env file — TODO confirm.
load_dotenv()

def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def clean_description(item):
    """Strip line breaks and surrounding whitespace from ``item["Description"]``.

    Newline and carriage-return characters are removed outright (not replaced
    by spaces), then leading/trailing whitespace is trimmed.

    Args:
        item: Mapping with a string under the ``"Description"`` key. It is
            modified in place.

    Returns:
        The same ``item`` object, for convenient chaining.
    """
    text = item["Description"]
    for line_break in ('\n', '\r'):
        text = text.replace(line_break, '')
    item["Description"] = text.strip()
    return item

def clean_response(response):
    """Drop non-printable characters from ``response``.

    A character is kept when ``str.isprintable`` accepts it or when it is a
    tab, newline or carriage return; everything else (e.g. other control
    characters) is removed.

    Args:
        response: Text to sanitise.

    Returns:
        A new string containing only the kept characters, in order.
    """
    keep_whitespace = '\t\n\r'

    def _is_allowed(ch):
        return ch.isprintable() or ch in keep_whitespace

    return ''.join(filter(_is_allowed, response))

def get_questions_answers(item):
    """Ask the chat model for question/answer pairs about one CVE record.

    The record is embedded in the prompt as indented JSON and the request is
    sent to model ``gpt-4-1106-preview`` with a JSON-object response format.
    The API key is read from the ``API_KEY`` environment variable.

    Args:
        item: JSON-serialisable CVE record.

    Returns:
        The model's reply as a string, stripped and passed through
        ``clean_response``, or ``None`` if anything inside the request /
        post-processing step raises (the error is printed, not re-raised).
    """
    api_client = OpenAI(api_key=os.getenv("API_KEY"))

    # Adjacent literals are concatenated into one prompt string; only the
    # second piece interpolates the record.
    prompt = (
        "The following is the information of a CVE:\n\n"
        f"{json.dumps(item, indent=2)}\n\n"
        "Please generate question and answer pairs for this information. "
        "Supplement the provided information with your own knowledge. "
        "Add code examples for this vulnerability in the answers if possible. "
        "Please also discuss possible attack scenarios of this vulnerability. "
        "Don't mention about the cut-off date of your own training data in the questions and answers. "
        "Dont mention in the questions and answers that a content for this vulnerability was provided to you. "
        "Always mention the CVE id in your questions.\n\n"
        "The response should be in JSON format. "
        "Each set of question-answer pairs should be an object inside an array, with key-value pairs called 'question' and 'answer'. "
        "The parent key should be called 'data'."
    )
    user_message = {'role': 'user', 'content': prompt}

    try:
        reply = api_client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[user_message],
            response_format={"type": "json_object"},
        )
        raw_text = reply.choices[0].message.content.strip()
        sanitised = clean_response(raw_text)
        print(f"Tokens used in GPT-4-Turbo Response: {reply.usage.total_tokens}")
        return sanitised
    except Exception as e:
        # Best effort: report the failure and let the caller skip this record.
        print(f"Error in GPT-4-Turbo API call: {e}")
        return None

def append_to_excel(qa_pairs, filename):
    """Append question/answer rows to an Excel workbook, creating it if needed.

    When ``filename`` already exists its contents are read back, the new rows
    are added after them, and the whole sheet is written out again.

    Args:
        qa_pairs: Rows for the ``Question`` and ``Answer`` columns.
        filename: Path of the workbook to create or extend.
    """
    new_rows = pd.DataFrame(qa_pairs, columns=['Question', 'Answer'])
    if not os.path.isfile(filename):
        combined = new_rows
    else:
        existing_rows = pd.read_excel(filename)
        combined = pd.concat([existing_rows, new_rows], ignore_index=True)
    combined.to_excel(filename, index=False)

def _extract_qa_pairs(json_response):
    """Parse a model reply into a list of ``(question, answer)`` tuples."""
    response_data = json.loads(json_response)
    return [(qa['question'], qa['answer']) for qa in response_data.get("data", [])]


def main(json_file_path, excel_file_path):
    """Generate question/answer pairs for every CVE record and save them.

    For each record in the JSON file the description is cleaned, the model is
    queried, and the resulting pairs are appended to the Excel workbook. A
    record is skipped (with a printed message) when the model call returned
    nothing, when the reply is not valid JSON, or when the reply is valid JSON
    but does not have the expected ``{"data": [{"question", "answer"}, ...]}``
    shape. One bad reply therefore no longer aborts the whole run.

    Args:
        json_file_path: Path of the JSON file holding the CVE records.
        excel_file_path: Path of the workbook that collects the pairs.
    """
    data = read_json_file(json_file_path)

    for item in data:
        cleaned_item = clean_description(item)
        json_response = get_questions_answers(cleaned_item)
        if json_response:
            try:
                qa_pairs = _extract_qa_pairs(json_response)
            except json.JSONDecodeError as e:
                print(f"JSON parsing error: {e}")
                print(f"Invalid JSON response: {json_response}")
            except (KeyError, TypeError, AttributeError) as e:
                # Valid JSON with the wrong structure, e.g. a pair missing
                # 'question'/'answer' or 'data' not being a list of objects.
                print(f"Unexpected response structure: {e!r}")
                print(f"Invalid JSON response: {json_response}")
            else:
                # Kept outside the try so errors from writing the workbook
                # are not mistaken for malformed model output.
                append_to_excel(qa_pairs, excel_file_path)
        # Pause between requests (also after skipped records).
        print("Taking a break for 5 seconds")
        time.sleep(5)

if __name__ == "__main__":
    # Input: the CVE records with long-form dates; output: the workbook that
    # accumulates the generated question/answer pairs.
    json_file_path = 'cleaned_web_servers_cve_data_dated.json'
    excel_file_path = 'web-servers-pairs.xlsx'
    main(json_file_path=json_file_path, excel_file_path=excel_file_path)


Tokens used in GPT-4-Turbo Response: 1060
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1282
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1051
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1354
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1039
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1177
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1479
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1162
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1109
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 920
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 788
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 1249
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 994
Taking a break for 5 seconds
Tokens used in GPT-4-Turbo Response: 876
Taking a break for 5 seconds
Tokens use