In [2]:
import json
import os
import pandas as pd
import numpy as np

In [3]:
def extract_keys(schema, parent_key=''):
    """Flatten a nested schema mapping into a list of dotted-key rows.

    Each entry of ``schema`` yields one 5-tuple
    ``(full_key, description, type, format, pattern)``. ``full_key`` is the
    entry's key prefixed with ``parent_key`` and a dot when ``parent_key`` is
    non-empty. For dict entries the four metadata fields are read from the
    entry, defaulting to ``''`` when absent; for non-dict entries all four
    are ``''``.

    When a dict entry has a ``'properties'`` key, that mapping is flattened
    recursively and its rows are placed directly after the parent's row.

    Args:
        schema: Mapping of key -> schema node (any object exposing ``.items()``).
        parent_key: Dotted prefix for the keys at this level ('' at the root).

    Returns:
        List of 5-tuples in depth-first, parent-before-children order.
    """
    metadata_fields = ('description', 'type', 'format', 'pattern')
    rows = []
    for name, node in schema.items():
        path = name if not parent_key else f'{parent_key}.{name}'
        if not isinstance(node, dict):
            # Non-dict values carry no metadata; record the key alone.
            rows.append((path, '', '', '', ''))
            continue
        rows.append((path, *(node.get(field, '') for field in metadata_fields)))
        # Descend only through an explicit 'properties' mapping.
        if 'properties' in node:
            rows += extract_keys(node['properties'], path)
    return rows

# Load the schema from the JSON file.
# The encoding is given explicitly: JSON text is UTF-8, while open() otherwise
# falls back to the platform's locale encoding, which may not be UTF-8 and can
# then fail on (or mis-decode) non-ASCII characters in the schema.
with open('../Data/Schemas/CVE_Record_Format.json', encoding='utf-8') as f:
    schema = json.load(f)

# Extract keys from the definitions section of the schema
# (a missing 'definitions' section simply yields an empty table)
definitions = schema.get('definitions', {})
keys = extract_keys(definitions)
df_keys = pd.DataFrame(keys, columns=['Key', 'Description', 'Type', 'Format', 'Pattern'])

# Sort by the 'Key' column; sort_values returns a new frame, so df_keys keeps
# the original extraction order
df_keys_sorted = df_keys.sort_values(by='Key')

# Create the output directory if it doesn't exist
output_dir = '../CVE Program/files'
os.makedirs(output_dir, exist_ok=True)

# Export the sorted DataFrame to a CSV file (row index omitted)
output_file = os.path.join(output_dir, 'CVE_Schema.csv')
df_keys_sorted.to_csv(output_file, index=False)

df_keys_sorted



In [4]:
# Mark empty-string cells as missing (NaN); df_keys itself is modified in place
df_keys.replace(to_replace='', value=np.nan, inplace=True)

# Share of non-missing entries per column, expressed as a percentage
percentages = df_keys.notna().mean().mul(100)

# Two-column summary table: column name and its fill percentage
percentages_df = percentages.rename_axis('Column').reset_index(name='Percentage')
percentages_df

