In [1]:
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Name handed to SentenceTransformer(...) when the embedding model is created.
model_name = 'sentence-transformers/all-mpnet-base-v2'
# Path of the JSON catalog whose controls are mapped *from*.
base = 'example/ccm-4.0.7_OSCAL.json'
# Path of the JSON catalog whose controls are mapped *to*.
target = 'example/cis-controls-v8_OSCAL-1.0.json'
# Minimum cosine similarity the best match must reach to be kept as a mapping;
# base controls whose best match scores lower are mapped to None.
threshold = 0.7

In [3]:
# Load the base catalog. JSON text is UTF-8, so the encoding is stated
# explicitly rather than inherited from the platform locale.
with open(base, "r", encoding="utf-8") as f:
    base_catalog = json.load(f)

In [4]:
# Build {control id: prose} for the base catalog.
# Walks catalog -> groups -> controls, keeps entries whose class is "control",
# and reads the prose of each control's first part with newlines turned into spaces.
base_controls = {}
groups = base_catalog.get('catalog', {}).get('groups', [])
for group in groups:
    for control in group.get('controls', []):
        if control.get('class') != "control":
            continue
        # `parts` may be missing or empty; fall back to an empty part so the
        # control is skipped below instead of raising TypeError/IndexError.
        first_part = (control.get('parts') or [{}])[0]
        prose = first_part.get('prose')
        if prose is None:
            # Nothing to embed for this control.
            continue
        base_controls[control.get('id')] = prose.replace('\n', ' ')
            
#base_controls

In [5]:
# Load the target catalog. JSON text is UTF-8, so the encoding is stated
# explicitly rather than inherited from the platform locale.
with open(target, "r", encoding="utf-8") as f:
    target_catalog = json.load(f)

In [6]:
# Build {UPPER-CASED control id: prose} for the target catalog.
# Controls are read directly from catalog -> controls; only the first part of
# each control is used, with runs of three spaces stripped from its prose.
target_controls = {}
controls = target_catalog.get('catalog', {}).get('controls', [])
for control in controls:
    control_id = control.get('id')
    # `parts` may be missing or empty; fall back to an empty part so the
    # control is skipped below instead of raising TypeError/IndexError.
    first_part = (control.get('parts') or [{}])[0]
    prose = first_part.get('prose')
    if control_id is None or prose is None:
        # Without an id or prose there is nothing to key or embed.
        continue
    target_controls[control_id.upper()] = prose.replace('   ', '')
    
#target_controls

In [14]:
# Map every base control to its most similar target control.
#
# Result: control_mapping[base_id] is the id of the target control with the
# highest cosine similarity, or None when that best score is below `threshold`
# (or when there are no target controls at all).
model = SentenceTransformer(model_name)

# Start with "no match" for every base control, in the original key order.
control_mapping = dict.fromkeys(base_controls)

base_ids = list(base_controls)
target_ids = list(target_controls)

if base_ids and target_ids:
    # Encode each catalog exactly once. Re-encoding every target text inside
    # the per-base-control loop repeated identical model calls.
    # NumPy output is requested directly so no tensor -> numpy conversion is
    # needed (that conversion fails for tensors living on a GPU).
    base_embeddings = model.encode(
        [base_controls[id1] for id1 in base_ids], convert_to_numpy=True
    )
    target_embeddings = model.encode(
        [target_controls[id2] for id2 in target_ids], convert_to_numpy=True
    )

    # similarity_matrix[i, j] = cosine similarity of base i and target j.
    similarity_matrix = cosine_similarity(base_embeddings, target_embeddings)

    for id1, scores in zip(base_ids, similarity_matrix):
        # argmax returns the first maximum, matching the strict `>` scan
        # (earliest target wins on ties).
        best_index = int(scores.argmax())
        if float(scores[best_index]) >= threshold:
            control_mapping[id1] = target_ids[best_index]

In [15]:
# Report every base control that found a target, with both descriptions.
for control, mapping in control_mapping.items():
    if mapping is None:
        # Unmapped controls are not reported.
        continue
    print(
        f"{control} maps to {mapping}",
        f"Base description: {base_controls[control]}",
        f"Target description: {target_controls[mapping]}\n",
        sep="\n",
    )

AIS-07 maps to CISC-16
Base description: Define and implement a process to remediate application security vulnerabilities, automating remediation when possible. 
Target description: Manage the security life cycle of in-house developed, hosted, or acquired software to prevent, detect, and remediate security weaknesses before they can impact the enterprise.

DSP-01 maps to CISC-3
Base description: Establish, document, approve, communicate, apply, evaluate and maintain policies and procedures for the classification, protection and handling of data throughout its lifecycle, and according to all applicable laws and regulations, standards, and risk level. Review and update the policies and procedures at least annually. 
Target description: Develop processes and technical controls to identify, classify, securely handle, retain, and dispose of data.

DSP-03 maps to CISC-3
Base description: Create and maintain a data inventory, at least for any sensitive data and personal data. 
Target descript