In [33]:
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
model_name = 'sentence-transformers/all-mpnet-base-v2'
base = 'example/ccm-4.0.7_OSCAL.json'
target = 'example/cis-controls-v8_OSCAL-1.0.json'
threshold = 0.7

In [44]:
with open(base, "r") as f:
    base_catalog = json.load(f)

In [65]:
base_controls = {}
groups = base_catalog.get('catalog', {}).get('groups', [])
for group in groups:
    for control in group.get('controls', []):
        if control.get('class') == "control":
            parts = control.get('parts')[0]
            base_controls[control.get('id')] = parts.get('prose').replace('\n',' ')
            
base_controls

{'A_A-01': 'Establish, document, approve, communicate, apply, evaluate and maintain audit and assurance policies and procedures and standards. Review and update the policies and procedures at least annually. ',
 'A_A-02': 'Conduct independent audit and assurance assessments according to relevant standards at least annually. ',
 'A_A-03': 'Perform independent audit and assurance assessments according to risk-based plans and policies. ',
 'A_A-04': 'Verify compliance with all relevant standards, regulations, legal/contractual, and statutory requirements applicable to the audit. ',
 'A_A-05': 'Define and implement an Audit Management process to support audit planning, risk analysis, security control assessment, conclusion, remediation schedules, report generation, and review of past reports and supporting evidence. ',
 'A_A-06': 'Establish, document, approve, communicate, apply, evaluate and maintain a risk-based corrective action plan to remediate audit findings, review and report remedi

In [66]:
with open(target, "r") as f:
    target_catalog = json.load(f)

In [67]:
target_controls = {}
controls = target_catalog.get('catalog', {}).get('controls', [])
for control in controls:
    parts = control.get('parts')[0]
    target_controls[control.get('id').upper()] = parts.get('prose').replace('   ','')
    
target_controls

{'CISC-1': 'Actively manage (inventory, track, and correct) all enterprise assets (end-user devices, including portable and mobile; network devices; non-computing/Internet of Things (IoT) devices; and servers) connected to the infrastructure, physically, virtually, remotely, and those within cloud environments, to accurately know the totality of assets that need to be monitored and protected within the enterprise. This will also support identifying unauthorized and unmanaged assets to remove or remediate.',
 'CISC-2': 'Actively manage (inventory, track, and correct) all software (operating systems and applications) on the network so that only authorized software is installed and can execute, and that unauthorized and unmanaged software is found and prevented from installation or execution.',
 'CISC-3': 'Develop processes and technical controls to identify, classify, securely handle, retain, and dispose of data.',
 'CISC-4': 'Establish and maintain the secure configuration of enterprise

In [68]:
control_mapping = {}
model = SentenceTransformer(model_name)

for id1 in base_controls:
    embedding1 = model.encode([base_controls[id1]], convert_to_tensor=True)
    max_similarity = -1
    max_id2 = None
    for id2 in target_controls:
        embedding2 = model.encode([target_controls[id2]], convert_to_tensor=True)
        similarity = cosine_similarity(embedding1.numpy(), embedding2.numpy())
        if similarity > max_similarity:
            max_similarity = similarity
            max_id2 = id2
    print(max_similarity)
    if max_similarity >= threshold:
        control_mapping[id1] = max_id2
    else:
        control_mapping[id1] = None

[[0.49242097]]
[[0.38692206]]
[[0.51238215]]
[[0.4578596]]
[[0.58352745]]
[[0.57123005]]
[[0.62161684]]
[[0.6726141]]
[[0.5565917]]
[[0.5724204]]
[[0.5594535]]
[[0.69325006]]
[[0.75342643]]
[[0.59185904]]
[[0.549426]]
[[0.56915]]
[[0.6116719]]
[[0.5911712]]
[[0.5997201]]
[[0.542198]]
[[0.6359583]]
[[0.68584925]]
[[0.58998334]]
[[0.42019957]]
[[0.6759041]]
[[0.4707355]]
[[0.65001714]]
[[0.6300105]]
[[0.41575956]]
[[0.5644032]]
[[0.46628314]]
[[0.46008128]]
[[0.6491146]]
[[0.6118677]]
[[0.6231085]]
[[0.5295866]]
[[0.5956124]]
[[0.5978773]]
[[0.6207986]]
[[0.5922498]]
[[0.49074143]]
[[0.5322894]]
[[0.36003986]]
[[0.47502303]]
[[0.40943322]]
[[0.57721996]]
[[0.5621742]]
[[0.5062301]]
[[0.50041324]]
[[0.520122]]
[[0.5585402]]
[[0.6313634]]
[[0.60076904]]
[[0.6105455]]
[[0.6739122]]
[[0.57863563]]
[[0.59843504]]
[[0.6238033]]
[[0.5485108]]
[[0.6102348]]
[[0.54717153]]
[[0.48633125]]
[[0.50957716]]
[[0.65676343]]
[[0.6263329]]
[[0.56050366]]
[[0.5793494]]
[[0.547752]]
[[0.45905703]]
[[0.73995

In [69]:
for control, mapping in control_mapping.items():
    if mapping is not None:
        print(f"{control} maps to {mapping}")
        print(f"Base description: {base_controls[control]}")
        print(f"Target description: {target_controls[mapping]}\n")

AIS-07 maps to CISC-16
Base description: Define and implement a process to remediate application security vulnerabilities, automating remediation when possible. 
Target description: Manage the security life cycle of in-house developed, hosted, or acquired software to prevent, detect, and remediate security weaknesses before they can impact the enterprise.

DSP-01 maps to CISC-3
Base description: Establish, document, approve, communicate, apply, evaluate and maintain policies and procedures for the classification, protection and handling of data throughout its lifecycle, and according to all applicable laws and regulations, standards, and risk level. Review and update the policies and procedures at least annually. 
Target description: Develop processes and technical controls to identify, classify, securely handle, retain, and dispose of data.

DSP-03 maps to CISC-3
Base description: Create and maintain a data inventory, at least for any sensitive data and personal data. 
Target descript