## Setup

In [1]:
import json                       # To load Att&ck data as a dict
from tabulate import tabulate     # For printing tables
import pandas as pd               # Data structures and manipulation
import os                         # Creating dirs for organizing data
from functions.functions import * # Custom function imports

In [2]:
# Load the enterprise attack data from JSON into memory and keep only the list
# stored under its top-level 'objects' key.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default, which open() falls back to when no encoding is given.
with open('data/enterprise-attack-15.0.json', 'r', encoding='utf-8') as file:
    enterprise_attack_data: list[dict] = json.load(file)['objects']

## Initial Analysis

#### Familiarize with the dataset

In [3]:
# Print details
print(f'Number of records: {len(enterprise_attack_data)}\n')

# Get a complete list of all the types (one entry per record)
list_of_types: list[str] = [d['type'] for d in enterprise_attack_data]

# Tally every type in a single pass over the list. Calling list.count() once per
# unique type would rescan the whole list each time.
occurrences: dict[str, int] = {}
for record_type in list_of_types:
    occurrences[record_type] = occurrences.get(record_type, 0) + 1

# Get a list of all the unique types
unique_types: list[str] = list(set(list_of_types))

# Build one [type, count] row per unique type for tabulate
type_counts: list[list] = [[t, occurrences[t]] for t in unique_types]

# Sort the type counts in ascending order (sorted() is stable, so types with the
# same count keep the relative order they had in unique_types)
type_counts = sorted(type_counts, key=lambda x: x[1])

# Print the types and their counts
print(tabulate(type_counts, headers=['Type', 'Count']))
    

Number of records: 21541

Type                      Count
----------------------  -------
identity                      1
x-mitre-matrix                1
marking-definition            1
x-mitre-tactic               14
campaign                     28
x-mitre-data-source          38
tool                         86
x-mitre-data-component      109
intrusion-set               165
course-of-action            284
malware                     596
attack-pattern              780
relationship              19438


### Extracting features

##### Matrix details

In [4]:
# Extracting the Matrix: keep the first 'x-mitre-matrix' record that filter_dict
# returns, and fail with a clear message instead of a bare IndexError if none exists.
matrix_records = filter_dict(enterprise_attack_data, 'type', 'x-mitre-matrix')
if not matrix_records:
    raise ValueError('No "x-mitre-matrix" object found in the enterprise attack data.')
mitre_matrix = matrix_records[0]

# Make sure the JSON output directory exists so the notebook also runs on a
# fresh checkout where data/jsons has not been created yet.
os.makedirs('data/jsons', exist_ok=True)

# Dumping the mitre matrix to json for easier reading
with open('data/jsons/mitre-matrix.json', 'w') as file:
    json.dump(mitre_matrix, file, indent=4)

##### Tactics

In [5]:
# Extract the tactics. Every record is shallow-copied so the cleanup below only
# touches this cell's own dicts. If filter_dict hands back references to the loaded
# objects, cleaning them in place would make this cell non-idempotent: on a second
# run `['object_marking_refs'][0]` would index the already-flattened string and
# keep only its first character.
tactics: list[dict] = [
    dict(t) for t in filter_dict(enterprise_attack_data, 'type', 'x-mitre-tactic')
]

# Clean the tactics data and extract the Tactic ID (external_id), url (if available),
# and source (if available) from the first entry of "external_references"
for t in tactics:

    # Remove the newlines from the description
    t['description'] = t['description'].replace("\n", "")

    # Flatten object_marking_refs from a list to its first element
    # (assumes the list holds a single string, as the original comment states)
    t['object_marking_refs'] = t['object_marking_refs'][0]

    # Only the first external reference is consulted
    first_ref: dict = t['external_references'][0]

    # Extract the tactic ID (required: a missing key raises KeyError)
    t['tactic_id'] = first_ref['external_id']

    # Extract the URL and the source name, or None when the key is absent
    t['url'] = first_ref.get('url')
    t['source_name'] = first_ref.get('source_name')

# Output locations; create their directories if they do not exist yet
tactics_csv: str = 'data/csvs/tactics.csv'
tactics_json: str = 'data/jsons/tactics.json'
for out_path in (tactics_csv, tactics_json):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Convert the tactics to a dataframe, dropping the 'external_references' column
# since the needed values were copied into their own columns above
tactics_df: pd.DataFrame = pd.DataFrame(tactics).drop(columns='external_references')
tactics_df.to_csv(tactics_csv, index=False)

# Dump the tactics JSON (the dicts still carry 'external_references' here)
with open(tactics_json, 'w') as file:
    json.dump(tactics, file, indent=4)

# Print details
print(f'Saved {len(tactics_df)} tactics to "{tactics_csv}".')
print(f'Saved {len(tactics)} tactics to "{tactics_json}".')

Saved 14 tactics to "data/csvs/tactics.csv".
Saved 14 tactics to "data/jsons/tactics.json".


##### Campaigns

In [6]:
# Pull the campaign records out of the full dataset
campaigns: list[dict] = filter_dict(enterprise_attack_data, 'type', 'campaign')

# Run the records through handle_list_of_dict and unpack the two lists it returns
# under the 'dicts' and 'sources' keys
campaigns_result: dict[str, list[dict]] = handle_list_of_dict(campaigns)
standardized_campaigns, campaign_sources = (
    campaigns_result['dicts'],
    campaigns_result['sources'],
)

# Write the campaign list to disk as indented JSON
# (kept after the handle_list_of_dict call, as in the original cell order)
with open('data/jsons/campaigns.json', 'w+') as campaigns_json:
    json.dump(campaigns, campaigns_json, indent=4)

# Report how many campaigns went into the JSON file
print(f'Saved {len(campaigns)} campaigns to "data/jsons/campaigns.json".')

# Write the campaign and campaign-source CSV files.
# NOTE(review): `campaigns` (not `standardized_campaigns`, which is otherwise unused)
# is what gets passed to create_csvs - confirm this is intended.
create_csvs(
    campaigns,
    campaign_sources,
    'data/csvs/campaigns.csv',
    'data/csvs/campaign_sources.csv',
    print_debug=True,
)


Saved 28 campaigns to "data/jsons/campaigns.json".
Saved 28 objects to "data/csvs/campaigns.csv".
Saved 28 sources to "data/csvs/campaign_sources.csv".


##### Tools

In [7]:
# Pull the tool records out of the full dataset
tools: list[dict] = filter_dict(enterprise_attack_data, 'type', 'tool')

# Keep an indented JSON copy on disk so the records are easier to read
with open('data/jsons/tools.json', 'w+') as tools_json:
    json.dump(tools, tools_json, indent=4)


In [8]:
# Run the tool records through handle_list_of_dict and unpack the two lists it
# returns under the 'dicts' and 'sources' keys
tools_result: dict[str, list[dict]] = handle_list_of_dict(tools)
standardized_tools, tool_sources = tools_result['dicts'], tools_result['sources']

# Write the tool and tool-source CSV files.
# NOTE(review): `tools` (not `standardized_tools`, which is otherwise unused) is
# what gets passed to create_csvs - confirm this is intended.
create_csvs(
    tools,
    tool_sources,
    'data/csvs/tools.csv',
    'data/csvs/tool_sources.csv',
    print_debug=True,
)

Saved 86 objects to "data/csvs/tools.csv".
Saved 86 sources to "data/csvs/tool_sources.csv".


##### Intrusion sets

**NOTE:** In this dataset an "intrusion-set" object represents a threat actor group; ATT&CK stores its Groups under that STIX object type.

In [9]:
# Pull the intrusion-set records out of the full dataset
intrusion_sets: list[dict] = filter_dict(enterprise_attack_data, 'type', 'intrusion-set')

# Keep an indented JSON copy on disk so the records are easier to read
with open('data/jsons/intrusion-sets.json', 'w+') as intrusion_sets_json:
    json.dump(intrusion_sets, intrusion_sets_json, indent=4)

In [10]:
# Run the intrusion-set records through handle_list_of_dict and unpack the two
# lists it returns under the 'dicts' and 'sources' keys
iset_results: dict[str, list[dict]] = handle_list_of_dict(intrusion_sets)
standardized_isets, iset_sources = iset_results['dicts'], iset_results['sources']

# Write the intrusion-set and intrusion-set-source CSV files.
# NOTE(review): `intrusion_sets` (not `standardized_isets`, which is otherwise
# unused) is what gets passed to create_csvs - confirm this is intended.
create_csvs(
    intrusion_sets,
    iset_sources,
    'data/csvs/intrusion_sets.csv',
    'data/csvs/intrusion_set_sources.csv',
    print_debug=True,
)

Saved 165 objects to "data/csvs/intrusion_sets.csv".
Saved 165 sources to "data/csvs/intrusion_set_sources.csv".


##### Courses of action

In [11]:
# Pull the course-of-action records out of the full dataset
courses_of_action: list[dict] = filter_dict(enterprise_attack_data, 'type', 'course-of-action')

# Keep an indented JSON copy on disk so the records are easier to read
with open('data/jsons/courses_of_action.json', 'w+') as coa_json:
    json.dump(courses_of_action, coa_json, indent=4)

In [12]:
# Run the course-of-action records through handle_list_of_dict and unpack the two
# lists it returns under the 'dicts' and 'sources' keys
coa_result: dict[str, list[dict]] = handle_list_of_dict(courses_of_action)
standardized_coa, coa_sources = coa_result['dicts'], coa_result['sources']

# Write the course-of-action and course-of-action-source CSV files.
# NOTE(review): `courses_of_action` (not `standardized_coa`, which is otherwise
# unused) is what gets passed to create_csvs - confirm this is intended.
create_csvs(
    courses_of_action,
    coa_sources,
    'data/csvs/courses_of_action.csv',
    'data/csvs/courses_of_action_sources.csv',
    print_debug=True,
)

Saved 284 objects to "data/csvs/courses_of_action.csv".
Saved 284 sources to "data/csvs/courses_of_action_sources.csv".


##### Malware

In [13]:
# Pull the malware records out of the full dataset
malware: list[dict] = filter_dict(enterprise_attack_data, 'type', 'malware')

# Keep an indented JSON copy on disk so the records are easier to read
with open('data/jsons/malware.json', 'w+') as malware_json:
    json.dump(malware, malware_json, indent=4)

In [14]:
# Run the malware records through handle_list_of_dict and unpack the two lists it
# returns under the 'dicts' and 'sources' keys
malware_result: dict[str, list[dict]] = handle_list_of_dict(malware)
standardized_malware, malware_sources = malware_result['dicts'], malware_result['sources']

# Write the malware and malware-source CSV files.
# NOTE(review): `malware` (not `standardized_malware`, which is otherwise unused)
# is what gets passed to create_csvs - confirm this is intended.
create_csvs(
    malware,
    malware_sources,
    'data/csvs/malware.csv',
    'data/csvs/malware_sources.csv',
    print_debug=True,
)

Saved 596 objects to "data/csvs/malware.csv".
Saved 596 sources to "data/csvs/malware_sources.csv".


##### Attack patterns

In [15]:
# Pull the attack-pattern records out of the full dataset
attack_patterns: list[dict] = filter_dict(enterprise_attack_data, 'type', 'attack-pattern')

# Keep an indented JSON copy on disk so the records are easier to read
with open('data/jsons/attack-patterns.json', 'w+') as attack_patterns_json:
    json.dump(attack_patterns, attack_patterns_json, indent=4)


In [16]:
# Run the attack-pattern records through handle_list_of_dict and unpack the two
# lists it returns under the 'dicts' and 'sources' keys
attack_pattern_results: dict[str, list[dict]] = handle_list_of_dict(attack_patterns)
standardized_attack_patterns, attack_pattern_sources = (
    attack_pattern_results['dicts'],
    attack_pattern_results['sources'],
)

# Write the attack-pattern and attack-pattern-source CSV files.
# NOTE(review): `attack_patterns` (not `standardized_attack_patterns`, which is
# otherwise unused) is what gets passed to create_csvs - confirm this is intended.
create_csvs(
    attack_patterns,
    attack_pattern_sources,
    'data/csvs/attack_patterns.csv',
    'data/csvs/attack_pattern_sources.csv',
    print_debug=True,
)

Saved 780 objects to "data/csvs/attack_patterns.csv".
Saved 780 sources to "data/csvs/attack_pattern_sources.csv".


##### Relationships

In [17]:
# Pull the relationship records out of the full dataset
relationships: list[dict] = filter_dict(enterprise_attack_data, 'type', 'relationship')

# Keep an indented JSON copy on disk so the records are easier to read
with open('data/jsons/relationships.json', 'w+') as relationships_json:
    json.dump(relationships, relationships_json, indent=4)

In [18]:
# Run the relationship records through handle_list_of_dict and unpack the two
# lists it returns under the 'dicts' and 'sources' keys
relationship_results: dict[str, list[dict]] = handle_list_of_dict(relationships)
standardized_relationships, relationship_sources = (
    relationship_results['dicts'],
    relationship_results['sources'],
)

# Write the relationship and relationship-source CSV files.
# NOTE(review): `relationships` (not `standardized_relationships`, which is
# otherwise unused) is what gets passed to create_csvs - confirm this is intended.
create_csvs(
    relationships,
    relationship_sources,
    'data/csvs/relationships.csv',
    'data/csvs/relationship_sources.csv',
    print_debug=True,
)

Saved 19438 objects to "data/csvs/relationships.csv".
Saved 19438 sources to "data/csvs/relationship_sources.csv".


# Extracting & Analyzing Relationships

In [19]:
def _write_actor_json(actor_dir: str, filename: str, payload: list[dict]) -> None:
    """Dump `payload` as indented JSON to `<actor_dir>/<filename>`."""
    with open(f'{actor_dir}/{filename}', 'w') as file:
        json.dump(payload, file, indent=4)


# Iterate over all the actors in intrusion_sets and extract each type of relationship
# for each actor, and save to their own JSON files for easier analysis and organization.
for actor in intrusion_sets:

    # -- RELATIONSHIPS -- #
    these_relationships: list[dict] = get_all_relationships_for_actor(relationships, intrusion_sets, actor['name'])

    # Per-actor output directory: the actor name with spaces replaced by dashes.
    # Computed once here instead of being rebuilt for every file written below.
    actor_dir: str = f'data/jsons/{actor["name"].replace(" ", "-")}'

    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a no-op when
    # the directory is already there and also creates missing parent directories.
    os.makedirs(actor_dir, exist_ok=True)

    _write_actor_json(actor_dir, 'relationships.json', these_relationships)

    # -- TOOLS -- #
    tool_matches: list[dict] = get_entities_for_actor(tools, 'tool', actor, 'Tool')
    _write_actor_json(actor_dir, 'tools.json', tool_matches)

    # -- MALWARE -- #
    malware_matches: list[dict] = get_entities_for_actor(malware, 'malware', actor, 'Malware')
    _write_actor_json(actor_dir, 'malware.json', malware_matches)

    # -- ATTACK PATTERNS -- #
    ap_matches: list[dict] = get_entities_for_actor(attack_patterns, 'attack-pattern', actor, 'Attack Patterns')
    _write_actor_json(actor_dir, 'attack-patterns.json', ap_matches)

    # -- CAMPAIGNS -- #
    campaign_matches: list[dict] = get_entities_for_actor(campaigns, 'campaign', actor, 'Campaign')
    _write_actor_json(actor_dir, 'campaigns.json', campaign_matches)
        