## Setup

### Imports

In [1]:
import json                       # To load Att&ck data as a dict
from tabulate import tabulate     # For printing tables
import pandas as pd               # Data structures and maniupulation 
import os                         # Creating dirs for organizing data
from functions.functions import * # Custom function imports

### Init Variables

In [2]:
# Load the enterprise attack data from JSON into memory as a dictionary 
enterprise_attack_data:list[dict] = json.load(open('data/enterprise-attack-15.0.json', 'r'))['objects']

## Initial Analysis

#### Familiarize with the dataset

In [3]:
# Print details
print(f'Number of records: {len(enterprise_attack_data)}\n')

# Get a complete list of all the types
list_of_types:list[str] = [d['type'] for d in enterprise_attack_data]

# Get a list of all the unique types 
unique_types:list[str] = list(set(list_of_types))
    
# Get the count for each type
type_counts:list[list[str, int]] = [
    [t, list_of_types.count(t)] for t in unique_types
]

# Sort the type counts in ascending order
type_counts = sorted(type_counts, key=lambda x: x[1])

# Print the types and their counts
print(tabulate(type_counts, headers=['Type', 'Count']))
    

Number of records: 21541

Type                      Count
----------------------  -------
identity                      1
marking-definition            1
x-mitre-matrix                1
x-mitre-tactic               14
campaign                     28
x-mitre-data-source          38
tool                         86
x-mitre-data-component      109
intrusion-set               165
course-of-action            284
malware                     596
attack-pattern              780
relationship              19438


### Extracting features

##### Functions

In [4]:
def filter_dict(lod:list[dict], k:str, v:str) -> list[dict]: 
    return [
        d for d in lod if d[k] == v
    ] 

In [5]:
def extract_external_references(d:dict) -> dict: 
    ''' Extracts the "external_references" values from the given dictionary and returns a dict containing the 
    key : val pairs.
    
        Args: 
            d (dict): a dictionary in the format of the MITRE tools, campaigns, etc. 
            
        Returns: 
            (dict): a dictionary with the sources extracted from "external_references" 
    '''
    # Extract the external_references
    try: external_references:list[dict] = d['external_references']
    
    # If the "external_references" key doesn't exist, then we can return an empty dict since there is 
    # nothing to extract
    except KeyError: return {}
    except TypeError: return {}
    
    # Iterate over the external references (sources) and extract the source information for each
    if external_references: 
        for s in external_references: 
        
            # Create a dict for this source
            this_source_dict:dict = {
                'id': d['id'],
                'source_name': s['source_name']
            }
            
            # Extract URL if available
            try: this_source_dict['url'] = s['url']
            except KeyError: this_source_dict['url'] = None 
            
            # Extract description if available
            try: this_source_dict['description'] = s['description']
            except KeyError: this_source_dict['description'] = None
            
            # Extract an external ID if available
            try: this_source_dict['external_id'] = s['external_id'] 
            except KeyError: this_source_dict['external_id'] = None
        
        return this_source_dict
    else:
        return {
            'id': None,
            'source_name': None,
            'url': None,
            'description': None,
            'external_id': None
        }

In [6]:
def handle_list_of_dict(lod:list[dict]) -> dict[str, list[dict]]:
    
    # Get all the keys for all the dictionaries 
    all_dict_keys:list[list[str]] = [ list(d.keys()) for d in lod ]

    # Sort this list in ascending order by length (i.e. by number of keys)
    all_sorted_keys:list[list[str]] = sorted(all_dict_keys, key=len)

    # Take the last list of keys for the "fill keys" since that has the most keys
    fill_keys:list[str] = all_sorted_keys[len(all_sorted_keys) - 1]

    # Dict for the 'external_references' column (sources)
    sources:list[dict] = []

    # Iterate over all the dictionaries
    for d in lod: 
        
        # For this dict, iterate over the fill keys and add any missing keys with None values
        for k in fill_keys: 
            if k not in d: d[k] = None
        
        # Since the object_marking_refs column is given as a list with only one str, extract that to be a column of strs instead
        try: d['object_marking_refs'] = d['object_marking_refs'][0]
        except TypeError: d['object_marking_refs'] = None
        
        # Extract the external_references 
        this_source_dict:dict = extract_external_references(d)
        
        # Append this_source_dict to the intrusion_set_sources list
        sources.append(this_source_dict) 
    
    return {
        'dicts': lod, 
        'sources': sources
    }

In [7]:
def create_csvs(lod:list[dict], sources:list[dict], filepath1:str, sources_filepath:str, print_debug:bool=False):
    
    # Convert the list of dicts (lod) to a df
    df1:pd.DataFrame = pd.DataFrame(lod)
    df1.drop('external_references', axis=1, inplace=True)

    # Convert the sources list (sources) to a df
    sources_df:pd.DataFrame = pd.DataFrame(sources)

    # Dump the DFs to CSVs
    df1.to_csv(filepath1, index=False)
    sources_df.to_csv(sources_filepath, index=False)

    # Print details
    if print_debug: 
        print(f'Saved {len(df1)} objects to "{filepath1}".') 
        print(f'Saved {len(sources)} sources to "{sources_filepath}".')

#### Extracting individual features

##### Matrix details

In [8]:
# Extracting the Matrix 
mitre_matrix = filter_dict(enterprise_attack_data, 'type', 'x-mitre-matrix')[0] 

# Dumping the mitre matrix to json for easier reading
with open('data/jsons/mitre-matrix.json', 'w+') as file:
    json.dump(mitre_matrix, file, indent=4)

##### Tactics

In [9]:
# Extract the tactics
tactics:list[dict] = filter_dict(enterprise_attack_data, 'type', 'x-mitre-tactic')

# Clean the tactics data and extract the Tactic ID (external_id), url (if available), and source (if available) from "external_references"
for t in tactics: 

    # Remove the newlines from the description
    t['description'] = t['description'].replace("\n", "")
    
    # Extract the object_markings_ref, which is currently a list of one string, to simply a string
    t['object_marking_refs'] = t['object_marking_refs'][0]
    
    # Extract the tactic ID
    t['tactic_id'] = t['external_references'][0]['external_id'] 
    
    # Extract the URL if available
    try: t['url'] = t['external_references'][0]['url']
    except KeyError: t['url'] = None
    
    # Extract the source name if available 
    try: t['source_name'] = t['external_references'][0]['source_name']
    except KeyError: t['source_name'] = None

# Convert the tactics to a dataframe
tactics_csv:str = 'data/csvs/tactics.csv'
tactics_df:pd.DataFrame = pd.DataFrame(tactics)

# Drop the 'external_references' column since it is now not needed
tactics_df.drop('external_references', axis=1, inplace=True)
tactics_df.to_csv(tactics_csv, index=False)

# Print details
print(f'Saved {len(tactics_df)} tactics to "{tactics_csv}".')

Saved 14 tactics to "data/csvs/tactics.csv".


##### Campaigns

In [10]:
# Extract the campaigns
campaigns:list[dict] = filter_dict(enterprise_attack_data, 'type', 'campaign')

# Use handle_list_of_dict function to parse out the campaigns and sources 
campaigns_result:dict[str, list[dict]] = handle_list_of_dict(campaigns)

standardized_campaigns:list[dict] = campaigns_result['dicts'] 
campaign_sources:list[dict] = campaigns_result['sources']

# Create the CSV files for the campaigns and campaign sources
create_csvs(
    campaigns, 
    campaign_sources, 
    'data/csvs/campaigns.csv', 
    'data/csvs/campaign_sources.csv', 
    print_debug=True
)


Saved 28 objects to "data/csvs/campaigns.csv".
Saved 28 sources to "data/csvs/campaign_sources.csv".


##### Tools

In [11]:
# Extract all the tools
tools:list[dict] = filter_dict(enterprise_attack_data, 'type', 'tool')

# Dump to JSON for easier reading
with open('data/jsons/tools.json', 'w+') as file: 
    json.dump(tools, file, indent=4)


In [12]:
# Use handle_list_of_dict function to parse out the tools and sources 
tools_result:dict[str, list[dict]] = handle_list_of_dict(tools)

standardized_tools:list[dict] = tools_result['dicts'] 
tool_sources:list[dict] = tools_result['sources']

# Create the CSV files for the tools and tool sources
create_csvs(
    tools, 
    tool_sources, 
    'data/csvs/tools.csv', 
    'data/csvs/tool_sources.csv', 
    print_debug=True
)

Saved 86 objects to "data/csvs/tools.csv".
Saved 86 sources to "data/csvs/tool_sources.csv".


##### Intrusion sets

**NOTE:** An "intrusion-set" is a threat actor (MITRE's naming choice, for some reason)

In [13]:
# Extracting the intrusion sets
intrusion_sets:list[dict] = filter_dict(enterprise_attack_data, 'type', 'intrusion-set')

# Dump JSON for easier reading 
with open('data/jsons/intrusion-sets.json', 'w+') as file:
    json.dump(intrusion_sets, file, indent=4)

In [14]:
# Use handle_list_of_dict function to parse out the intrusion sets and sources 
iset_results:dict[str, list[dict]] = handle_list_of_dict(intrusion_sets)

standardized_isets:list[dict] = iset_results['dicts'] 
iset_sources:list[dict] = iset_results['sources']

# Create the CSV files for the intrusion sets and sources
create_csvs(
    intrusion_sets, 
    iset_sources, 
    'data/csvs/intrusion_sets.csv', 
    'data/csvs/intrusion_set_sources.csv', 
    print_debug=True
)

Saved 165 objects to "data/csvs/intrusion_sets.csv".
Saved 165 sources to "data/csvs/intrusion_set_sources.csv".


##### Courses of action

In [15]:
# Extract the courses of action
courses_of_action:list[dict] = filter_dict(enterprise_attack_data, 'type', 'course-of-action')

# Dump JSON for easier reading
with open('data/jsons/courses_of_action.json', 'w+') as file:
    json.dump(courses_of_action, file, indent=4)

In [16]:
# Use handle_list_of_dict function to parse out the courses of action and sources 
coa_result:dict[str, list[dict]] = handle_list_of_dict(courses_of_action)

standardized_coa:list[dict] = coa_result['dicts'] 
coa_sources:list[dict] = coa_result['sources']

# Create the CSV files for the courses of action and sources
create_csvs(
    courses_of_action, 
    coa_sources, 
    'data/csvs/courses_of_action.csv', 
    'data/csvs/courses_of_action_sources.csv', 
    print_debug=True
)

Saved 284 objects to "data/csvs/courses_of_action.csv".
Saved 284 sources to "data/csvs/courses_of_action_sources.csv".


##### Malware

In [17]:
# Extract the malware 
malware:list[dict] = filter_dict(enterprise_attack_data, 'type', 'malware')

# Dump JSON for easier reading 
with open('data/jsons/malware.json', 'w+') as file:
    json.dump(malware, file, indent=4)

In [18]:
# Use handle_list_of_dict function to parse out the malware and sources 
malware_result:dict[str, list[dict]] = handle_list_of_dict(malware)

standardized_malware:list[dict] = malware_result['dicts'] 
malware_sources:list[dict] = malware_result['sources']

# Create the CSV files for the malware and sources
create_csvs(
    malware, 
    malware_sources, 
    'data/csvs/malware.csv', 
    'data/csvs/malware_sources.csv', 
    print_debug=True
)

Saved 596 objects to "data/csvs/malware.csv".
Saved 596 sources to "data/csvs/malware_sources.csv".


##### Attack patterns

In [19]:
# Extract all the attack patterns 
attack_patterns:list[dict] = filter_dict(enterprise_attack_data, 'type', 'attack-pattern')

# Dump JSON for easier reading 
with open('data/jsons/attack-patterns.json', 'w+') as file: 
    json.dump(attack_patterns, file, indent=4)


In [20]:
# Use handle_list_of_dict function to parse out the attack patterns and sources 
attack_pattern_results:dict[str, list[dict]] = handle_list_of_dict(attack_patterns)

standardized_attack_patterns:list[dict] = attack_pattern_results['dicts'] 
attack_pattern_sources:list[dict] = attack_pattern_results['sources']

# Create the CSV files for the attack patterns and sources
create_csvs(
    attack_patterns, 
    attack_pattern_sources, 
    'data/csvs/attack_patterns.csv', 
    'data/csvs/attack_pattern_sources.csv', 
    print_debug=True
)

Saved 780 objects to "data/csvs/attack_patterns.csv".
Saved 780 sources to "data/csvs/attack_pattern_sources.csv".


##### Relationships

In [21]:
# Extract all the relationships
relationships:list[dict] = filter_dict(enterprise_attack_data, 'type', 'relationship')

# Dump JSON for easier reading 
with open('data/jsons/relationships.json', 'w+') as file: 
    json.dump(relationships, file, indent=4)

In [22]:
# Use handle_list_of_dict function to parse out the relationships and sources 
relationship_results:dict[str, list[dict]] = handle_list_of_dict(relationships)

standardized_relationships:list[dict] = relationship_results['dicts'] 
relationship_sources:list[dict] = relationship_results['sources']

# Create the CSV files for the relationship and sources
create_csvs(
    relationships, 
    relationship_sources, 
    'data/csvs/relationships.csv', 
    'data/csvs/relationship_sources.csv', 
    print_debug=True
)

Saved 19438 objects to "data/csvs/relationships.csv".
Saved 19438 sources to "data/csvs/relationship_sources.csv".


# Analyzing Relationships

## Get all the relationships for each actor and dump to their own JSON 

**NOTE:** jsons with relationships for each actor are stored in [data/jsons/[actor_name]/relationships.json](/data/jsons/)

In [23]:
for actor in intrusion_sets: 
    # Print details
    print(f"\nactor name: {actor['name']}\nid: {actor['id']}\naliases: {actor['aliases']}")
    
    # Get all the relationships for this actor 
    these_relationships:list[dict] = get_all_relationships_for_actor(relationships, intrusion_sets, actor['name'])
    
    # Print details
    print(f'Actor "{actor["name"]}" has {len(these_relationships)} relationships.')
    
    # Dump JSON file with relationships for this actor
    if not os.path.exists(f'data/jsons/{actor["name"].replace(" ", "-")}/'):  # Check that the dir for this actor exists
        os.mkdir(f'data/jsons/{actor["name"].replace(" ", "-")}/')            # Create the dir if needed
        
    # Dump JSON 
    with open(f'data/jsons/{actor["name"].replace(" ", "-")}/relationships.json', 'w+') as file:
        json.dump(these_relationships, file, indent=4)


actor name: APT38
id: intrusion-set--00f67a77-86a4-4adf-be26-1a54fc713340
aliases: ['APT38', 'NICKEL GLADSTONE', 'BeagleBoyz', 'Bluenoroff', 'Stardust Chollima', 'Sapphire Sleet', 'COPERNICIUM']
Actor "APT38" has 50 relationships.

actor name: Indrik Spider
id: intrusion-set--01e28736-2ffc-455b-9880-ed4d1407ae07
aliases: ['Indrik Spider', 'Evil Corp', 'Manatee Tempest', 'DEV-0243']
Actor "Indrik Spider" has 29 relationships.

actor name: NEODYMIUM
id: intrusion-set--025bdaa9-897d-4bad-afa6-013ba5734653
aliases: ['NEODYMIUM']
Actor "NEODYMIUM" has 1 relationships.

actor name: Elderwood
id: intrusion-set--03506554-5f37-4f8f-9ce4-0e9f01a1b484
aliases: ['Elderwood', 'Elderwood Gang', 'Beijing Group', 'Sneaky Panda']
Actor "Elderwood" has 20 relationships.

actor name: SideCopy
id: intrusion-set--03be849d-b5a2-4766-9dda-48976bae5710
aliases: ['SideCopy']
Actor "SideCopy" has 18 relationships.

actor name: GALLIUM
id: intrusion-set--06a11b7e-2a36-47fe-8d3e-82c265df3258
aliases: ['GALLIUM',

## Getting the tools for each actor to dump to their own JSON 

In [30]:
for actor in intrusion_sets: 
    # Print details
    print(f"\nactor name: {actor['name']}\nid: {actor['id']}\naliases: {actor['aliases']}")
    
    # Get all the relationships with tools for this actor
    with open(f'data/jsons/{actor["name"].replace(" ", "-")}/relationships.json') as file: 
        all_actor_relationships:list[dict] = json.load(file)
    
    # From all the actor's relationships, extract those with tools
    rels_with_tools:list[dict] = relationships_with_tools(all_actor_relationships)
    
    # Extract the tools names for each relationship
    actor_tools:list[dict] = []
    
    for r in rels_with_tools: 
        this_tool_id:str = r['target_ref'] if 'tool' in r['target_ref'] else r['source_ref']

        actor_tools.append({
            'Tool Name': tool_name_from_id(tools, this_tool_id),
            'Description': r['description'],
            'Relationship ID': r['id'],
            'Relationship Type': r['relationship_type']
            }
        )
        
    # Print details
    print(
        tabulate(
            actor_tools, 
            headers={ k : k for k in actor_tools[0].keys() } if actor_tools else {}
        )
    )
    
    # Dump JSON file with relationships for this actor
    if not os.path.exists(f'data/jsons/{actor["name"].replace(" ", "-")}/'):  # Check that the dir for this actor exists
        os.mkdir(f'data/jsons/{actor["name"].replace(" ", "-")}/')            # Create the dir if needed
        
    # Dump JSON 
    with open(f'data/jsons/{actor["name"].replace(" ", "-")}/tools.json', 'w+') as file: 
        json.dump(actor_tools, file, indent=4)


actor name: APT38
id: intrusion-set--00f67a77-86a4-4adf-be26-1a54fc713340
aliases: ['APT38', 'NICKEL GLADSTONE', 'BeagleBoyz', 'Bluenoroff', 'Stardust Chollima', 'Sapphire Sleet', 'COPERNICIUM']
Tool Name    Description                         Relationship ID                                     Relationship Type
-----------  ----------------------------------  --------------------------------------------------  -------------------
Net          (Citation: FireEye APT38 Oct 2018)  relationship--1f0832ca-30f2-4f5a-8e92-1d15222fc087  uses
Mimikatz     (Citation: FireEye APT38 Oct 2018)  relationship--b56a1198-e23a-48f4-9744-7a6f087b79a3  uses

actor name: Indrik Spider
id: intrusion-set--01e28736-2ffc-455b-9880-ed4d1407ae07
aliases: ['Indrik Spider', 'Evil Corp', 'Manatee Tempest', 'DEV-0243']
Tool Name    Description                                   Relationship ID                                     Relationship Type
-----------  --------------------------------------------  ----------