**Objective:** Rate threat actors based on their similarity in attack patterns. 

In [1]:
import json 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from functions.functions import name_from_id, generate_similarity_table
import pandas as pd

**Create lists of threat actor IDs and attack pattern IDs; the position of an ID in its list is the row (threat actor) or column (attack pattern) index used in the matrix** 

Example: 
```text
           t1   t2   t3 ... tN 
    TA1 |  0    0    1  ... 0   
    TA2 |  1    1    1  ... 1
    TA3 |  1    0    1  ... 1  
      . |  .    .    .   .  . 
      . |  .    .    .   .  .
      . |  .    .    .   .  . 
    TAM |  0    1    0  ... 1
```
In this example: 
- TAX is a threat actor.
- tX is a technique (attack pattern).
- M is the number of threat actors.
- N is the number of techniques (attack patterns).
- A cell is 1 if the threat actor uses that technique and 0 otherwise.

#### **Load data**

In [2]:
# Load the threat actor (intrusion set) and attack pattern collections.
# The encoding is pinned to UTF-8 because JSON text is UTF-8 and open() would otherwise
# fall back to the platform default, which is not UTF-8 on every system.
with open('data/jsons/intrusion-sets.json', 'r', encoding='utf-8') as file: 
    threat_actors:list[dict] = json.load(file)

with open('data/jsons/attack-patterns.json', 'r', encoding='utf-8') as file: 
    attack_patterns:list[dict] = json.load(file)

#### **Construct the matrix with threat actors for rows and attack patterns for columns**

In [3]:
# Column labels: the ID of every attack pattern, in the order they were loaded
attack_pattern_ids:list[str] = [ pattern['id'] for pattern in attack_patterns ]

# Row labels: the ID of every threat actor, in the order they were loaded
threat_actor_ids:list[str] = [ actor['id'] for actor in threat_actors ]

# Matrix dimensions
M:int = len(threat_actor_ids)   # M := number of rows, i.e. number of threat actors
N:int = len(attack_pattern_ids) # N := number of columns, i.e. number of attack patterns 

# Start from an all-zero M x N matrix (one row per threat actor, one column per attack pattern)
actor_ttp_matrix:np.matrix = np.matrix(np.zeros(shape=(M, N)))

# Human-readable row labels: resolve each threat actor ID to its name
threat_actor_names:list[str] = [ name_from_id(threat_actors, actor_id) for actor_id in threat_actor_ids ]

# Report the dimensions
print(f'Got M = {M} threat actor IDs (rows).')
print(f'Got N = {N} attack pattern IDs (columns).')

Got M = 165 threat actor IDs (rows).
Got N = 780 attack pattern IDs (columns).


#### **Populate the matrix**

In [4]:
def _first_index_by_id(ids:list[str]) -> dict[str, int]:
    """Map each ID to the index of its first occurrence in ``ids``.

    Gives the same answer as ``ids.index(identifier)`` for every ID in the list,
    but as a constant-time dictionary lookup instead of a linear scan.
    """
    lookup:dict[str, int] = {}
    for position, identifier in enumerate(ids):
        # setdefault keeps the earliest position if an ID appears more than once
        lookup.setdefault(identifier, position)
    return lookup


# Build the ID -> row / ID -> column lookups once, rather than scanning the ID lists
# with list.index() for every threat actor and every one of its attack patterns
actor_row_by_id:dict[str, int] = _first_index_by_id(threat_actor_ids)
attack_pattern_col_by_id:dict[str, int] = _first_index_by_id(attack_pattern_ids)

# Iterate over the threat actors and add their attack patterns as appropriate
for t in threat_actors: 

    # Get the ID for this threat actor and find its row index
    this_threat_actor_id:str = t['id']
    this_row:int = actor_row_by_id[this_threat_actor_id]

    # Get the name and load the attack patterns for this threat actor from the JSON.
    # The per-actor directory name is the actor name with spaces replaced by hyphens.
    threat_actor_name:str = t['name'] 
    with open(f'data/jsons/threat-groups/{threat_actor_name.replace(" ", "-")}/attack-patterns.json', 'r', encoding='utf-8') as file:
        these_attack_patterns:list[dict] = json.load(file) 

    # Iterate over the attack patterns for this actor
    for ap in these_attack_patterns:

        # Get the attack pattern's ID and find its column index
        this_ap_id:str = ap['id']
        if this_ap_id not in attack_pattern_col_by_id:
            # Same exception type list.index() raised, but naming the actor and the ID
            raise ValueError(
                f'Attack pattern {this_ap_id!r} used by {threat_actor_name!r} is not in attack_pattern_ids'
            )
        this_col:int = attack_pattern_col_by_id[this_ap_id]

        # Set the value in actor_ttp_matrix at the ROW = this_row (i.e. threat actor ID) and COL = this_col (i.e. attack pattern ID) 
        # to 1, i.e. True to represent that this threat actor employs this attack pattern 
        actor_ttp_matrix[this_row, this_col] = 1

# Save the actor_ttp_matrix as integer 0/1 values, one CSV row per threat actor
np.savetxt("data/analysis-outcomes/actor-attack-pattern-similarities/actor-ttp-matrix.csv", actor_ttp_matrix.astype(int), delimiter=",", fmt="%d")

# Save the matrix row and column labels so the CSV (which has no header) can be interpreted later
with open('data/analysis-outcomes/actor-attack-pattern-similarities/matrix-labels.json', 'w', encoding='utf-8') as file: 
    json.dump({ 'rows': threat_actor_ids, 'columns': attack_pattern_ids }, file, indent=4)

#### **Compute similarities and save results**

In [5]:
# Construct the similarity matrix: pairwise cosine similarity between the rows
# (threat actors) of actor_ttp_matrix. np.asarray converts the np.matrix to a plain
# ndarray before it is passed to scikit-learn, and the result is an ndarray as well.
similarity_matrix:np.ndarray = cosine_similarity(np.asarray(actor_ttp_matrix))

# Column headers shared by the pretty-printed table and the saved CSV; defined once so
# the two cannot drift apart
similarity_table_headers:list[str] = ['Actor 1', 'Actor 2', 'Similarity Rating', 'Is Alias']

# Generate the similarity table from the similarity matrix.
# NOTE(review): judging by the printed output, this appears to pair each actor with one
# other actor and a rating — confirm against generate_similarity_table's implementation.
table_results, pretty_table = generate_similarity_table(
    similarity_matrix, 
    threat_actor_names, 
    table_headers = list(similarity_table_headers),  # fresh list per call, as with the original literal
    check_if_aliases=True
)

# Convert the table to a dataframe and save it
attack_pattern_sim_df:pd.DataFrame = pd.DataFrame(table_results, columns=list(similarity_table_headers))
attack_pattern_sim_df.to_csv('data/analysis-outcomes/actor-attack-pattern-similarities/attack-pattern-similarities-table.csv', index=False)

# Print the pretty table results
print(pretty_table)

Actor 1             Actor 2               Similarity Rating  Is Alias
------------------  ------------------  -------------------  ----------
APT-C-23            None                               0     False
APT-C-36            WIRTE                              0.6   False
APT1                Ke3chang                           0.43  False
APT12               APT30                              0.63  False
APT16               Indrik Spider                      0.22  False
APT17               POLONIUM                           0.27  False
APT18               Metador                            0.46  False
APT19               APT32                              0.45  False
APT28               APT29                              0.45  False
APT29               UNC2452                            1     True
APT3                APT41                              0.5   False
APT30               TA459                              0.63  False
APT32               Lazarus Group                      