**Objective:** Rate threat actors based on their similarity in attack patterns. 

In [9]:
import json 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from functions.functions import name_from_id
from tabulate import tabulate
import pandas as pd

**Create arrays of attack pattern IDs and threat actor IDs to maintain the row and column orders of the matrices as indexes** 

Example: 
```text
           t1   t2   t3 ... tN 
    TA1 |  0    0    1  ... 0   
    TA2 |  1    1    1  ... 1
    TA3 |  1    0    1  ... 1  
      . |  .    .    .   .  . 
      . |  .    .    .   .  .
      . |  .    .    .   .  . 
    TAM |  0    1    0  ... 1
```
In this example: 
- TAX is a threat actor.
- tX is a technique (attack pattern).
- M is the number of threat actors.
- N is the number of techniques (attack patterns).

The order of the rows must be maintained such that TA1 = "FANCY BEAR", TA2 = "SILENT CHOLLIMA", etc.

The order of the columns must be maintained such that t1 = "T1000", t2 = "T4021", etc. 


In [2]:
# Load the necessary data.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() without
# an encoding falls back to a platform-dependent default.
with open('data/jsons/intrusion-sets.json', 'r', encoding='utf-8') as file:
    threat_actors:list[dict] = json.load(file)

with open('data/jsons/attack-patterns.json', 'r', encoding='utf-8') as file:
    attack_patterns:list[dict] = json.load(file)

In [3]:
# Row labels: the 'id' of every threat actor, in the order they were loaded
threat_actor_ids:list[str] = [ actor['id'] for actor in threat_actors ]

# Column labels: the 'id' of every attack pattern, in the order they were loaded
attack_pattern_ids:list[str] = [ pattern['id'] for pattern in attack_patterns ]

# Matrix dimensions derived from the two label lists
M:int = len(threat_actor_ids)   # M := number of rows, i.e. number of threat actors
N:int = len(attack_pattern_ids) # N := number of columns, i.e. number of attack patterns

# Report how many IDs were collected
print(f'Got M = {M} threat actor IDs.')
print(f'Got N = {N} attack pattern IDs.')

Got M = 165 threat actor IDs.
Got N = 780 attack pattern IDs.


In [4]:
# Construct an MxN matrix of zeros: one row per threat actor, one column per attack pattern.
# NOTE(review): np.matrix is kept so the variable's type does not change for other cells,
# but NumPy no longer recommends this class; consider migrating to a plain ndarray.
actor_ttp_matrix:np.matrix = np.matrix(np.zeros((M,N)))

# Map every ID to its row/column position once, instead of scanning the ID lists with
# list.index() for each actor and each attack pattern. setdefault() keeps the first
# position of a repeated ID, which is the position list.index() would have returned.
row_of_threat_actor:dict[str, int] = {}
for row_position, actor_id in enumerate(threat_actor_ids):
    row_of_threat_actor.setdefault(actor_id, row_position)

col_of_attack_pattern:dict[str, int] = {}
for col_position, pattern_id in enumerate(attack_pattern_ids):
    col_of_attack_pattern.setdefault(pattern_id, col_position)

# Iterate over the threat actors and add their attack patterns as appropriate
for t in threat_actors:

    # Get the ID for this threat actor and look up its row in the matrix
    this_threat_actor_id:str = t['id']
    this_row:int = row_of_threat_actor[this_threat_actor_id]

    # Get the name and load the attack patterns for this threat actor from the JSON.
    # The per-actor directory name is the actor name with spaces replaced by hyphens.
    threat_actor_name:str = t['name']
    with open(f'data/jsons/{threat_actor_name.replace(" ", "-")}/attack-patterns.json', 'r', encoding='utf-8') as file:
        these_attack_patterns:list[dict] = json.load(file)

    # Iterate over the attack patterns for this actor
    for ap in these_attack_patterns:

        # Get the attack pattern's ID and look up its column in the matrix
        this_ap_id:str = ap['id']
        if this_ap_id not in col_of_attack_pattern:
            # Same exception type as the list.index() lookup this replaces, but the
            # message says which actor and which ID are inconsistent with the master list.
            raise ValueError(
                f'Attack pattern {this_ap_id!r} used by {threat_actor_name!r} '
                'is not present in attack_pattern_ids'
            )
        this_col:int = col_of_attack_pattern[this_ap_id]

        # Set the value in actor_ttp_matrix at the ROW = this_row (i.e. threat actor ID) and COL = this_col (i.e. attack pattern ID)
        # to 1, i.e. True to represent that this threat actor employs this attack pattern
        actor_ttp_matrix[this_row, this_col] = 1

# Save the actor_ttp_matrix as integer 0/1 values
np.savetxt("data/analysis-outcomes/actor-ttp-matrix.csv", actor_ttp_matrix.astype(int), delimiter=",", fmt="%d")

In [8]:

# Cosine similarity between every pair of rows (threat actors) of the actor/TTP matrix.
# The matrix is converted to a plain ndarray before being handed to scikit-learn.
actor_ttp_array = np.asarray(actor_ttp_matrix)
similarity_matrix = cosine_similarity(actor_ttp_array)

# Function to find the most similar row for each row
def find_most_similar_rows(similarity_matrix):
    """Return, for every row, the index of the other row it is most similar to.

    Args:
        similarity_matrix: square 2-D array-like where entry [i, j] is the
            similarity between item i and item j.

    Returns:
        A list of ints with one entry per row. Entry i is the column index of
        the largest value in row i, ignoring the diagonal (self-similarity).
        Ties resolve to the lowest index. An empty input gives an empty list.

    The caller's matrix is left untouched: the diagonal is masked on a private
    float copy rather than overwritten in place.
    """
    # Work on a float copy so the input keeps its real self-similarity values
    scores = np.array(similarity_matrix, dtype=float)

    row_count = scores.shape[0]
    if row_count == 0:
        return []

    # Mask self-similarity with -inf rather than -1: -1 is a legitimate cosine
    # similarity and could otherwise tie with (and lose to) the diagonal.
    diagonal = np.arange(row_count)
    scores[diagonal, diagonal] = -np.inf

    # Index of the best remaining match in each row
    return scores.argmax(axis=1).tolist()

# For every threat actor (row), the index of its closest other threat actor
most_similar_rows = find_most_similar_rows(similarity_matrix)

# One (actor name, closest actor name, rounded similarity) tuple per threat actor.
# NOTE: a rating of 0 means the actor has no similarity to any other actor; the index
# then comes from argmax's first-index tie-break, so the "match" shown is arbitrary.
table_results:list[tuple] = [
    (
        name_from_id(threat_actors, threat_actor_ids[actor_idx]),
        name_from_id(threat_actors, threat_actor_ids[match_idx]),
        round(similarity_matrix[actor_idx, match_idx], 2),
    )
    for actor_idx, match_idx in enumerate(most_similar_rows)
]

result_headers = ['Actor 1', 'Actor 2', 'Similarity Rating']
print(tabulate(table_results, headers=result_headers))


Actor 1             Actor 2               Similarity Rating
------------------  ------------------  -------------------
APT38               Operation Wocao                    0.43
Indrik Spider       Aquatic Panda                      0.34
NEODYMIUM           APT38                              0
Elderwood           Mofang                             0.71
SideCopy            Sidewinder                         0.55
GALLIUM             menuPass                           0.48
APT17               POLONIUM                           0.27
APT3                APT41                              0.5
Mustard Tempest     Transparent Tribe                  0.46
GCMAN               Fox Kitten                         0.22
Kimsuky             Lazarus Group                      0.52
EXOTIC LILY         Mofang                             0.46
admin@338           Darkhotel                          0.46
Volt Typhoon        Operation Wocao                    0.51
Patchwork           MONSOON                 

**Save results**

In [10]:
# Put the (actor, closest actor, rating) tuples into a labelled DataFrame
similarity_columns = ['Actor 1', 'Actor 2', 'Similarity Rating']
attack_pattern_sim_df:pd.DataFrame = pd.DataFrame(table_results, columns=similarity_columns)

# Write the table to disk without the DataFrame's positional index
attack_pattern_sim_df.to_csv(
    'data/analysis-outcomes/attack-pattern-similarities.csv',
    index=False,
)

Unnamed: 0,Actor 1,Actor 2,Similarity Rating
0,APT38,Operation Wocao,0.43
1,Indrik Spider,Aquatic Panda,0.34
2,NEODYMIUM,APT38,0.0
3,Elderwood,Mofang,0.71
4,SideCopy,Sidewinder,0.55
