**Objective:** Rate threat actors based on the similarity of the tools they used in previous attacks. 

In [1]:
import json 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from functions.functions import name_from_id, generate_similarity_table
import pandas as pd

**Create arrays of tool IDs and threat actor IDs to maintain the row and column orders of the matrices as indexes** 

Example: 
```text
           t1   t2   t3 ... tN 
    TA1 |  0    0    1  ... 0   
    TA2 |  1    1    1  ... 1
    TA3 |  1    0    1  ... 1  
      . |  .    .    .   .  . 
      . |  .    .    .   .  .
      . |  .    .    .   .  . 
    TAM |  0    1    0  ... 1
```
In this example: 
- TAX is a threat actor.
- tX is a tool.
- M is the number of threat actors.
- N is the number of tools.


#### **Load data**

In [2]:
# Load the necessary data: the list of threat actors (intrusion sets) and the list of tools.
# The encoding is passed explicitly so the files are decoded the same way on every
# platform rather than with the locale's default encoding
# (assumes the files are UTF-8, as JSON normally is - TODO confirm).
with open('data/jsons/intrusion-sets.json', 'r', encoding='utf-8') as file: 
    threat_actors:list[dict] = json.load(file)
    
with open('data/jsons/tools.json', 'r', encoding='utf-8') as file: 
    tools:list[dict] = json.load(file)

#### **Construct the matrix with threat actors for rows and tools for columns**

In [3]:
# Get all the threat actor IDs as a list; a threat actor's position in this list is its matrix row
threat_actor_ids:list[str] = [ t['id'] for t in threat_actors ]

# Get all the tool IDs as a list; a tool's position in this list is its matrix column
tool_ids:list[str] = [ t['id'] for t in tools ]

# Convert the threat actor ids to names for meaningful analysis 
threat_actor_names:list[str] = [ name_from_id(threat_actors, i) for i in threat_actor_ids ]

# Define variables for the matrix dimensions
M:int = len(threat_actor_ids)   # M := number of rows, i.e. number of threat actors
N:int = len(tool_ids) # N := number of columns, i.e. number of tools

# Construct an MxN matrix of zeros (rows = threat actors, columns = tools).
# A plain ndarray is used instead of np.matrix, which NumPy no longer recommends;
# element assignment, astype, savetxt and np.asarray all work the same on it.
actor_tool_matrix:np.ndarray = np.zeros((M, N))

# Print details
print(f'Got M = {M} threat actor IDs (rows).')
print(f'Got N = {N} tool IDs (columns).')

Got M = 165 threat actor IDs (rows).
Got N = 86 tool IDs (columns).


#### **Populate the matrix**

In [4]:
# Iterate over the threat actors and mark the tools each one uses.
# The loop variables are named `actor` and `tool` so the inner loop no longer
# rebinds the outer loop's variable.
for actor in threat_actors: 
    
    # Get the ID for this threat actor and find its index in threat_actor_ids
    this_threat_actor_id:str = actor['id']
    this_row:int = threat_actor_ids.index(this_threat_actor_id)
     
    # Get the name and load the tools for this threat actor from its JSON file
    # (spaces in the name are replaced by hyphens to form the directory name)
    threat_actor_name:str = actor['name'] 
    with open(f'data/jsons/threat-groups/{threat_actor_name.replace(" ", "-")}/tools.json', 'r', encoding='utf-8') as file:
        these_tools:list[dict] = json.load(file) 
    
    # Iterate over the tools for this actor
    for tool in these_tools:
        
        # Get the tool's ID and find its index in tool_ids
        # (list.index raises ValueError if the ID is not in tool_ids)
        this_tool_id:str = tool['id']
        this_col:int = tool_ids.index(this_tool_id)
        
        # Set the value in actor_tool_matrix at ROW = this_row (i.e. threat actor ID) and COL = this_col (i.e. tool ID)
        # to 1, i.e. True, to represent that this threat actor uses this tool
        actor_tool_matrix[this_row, this_col] = 1

# Save the actor_tool_matrix 
np.savetxt("data/analysis-outcomes/actor-tool-similarities/actor-tool-matrix.csv", actor_tool_matrix.astype(float), delimiter=",", fmt="%f")

# Save the matrix row and column labels so the CSV rows/columns can be mapped back to IDs
with open('data/analysis-outcomes/actor-tool-similarities/matrix-labels.json', 'w+') as file: 
    json.dump({ 'rows': threat_actor_ids, 'columns': tool_ids }, file, indent=4)

#### **Compute similarities and save results**

In [5]:
# Column headers shared by the printed table and the saved CSV, defined once so
# the two cannot drift apart
similarity_table_headers:list[str] = ['Actor 1', 'Actor 2', 'Similarity Rating', 'Is Alias']

# Construct the similarity matrix: pairwise cosine similarity between the rows
# (threat actors) of the actor/tool matrix
similarity_matrix:np.ndarray = cosine_similarity(np.asarray(actor_tool_matrix))

# Save the similarity matrix 
np.savetxt("data/analysis-outcomes/actor-tool-similarities/actor-tool-similarity-matrix.csv", similarity_matrix.astype(float), delimiter=",", fmt="%f")

# Generate the similarity table from the similarity matrix; the helper returns
# the raw table rows and a printable rendering of them
table_results, pretty_table = generate_similarity_table(
    similarity_matrix, 
    threat_actor_names, 
    table_headers = similarity_table_headers,
    check_if_aliases=True
)

# Convert the table to a dataframe and save it
tool_sim_df:pd.DataFrame = pd.DataFrame(table_results, columns=similarity_table_headers)
tool_sim_df.to_csv('data/analysis-outcomes/actor-tool-similarities/tool-similarities-table.csv', index=False)

# Print the pretty table 
print(pretty_table)

Actor 1             Actor 2              Similarity Rating  Is Alias
------------------  -----------------  -------------------  ----------
APT-C-23            None                              0     False
APT-C-36            TA2541                            0.71  False
APT1                Threat Group-3390                 0.52  False
APT12               GALLIUM                           0.3   False
APT16               None                              0     False
APT17               None                              0     False
APT18               Honeybee                          0.5   False
APT19               Frankenstein                      1     False
APT28               APT38                             0.47  False
APT29               UNC2452                           1     True
APT3                Evilnum                           0.71  False
APT30               None                              0     False
APT32               APT38                             0.63  False
APT