**Objective:** Rate how similar threat actors are to one another, based on the malware they have used in previous attacks.

In [1]:
import json 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from functions.functions import name_from_id, generate_similarity_table
import pandas as pd

**Create lists of threat actor IDs and malware IDs; each ID's position in its list is used as the row or column index into the matrix, which keeps the row and column order fixed**

Example: 
```text
           m1   m2   m3 ... mN 
    TA1 |  0    0    1  ... 0   
    TA2 |  1    1    1  ... 1
    TA3 |  1    0    1  ... 1  
      . |  .    .    .   .  . 
      . |  .    .    .   .  .
      . |  .    .    .   .  . 
    TAM |  0    1    0  ... 1
```
In this example: 
- TAX is a threat actor.
- mX is a malware.
- M is the number of threat actors.
- N is the number of malware.
- A cell is 1 if the threat actor in that row uses the malware in that column, and 0 otherwise.


#### **Load data**

In [2]:
# Load the necessary data: the threat actor (intrusion set) records and the malware records.
# The files are opened explicitly as UTF-8 so that parsing the JSON does not depend on the
# platform's default locale encoding.
with open('data/jsons/intrusion-sets.json', 'r', encoding='utf-8') as file: 
    threat_actors:list[dict] = json.load(file)
    
with open('data/jsons/malware.json', 'r', encoding='utf-8') as file: 
    malware:list[dict] = json.load(file)

#### **Construct the matrix with threat actors for rows and malware for columns**

In [3]:
# Ordered ID lists: an ID's position in its list is its row / column index in the matrix
threat_actor_ids: list[str] = [actor['id'] for actor in threat_actors]
malware_ids: list[str] = [entry['id'] for entry in malware]

# Resolve each threat actor ID to its name (same order as threat_actor_ids) so that
# the analysis output is readable
threat_actor_names: list[str] = [
    name_from_id(threat_actors, actor_id) for actor_id in threat_actor_ids
]

# Matrix dimensions
M: int = len(threat_actor_ids)  # M := number of rows, i.e. number of threat actors
N: int = len(malware_ids)       # N := number of columns, i.e. number of malware

# Start from an all-zero M x N matrix (threat actors x malware)
actor_malware_matrix: np.matrix = np.matrix(np.zeros(shape=(M, N)))

# Report the dimensions
print(f'Got M = {M} threat actor IDs (rows).')
print(f'Got N = {N} malware IDs (columns).')

Got M = 165 threat actor IDs (rows).
Got N = 596 malware IDs (columns).


#### **Populate the matrix**

In [4]:
# Build ID -> index lookups once, so that locating a row/column is a constant-time dict access
# instead of a linear list.index() scan for every threat actor and every malware entry.
# setdefault keeps the FIRST position of any repeated ID, which is what list.index() returns.
row_by_actor_id:dict[str, int] = {}
for row, actor_id in enumerate(threat_actor_ids):
    row_by_actor_id.setdefault(actor_id, row)

col_by_malware_id:dict[str, int] = {}
for col, malware_id in enumerate(malware_ids):
    col_by_malware_id.setdefault(malware_id, col)

# Iterate over the threat actors and mark the malware each one uses
for t in threat_actors: 
    
    # Get the ID for this threat actor and look up its row in the matrix
    this_threat_actor_id:str = t['id']
    this_row:int = row_by_actor_id[this_threat_actor_id]
     
    # Get the name and load the malware for this threat actor from its JSON file
    # (the directory name is the actor name with spaces replaced by hyphens)
    threat_actor_name:str = t['name'] 
    with open(f'data/jsons/threat-groups/{threat_actor_name.replace(" ", "-")}/malware.json', 'r', encoding='utf-8') as file:
        these_malwares:list[dict] = json.load(file) 
    
    # Iterate over the malware for this actor
    for m in these_malwares:
        
        # Get the malware's ID and look up its column in the matrix
        this_malware_id:str = m['id']
        if this_malware_id not in col_by_malware_id:
            # Same exception type that list.index() raised for an unknown ID, but the
            # message names the offending threat actor and malware ID
            raise ValueError(
                f'Malware ID {this_malware_id!r} used by threat actor {threat_actor_name!r} '
                'is not present in malware_ids'
            )
        this_col:int = col_by_malware_id[this_malware_id]
        
        # Set the value in actor_malware_matrix at ROW = this_row (i.e. threat actor) and
        # COL = this_col (i.e. malware) to 1, i.e. True, to represent that this threat actor
        # uses this malware
        actor_malware_matrix[this_row, this_col] = 1

# Save the actor_malware_matrix as integer 0/1 values
np.savetxt("data/analysis-outcomes/actor-malware-similarities/actor-malware-matrix.csv", actor_malware_matrix.astype(int), delimiter=",", fmt="%d")

# Save the matrix row and column labels (the file is only written, never read back here)
with open('data/analysis-outcomes/actor-malware-similarities/matrix-labels.json', 'w', encoding='utf-8') as file: 
    json.dump({ 'rows': threat_actor_ids, 'columns': malware_ids }, file, indent=4)

#### **Compute similarities and save results**

In [5]:
# Column headers shared by the printed table and the saved CSV; defined once so the two
# cannot drift apart
similarity_table_headers:list[str] = ['Actor 1', 'Actor 2', 'Similarity Rating', 'Is Alias']

# Construct the similarity matrix: pairwise cosine similarity between the rows
# (threat actors) of the actor/malware matrix. cosine_similarity returns a plain ndarray.
similarity_matrix:np.ndarray = cosine_similarity(np.asarray(actor_malware_matrix))

# Generate the similarity table from the similarity matrix.
# NOTE(review): judging by the printed output, the helper appears to report one best-matching
# actor per row together with an alias flag — confirm against functions.functions.
# A copy of the header list is passed so the helper cannot alter the shared definition.
table_results, pretty_table = generate_similarity_table(
    similarity_matrix, 
    threat_actor_names, 
    table_headers = list(similarity_table_headers),
    check_if_aliases=True
)

# Convert the table to a dataframe and save it
malware_sim_df:pd.DataFrame = pd.DataFrame(table_results, columns=list(similarity_table_headers))
malware_sim_df.to_csv('data/analysis-outcomes/actor-malware-similarities/malware-similarities-table.csv', index=False)

# Print the pretty table 
print(pretty_table)

Actor 1             Actor 2              Similarity Rating  Is Alias
------------------  -----------------  -------------------  ----------
APT-C-23            None                              0     False
APT-C-36            None                              0     False
APT1                Moafee                            0.41  False
APT12               None                              0     False
APT16               None                              0     False
APT17               Leviathan                         0.32  False
APT18               Andariel                          0.35  False
APT19               Chimera                           1     False
APT28               None                              0     False
APT29               UNC2452                           1     True
APT3                Higaisa                           0.35  False
APT30               None                              0     False
APT32               Chimera                           0.32  False
APT