<a href="https://colab.research.google.com/github/aiforsec22/IEEEEuroSP23/blob/main/notebooks/malware-similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing dependencies

## NOTE (FKH): I was able to run this notebook without any issues.

In [None]:
# Clone the LADDER repository into the current working directory.
!git clone https://github.com/aiforsec/LADDER.git

In [None]:
# Move into the LADDER checkout; the cells below read files via ./notebooks/... paths,
# which presumably are meant to resolve against this directory.
# Earlier alternatives, kept for reference:
# %cd LADDER/attack_pattern/

# %cd ../attack_pattern/

# NOTE(review): '../LADDER' is relative to wherever the kernel currently is. Directly after
# the clone above one would expect `%cd LADDER` - confirm the intended start directory.
%cd ../LADDER

/home/cisquad/FKH/LADDAR/LADDER


### Import modules

In [6]:
!python3 -m pip install -U scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.7 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [scikit-learn][0m [scikit-learn]
[1A

In [7]:
import numpy as np

from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

### Read all malware names, threat actors, and triples

In [11]:
# Print the working directory; the file reads below use paths relative to it.
!pwd

/home/cisquad/FKH/LADDAR/LADDER


In [12]:
# Load the malware names, one name per line.
with open('./notebooks/all_malware.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Keep every non-empty line. Filtering on emptiness (rather than slicing off the last
# split element) keeps the final name even when the file has no trailing newline, and
# skips blank lines instead of storing them as empty names.
malware = [line for line in text.split('\n') if line]

In [13]:
# Sanity check: number of malware names loaded.
len(malware)

1300

In [15]:
# Load the threat-actor names, one name per line.
with open('./notebooks/all_threat_actors.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Keep every non-empty line. Filtering on emptiness (rather than slicing off the last
# split element) keeps the final name even when the file has no trailing newline, and
# skips blank lines instead of storing them as empty names.
actors = [line for line in text.split('\n') if line]

In [16]:
# Sanity check: number of threat-actor names loaded.
len(actors)

235

In [17]:
def read_triples(fname):
    """Parse a tab-separated triple file into a list of ``[e1, r, e2]`` lists.

    Args:
        fname: Path of a UTF-8 text file holding one triple per line, with the
            three fields separated by tab characters.

    Returns:
        A list with one ``[e1, r, e2]`` entry per non-empty line, in file order.

    Raises:
        ValueError: If a non-empty line does not split into exactly three
            tab-separated fields (raised by the tuple unpacking).
    """
    parsed = []
    with open(fname, 'r', encoding='utf-8') as handle:
        for raw in handle:
            row = raw.rstrip('\n')
            if not row:
                # Blank lines (including the one after a trailing newline) carry no triple.
                continue
            head, relation, tail = row.split('\t')
            parsed.append([head, relation, tail])
    return parsed

In [19]:
# Load the triples from 150_all.txt. Note that `triples` is rebound to the 12k_all.txt
# data in a later cell.
triples = read_triples('./notebooks/150_all.txt')

In [20]:
def get_malware_nodes(mal, triples):
    """Collect the ``(other_entity, relation)`` pairs of every triple that mentions ``mal``.

    Args:
        mal: Entity name to look for on either side of a triple.
        triples: Iterable of ``(e1, r, e2)`` triples.

    Returns:
        A set of ``(entity, relation)`` tuples, where ``entity`` is the opposite
        end of a triple in which ``mal`` appears.
    """
    neighbours = set()
    for head, relation, tail in triples:
        if head != mal and tail != mal:
            continue
        # A match on the head takes precedence, so a self-loop contributes (tail, relation).
        other = tail if head == mal else head
        neighbours.add((other, relation))
    return neighbours

In [21]:
def get_all_malware_nodes(triples):
    """Build a ``{malware_name: node_set}`` mapping for the module-level ``malware`` list.

    Args:
        triples: Iterable of ``(e1, r, e2)`` triples to search.

    Returns:
        A dict keyed by malware name, in the order the names appear in ``malware``.
        Names whose node set is empty are left out.
    """
    candidates = ((name, get_malware_nodes(name, triples)) for name in malware)
    return {name: found for name, found in candidates if len(found) > 0}

In [22]:
def get_all_actor_nodes(triples):
    """Build a ``{actor_name: node_set}`` mapping for the module-level ``actors`` list.

    Each actor starts with its own ``(entity, relation)`` pairs. For every pair whose
    relation is ``'hasAuthor'``, the pairs of that linked entity (presumably malware
    attributed to the actor) are merged in as well, restricted to a fixed set of
    relations.

    Args:
        triples: Iterable of ``(e1, r, e2)`` triples to search.

    Returns:
        A dict keyed by actor name, in the order the names appear in ``actors``.
        Actors whose resulting node set is empty are left out.
    """
    inherited_relations = ('targets', 'uses', 'exploits', 'indicates', 'isA', 'variantOf')

    result = {}
    for actor in actors:
        linked = get_malware_nodes(actor, triples)
        # Iterate over a snapshot: the set itself grows inside the loop.
        for neighbour, relation in tuple(linked):
            if relation != 'hasAuthor':
                continue
            linked.update(
                pair
                for pair in get_malware_nodes(neighbour, triples)
                if pair[1] in inherited_relations
            )
        if len(linked) > 0:
            result[actor] = linked
    return result

In [23]:
# Build the malware -> node-set mapping from the triples currently bound to `triples`.
malware_nodes = get_all_malware_nodes(triples)

In [24]:
def get_distance(node1, node2, type='jaccard'):
    """Return a set-based dissimilarity between two node sets (smaller = more similar).

    Args:
        node1: First set of nodes.
        node2: Second set of nodes.
        type: Name of the measure to compute:
            'jaccard'   -> 1 - |A & B| / |A | B|
            'overlap'   -> 1 - |A & B| / min(|A|, |B|)
            'intersect' -> 1000 - |A & B|  (fixed offset, so a larger
                           intersection yields a smaller value)

    Returns:
        The distance as a number. Two empty sets are treated as identical
        (distance 0.0). For 'overlap', an empty set compared with a non-empty
        one gives the maximum distance 1.0.

    Raises:
        ValueError: If ``type`` is not one of the supported measure names.
    """
    shared = len(node1.intersection(node2))

    if type == 'intersect':
        return 1000 - shared

    if type == 'jaccard':
        total = len(node1.union(node2))
        if total == 0:
            # Both sets are empty: avoid dividing by zero and treat them as identical.
            return 0.0
        return 1 - shared / total

    if type == 'overlap':
        smaller = min(len(node1), len(node2))
        if smaller == 0:
            # The ratio is undefined when either set is empty.
            return 0.0 if len(node1) == len(node2) else 1.0
        return 1 - shared / smaller

    # Fail loudly instead of returning None, which would only surface later as an
    # obscure error when the distances are sorted.
    raise ValueError(
        "unknown distance type %r; expected 'intersect', 'jaccard' or 'overlap'" % (type,)
    )

In [25]:
def find_most_similar_malware(mal, triple_fname, top_k=5):
    """Rank the other malware in a triple file by Jaccard distance to ``mal``.

    Args:
        mal: Name of the query malware.
        triple_fname: Path of the tab-separated triple file to load.
        top_k: Maximum number of neighbours to return (defaults to 5).

    Returns:
        A list of up to ``top_k`` ``[name, distance]`` pairs, closest first. The
        query itself is never included. Ties keep the order in which the names
        appear in the malware node mapping.

    Raises:
        KeyError: If ``mal`` has no nodes in the given triple file.
    """
    triples = read_triples(triple_fname)
    malware_nodes = get_all_malware_nodes(triples)
    query_nodes = malware_nodes[mal]

    # Exclude the query by name instead of assuming it sorts to position 0: another
    # entry with an identical node set also has distance 0 and, if listed earlier,
    # would push the query itself into the results.
    dist = [
        [name, get_distance(query_nodes, nodes, 'jaccard')]
        for name, nodes in malware_nodes.items()
        if name != mal
    ]
    dist.sort(key=lambda pair: pair[1])

    return dist[:top_k]

### Find the malware most similar to FluBot

In [27]:
# Nearest neighbours of FluBot according to the 12k_all.txt triples.
find_most_similar_malware('FluBot', './notebooks/12k_all.txt')

[['TeaBot', 0.7906976744186046],
 ['Medusa', 0.8064516129032258],
 ['Gustuff', 0.8115942028985508],
 ['Ghimob', 0.823943661971831],
 ['Faketoken', 0.8260869565217391]]

In [None]:
# should print the following
# [['TeaBot', 0.7906976744186046],
#  ['Medusa', 0.8064516129032258],
#  ['Gustuff', 0.8115942028985508],
#  ['Ghimob', 0.823943661971831],
#  ['Faketoken', 0.8260869565217391]]

In [28]:
# Rebind the module-level `triples` and `malware_nodes` to the 12k_all.txt data so the
# next cell can inspect individual node sets.
triples = read_triples('./notebooks/12k_all.txt')
malware_nodes = get_all_malware_nodes(triples)

In [29]:
# Show the (node, relation) pairs that FluBot and TeaBot have in common.
mal_node_i = malware_nodes['FluBot']
mal_node_j = malware_nodes['TeaBot']

print(mal_node_i.intersection(mal_node_j))

{('Correos', 'targets'), ('T1512', 'uses'), ('Spanish', 'targets'), ('T1629', 'uses'), ('T1513', 'uses'), ('T1409', 'uses'), ('T1639', 'uses'), ('T1636', 'uses'), ('WhatsApp', 'targets'), ('T1418', 'uses'), ('trojan', 'isA'), ('T1616', 'uses'), ('T1625', 'uses'), ('Android', 'targets'), ('Google', 'targets'), ('bank', 'targets'), ('T1640', 'uses'), ('T1406', 'uses'), ('ransomware', 'isA'), ('T1481', 'uses'), ('ESET', 'targets'), ('UPS', 'targets'), ('German', 'targets'), ('T1626', 'uses'), ('apps', 'targets'), ('T1582', 'uses'), ('T1635', 'uses')}


In [30]:
def find_most_similar_threat_actor(act, triple_fname, top_k=5):
    """Rank the other threat actors in a triple file by Jaccard distance to ``act``.

    Args:
        act: Name of the query threat actor.
        triple_fname: Path of the tab-separated triple file to load.
        top_k: Maximum number of neighbours to return (defaults to 5).

    Returns:
        A list of up to ``top_k`` ``[name, distance]`` pairs, closest first. The
        query itself is never included. Ties keep the order in which the names
        appear in the actor node mapping.

    Raises:
        KeyError: If ``act`` has no nodes in the given triple file.
    """
    triples = read_triples(triple_fname)
    actor_nodes = get_all_actor_nodes(triples)
    query_nodes = actor_nodes[act]

    # Exclude the query by name instead of assuming it sorts to position 0: another
    # actor with an identical node set also has distance 0 and, if listed earlier,
    # would push the query itself into the results.
    dist = [
        [name, get_distance(query_nodes, nodes, 'jaccard')]
        for name, nodes in actor_nodes.items()
        if name != act
    ]
    dist.sort(key=lambda pair: pair[1])

    return dist[:top_k]

### Find the most similar threat actor to APT15

In [31]:
# Nearest neighbours of APT15 according to the 12k_all.txt triples.
find_most_similar_threat_actor('APT15', './notebooks/12k_all.txt')

[['GREF', 0.5333333333333333],
 ['Boyusec', 0.574468085106383],
 ['Ke3chang', 0.5833333333333333],
 ['APT-C-50', 0.8163265306122449],
 ['Kitten', 0.8333333333333334]]

In [None]:
# should print the following
# [['GREF', 0.5333333333333333],
#  ['Boyusec', 0.574468085106383],
#  ['Ke3chang', 0.5833333333333333],
#  ['APT-C-50', 0.8163265306122449],
#  ['Kitten', 0.8333333333333334]]

In [32]:
# Reload the 12k_all.txt triples and build the actor -> node-set mapping so the next
# cell can inspect individual node sets.
triples = read_triples('./notebooks/12k_all.txt')
actor_nodes = get_all_actor_nodes(triples)

In [33]:
# Show the (node, relation) pairs that APT15 and Boyusec have in common.
# The mal_node_* names are carried over from the malware cell; here they hold
# threat-actor node sets.
mal_node_i = actor_nodes['APT15']
mal_node_j = actor_nodes['Boyusec']

print(mal_node_i.intersection(mal_node_j))

{('Central', 'targets'), ('Kuwait', 'targets'), ('Xinjiang', 'targets'), ('Voxer', 'targets'), ('Muslim', 'targets'), ('RAT', 'isA'), ('Telegram', 'targets'), ('Syria', 'targets'), ('Android', 'targets'), ('Spyware', 'isA'), ('TIBBIYJAWHAR', 'targets'), ('Google', 'targets'), ('China', 'targets'), ('TalkBox', 'targets'), ('voice', 'targets'), ('third-party', 'targets'), ('surveillanceware', 'isA'), ('Uyghur', 'targets'), ('apps', 'targets'), ('music', 'targets')}
