In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

def extract_cisa_techniques (url: str, sort_mode: str = None, look_up_table: pd.DataFrame = None, timeout: float = 30.0):
    """Scrape the MITRE ATT&CK technique IDs listed in a web page's table.

    The page at ``url`` is downloaded and every ``<table>`` is scanned for
    text that contains a technique ID (``T`` + 4 digits, optionally followed
    by a ``.`` and a 3-digit sub-technique number). Only the IDs themselves
    are returned, in document order and with duplicates preserved, so
    surrounding whitespace or words in a table cell never leak into the
    result.

    Args:
        url: Address of the page to scrape.
        sort_mode: Accepted for interface compatibility; not used here.
        look_up_table: Accepted for interface compatibility; not used here.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of technique ID strings (empty if no table contains any), or
        ``None`` when the page could not be fetched (non-200 status) or when
        more than one table contains technique IDs. A message is printed in
        both ``None`` cases.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print('Failed to fetch the webpage.')
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # Word boundaries reject look-alikes such as "T10591"; the optional group
    # accepts only well-formed sub-technique suffixes (".001").
    technique_pattern = re.compile(r'\bT\d{4}(?:\.\d{3})?\b')

    technique_ids = []
    for table in soup.find_all('table'):
        # find_all(string=...) yields whole text nodes that contain a match,
        # so pull the actual IDs back out of each node.
        table_ids = [
            technique_id
            for text in table.find_all(string=technique_pattern)
            for technique_id in technique_pattern.findall(str(text))
        ]
        if not table_ids:
            continue
        if technique_ids:
            # A second table with IDs makes the ordering ambiguous; bail out.
            print ('WARNING: Extracting Techniques from more than one table. Check the url.')
            return
        technique_ids.extend(table_ids)
    return technique_ids

In [3]:
# Smoke-test the scraper on one CISA advisory and print the technique IDs it finds.
url = 'https://www.cisa.gov/news-events/cybersecurity-advisories/aa22-277a'
print(extract_cisa_techniques(url))

['T1078', 'T1047', 'T1059', 'T1059.001', 'T1059.003', 'T1059.006', 'T1129', 'T1569', 'T1078', 'T1543', 'T1078', 'T1036.005', 'T1070', 'T1070.004', 'T1078', 'T1497.001', 'T1562.001', 'T1574', 'T1016', 'T1016.001', 'T1033', 'T1049', 'T1057', 'T1082', 'T1083', 'T1497.001', 'T1021.002', 'T1560.001', 'T1039', 'T1074.002', 'T1095', 'T1105', 'T1090', 'T1029', 'T1567.002']


In [4]:
# Build one row per CISA advisory: the advisory code and the ordered list of
# technique IDs scraped from its page.
report_codes = ['aa22-277a', 'aa22-138b', 'aa22-174a']
group_IDs = []
interacted_techniques = []
for report_code in report_codes:
    url = 'https://www.cisa.gov/news-events/cybersecurity-advisories/' + report_code
    techniques = extract_cisa_techniques (url)
    if techniques is None:
        # extract_cisa_techniques returns None when the fetch fails or the
        # page has technique IDs in several tables. Storing None would break
        # any later len()/slicing on the 'interacted_techniques' column, so
        # leave the advisory out and say so.
        print (f'Skipping {report_code}: no techniques could be extracted.')
        continue
    group_IDs.append (report_code)
    interacted_techniques.append (techniques)
data = {
    'group_ID': group_IDs,
    'interacted_techniques': interacted_techniques
}
test_data = pd.DataFrame (data=data)

In [5]:
# Display the advisory -> scraped-techniques frame built in the previous cell.
test_data

Unnamed: 0,group_ID,interacted_techniques
0,aa22-277a,"[T1078, T1047, T1059, T1059.001, T1059.003, T1..."
1,aa22-138b,"[T1588.001, T1059, T1059.004, T1203, T1505.003..."
2,aa22-174a,"[T1190, T1059.001, T1053.005, T1505.003, T1036..."


In [6]:
import sys
sys.path.append ('..')
from src.models.model1.recommend import get_cadidate_techniques, get_interacted_tactic_range
# NOTE(review): unpickling can execute arbitrary code; only load lookup
# tables that come from a trusted source.
look_up_table = pd.read_pickle('data/lookup_tables/m06c_1_1.pkl')
# Forwarded unchanged to get_cadidate_techniques as its `n` argument.
n = 200

test_group_IDs = []
test_detected_techniques = []
test_true_subsequent_techniques = []
test_candidate_techniques = []

# For every advisory, split its technique sequence at each position: the
# prefix plays the role of "already detected" techniques and the suffix is
# what the recommender is expected to cover.
for advisory_id, technique_sequence in zip(test_data['group_ID'], test_data['interacted_techniques']):
    for split_at in range(1, len(technique_sequence)):
        observed_prefix = technique_sequence[:split_at]
        remaining_suffix = technique_sequence[split_at:]
        suggested = get_cadidate_techniques(
            interacted_techniques=observed_prefix,
            look_up_table=look_up_table,
            n=n,
            mode='latest',
        )

        test_group_IDs.append(advisory_id)
        test_detected_techniques.append(observed_prefix)
        test_true_subsequent_techniques.append(remaining_suffix)
        test_candidate_techniques.append(suggested)

data = {
    'group_ID': test_group_IDs,
    'detected_techniques': test_detected_techniques,
    'candidate_techniques': test_candidate_techniques,
    'true_subsequent_techniques': test_true_subsequent_techniques,
}

res_df = pd.DataFrame(data=data)

# Per split, print: #detected, #candidates, #true subsequent, #candidates that
# appear among the true subsequent techniques; then the true subsequent
# techniques the candidate list missed.
for observed, suggested, expected in zip(
    res_df['detected_techniques'],
    res_df['candidate_techniques'],
    res_df['true_subsequent_techniques'],
):
    hits = [technique for technique in suggested if technique in expected]
    missed = [technique for technique in expected if technique not in suggested]
    print(len(observed), len(suggested), len(expected), len(hits))
    print(missed)

1 183 34 26
['T1129', 'T1078', 'T1078', 'T1070', 'T1078', 'T1497.001', 'T1497.001', 'T1029']
2 178 33 26
['T1129', 'T1070', 'T1497.001', 'T1497.001', 'T1029']
3 183 32 25
['T1129', 'T1070', 'T1497.001', 'T1497.001', 'T1029']
4 185 31 24
['T1129', 'T1070', 'T1497.001', 'T1497.001', 'T1029']
5 185 30 23
['T1129', 'T1070', 'T1497.001', 'T1497.001', 'T1029']
6 200 29 22
['T1129', 'T1070', 'T1497.001', 'T1497.001', 'T1029']
7 376 28 23
['T1497.001', 'T1497.001', 'T1029']
8 461 27 24
[]
9 461 26 24
[]
10 444 25 23
[]
11 444 24 23
[]
12 371 23 22
[]
13 382 22 21
[]
14 382 21 20
[]
15 382 20 19
[]
16 396 19 19
[]
17 396 18 18
[]
18 396 17 17
[]
19 185 16 16
[]
20 185 15 15
[]
21 185 14 14
[]
22 185 13 13
[]
23 185 12 12
[]
24 185 11 11
[]
25 185 10 10
[]
26 185 9 9
[]
27 141 8 8
[]
28 120 7 7
[]
29 120 6 6
[]
30 120 5 5
[]
31 83 4 4
[]
32 83 3 3
[]
33 83 2 2
[]
34 44 1 1
[]
1 200 13 9
['T1222.002', 'T1070', 'T1003.008', 'T1105']
2 181 12 8
['T1222.002', 'T1070', 'T1003.008', 'T1105']
3 280 11 

In [7]:
# Inspect a single evaluation row of res_df, selected by index label 57.
res_df.loc[57]

group_ID                                                              aa22-174a
detected_techniques           [T1190, T1059.001, T1053.005, T1505.003, T1036...
candidate_techniques          [T1071, T1071.004, T1071.002, T1071.003, T1071...
true_subsequent_techniques                                              [T1090]
Name: 57, dtype: object

In [8]:
# Show the lookup-table row(s) whose technique_ID is T1090.
is_t1090 = look_up_table['technique_ID'] == 'T1090'
look_up_table[is_t1090]

Unnamed: 0,technique_ID,sorted_similar_techniques,technique_earliest_stage,technique_latest_stage
447,T1090,"[T1087.002, T1210, T1069.001, T1586.002, T1132...",10,10
