In [100]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

def extract_cisa_techniques (url: str, sort_mode: str = None, look_up_table: pd.DataFrame = None):
    """Scrape the ATT&CK-style technique IDs listed in a CISA advisory page.

    The page is expected to contain exactly one HTML table holding technique
    IDs (``T1234`` or ``T1234.001``). IDs are returned in document order and
    duplicates are preserved.

    Args:
        url: Address of the advisory page to download.
        sort_mode: Currently unused; accepted so existing callers keep working.
        look_up_table: Currently unused; accepted so existing callers keep working.

    Returns:
        A list of technique-ID strings (possibly empty), or ``None`` when the
        page could not be fetched or when technique IDs were found in more
        than one table. A message is printed in both ``None`` cases.
    """
    try:
        # Without a timeout requests waits indefinitely on an unresponsive server.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network-level errors are reported the same way as a non-200 response.
        print('Failed to fetch the webpage.')
        return
    if response.status_code != 200:
        print('Failed to fetch the webpage.')
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # Four digits, optionally followed by a dot and a three-digit sub-technique.
    technique_pattern = re.compile(r'\bT\d{4}(?:\.\d{3})?\b')
    filtered_strings = []
    for table in soup.find_all('table'):
        # find_all(string=...) yields whole text nodes that merely *contain* a
        # match; findall pulls out just the IDs as plain str objects, which
        # also avoids keeping the parse tree alive through NavigableString.
        matched_elements = [
            technique_id
            for text_node in table.find_all(string=technique_pattern)
            for technique_id in technique_pattern.findall(text_node)
        ]
        if not matched_elements:
            continue
        if filtered_strings:
            # A second table with IDs makes the result ambiguous; bail out.
            print ('WARNING: Extracting Techniques from more than one table. Check the url.')
            return
        filtered_strings.extend(matched_elements)
    return filtered_strings

In [101]:
# Try the scraper on a single advisory and print the technique IDs it finds.
url = 'https://www.cisa.gov/news-events/cybersecurity-advisories/aa22-277a'
advisory_techniques = extract_cisa_techniques(url=url)
print(advisory_techniques)

['T1078', 'T1047', 'T1059', 'T1059.001', 'T1059.003', 'T1059.006', 'T1129', 'T1569', 'T1078', 'T1543', 'T1078', 'T1036.005', 'T1070', 'T1070.004', 'T1078', 'T1497.001', 'T1562.001', 'T1574', 'T1016', 'T1016.001', 'T1033', 'T1049', 'T1057', 'T1082', 'T1083', 'T1497.001', 'T1021.002', 'T1560.001', 'T1039', 'T1074.002', 'T1095', 'T1105', 'T1090', 'T1029', 'T1567.002']


In [102]:
# Build one row per CISA advisory: the report code and the techniques scraped from it.
report_codes =  ['aa22-277a','aa22-138b','aa22-174a']
group_IDs = []
interacted_techniques = []
for report_code in report_codes:
    url = 'https://www.cisa.gov/news-events/cybersecurity-advisories/' + report_code
    techniques = extract_cisa_techniques (url)
    # extract_cisa_techniques returns None when the page could not be fetched
    # or when IDs were found in more than one table. Storing that None would
    # make later len()/slicing on the column raise, so skip the advisory.
    if techniques is None:
        print ('WARNING: No techniques extracted for ' + report_code + '; skipping.')
        continue
    group_IDs.append(report_code)
    interacted_techniques.append (techniques)
data = {
    'group_ID': group_IDs,
    'interacted_techniques': interacted_techniques
}
test_data = pd.DataFrame (data=data)

In [103]:
# Show the scraped advisories: one row per report code with its technique list.
test_data

Unnamed: 0,group_ID,interacted_techniques
0,aa22-277a,"[T1078, T1047, T1059, T1059.001, T1059.003, T1..."
1,aa22-138b,"[T1588.001, T1059, T1059.004, T1203, T1505.003..."
2,aa22-174a,"[T1190, T1059.001, T1053.005, T1505.003, T1036..."


In [127]:
import sys
sys.path.append ('..')
from src.models.model1.recommend import get_cadidate_techniques, get_interacted_tactic_range
# Lookup table handed to get_cadidate_techniques for every query below.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
look_up_table = pd.read_pickle ('data/lookup_tables/m06c_test1.pkl')
n = 200

# Parallel accumulators, one entry per (advisory, split point) pair.
test_group_IDs = []
test_detected_techniques = []
test_true_subsequent_techniques = []
test_candidate_techniques = []

for _, advisory in test_data.iterrows():
    advisory_id = advisory['group_ID']
    techniques = advisory['interacted_techniques']
    # Each non-empty proper prefix of the list plays the role of "detected so
    # far"; the rest of the list is what the candidates are compared against.
    for split_at in range (1, len (techniques)):
        seen_so_far = techniques[:split_at]
        still_to_come = techniques[split_at:]
        suggested = get_cadidate_techniques (
            interacted_techniques = seen_so_far,
            look_up_table = look_up_table,
            n = n,
            mode = 'latest',
        )

        test_group_IDs.append (advisory_id)
        test_detected_techniques.append (seen_so_far)
        test_true_subsequent_techniques.append (still_to_come)
        test_candidate_techniques.append (suggested)

data = {
    'group_ID': test_group_IDs,
    'detected_techniques': test_detected_techniques,
    'candidate_techniques': test_candidate_techniques,
    'true_subsequent_techniques': test_true_subsequent_techniques,
}
res_df = pd.DataFrame(data)

# Per split: sizes of the three lists, how many candidates are true subsequent
# techniques, and then the true subsequent techniques that were not suggested.
for _, result in res_df.iterrows():
    detected = result['detected_techniques']
    candidates = result['candidate_techniques']
    expected = result['true_subsequent_techniques']
    hits = [technique for technique in candidates if technique in expected]
    print (len(detected), len(candidates), len(expected), len(hits))
    misses = [technique for technique in expected if technique not in candidates]
    print (misses)

1 181 34 24
['T1569', 'T1078', 'T1078', 'T1070', 'T1078', 'T1497.001', 'T1016.001', 'T1497.001', 'T1074.002', 'T1029']
2 178 33 24
['T1569', 'T1070', 'T1497.001', 'T1016.001', 'T1497.001', 'T1074.002', 'T1029']
3 183 32 23
['T1569', 'T1070', 'T1497.001', 'T1016.001', 'T1497.001', 'T1074.002', 'T1029']
4 183 31 22
['T1569', 'T1070', 'T1497.001', 'T1016.001', 'T1497.001', 'T1074.002', 'T1029']
5 183 30 21
['T1569', 'T1070', 'T1497.001', 'T1016.001', 'T1497.001', 'T1074.002', 'T1029']
6 188 29 20
['T1569', 'T1070', 'T1497.001', 'T1016.001', 'T1497.001', 'T1074.002', 'T1029']
7 254 28 22
['T1569', 'T1070', 'T1497.001', 'T1497.001']
8 388 27 22
['T1070', 'T1497.001', 'T1497.001']
9 388 26 22
['T1070', 'T1497.001', 'T1497.001']
10 364 25 21
['T1070', 'T1497.001', 'T1497.001']
11 364 24 21
['T1070', 'T1497.001', 'T1497.001']
12 309 23 20
['T1070', 'T1497.001', 'T1497.001']
13 383 22 21
[]
14 383 21 20
[]
15 383 20 19
[]
16 384 19 19
[]
17 384 18 18
[]
18 384 17 17
[]
19 183 16 16
[]
20 184 15

In [129]:
# Inspect the lookup-table row for T1029, one of the techniques the evaluation
# output lists as missing from the candidates.
is_t1029 = look_up_table['technique_ID'].eq('T1029')
look_up_table.loc[is_t1029]

Unnamed: 0,technique_ID,sorted_similar_techniques,technique_earliest_stage,technique_latest_stage
477,T1029,"[T1056, T1053.003, T1482, T1574.010, T1102.003...",11,11
