In [None]:
!pip install requests beautifulsoup4

In [1]:
import requests
import time
from bs4 import BeautifulSoup
import re
import json
import html



# proxy = 'http://127.0.0.1:7890'

def get_request(url, *, timeout=30, max_retries=None):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Network failures and HTTP error statuses other than 404 are retried
    after a one-second pause.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before the attempt counts
            as failed. Without a timeout ``requests`` can block
            indefinitely on an unresponsive server.
        max_retries: Maximum number of retries after a failed attempt.
            ``None`` (the default) keeps retrying forever.

    Returns:
        The response text, or ``None`` when the server answers 404.

    Raises:
        requests.exceptions.RequestException: The last error, once
            ``max_retries`` retries have been used up.
    """
    failures = 0
    while True:
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 404:
                # A missing page is a definitive answer; retrying cannot help.
                return None
            response.raise_for_status()  # other 4xx/5xx statuses raise HTTPError
            return response.text
        except requests.exceptions.RequestException as err:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print(f"Network error occurred: {err}. Retrying...")
            time.sleep(1)  # Wait for 1 second before retrying

def clean_text(raw_text):
    """Return ``raw_text`` without numeric citations and with plain spaces.

    Three passes are applied in order: bracketed numeric citation markers
    such as ``[12]`` are dropped, HTML character references are decoded to
    Unicode, and every single whitespace character (including the
    non-breaking and ideographic spaces) is replaced by one ASCII space.
    Runs of whitespace are not collapsed.
    """
    without_citations = re.sub(r'\[\d+\]', '', raw_text)
    decoded = html.unescape(without_citations)
    return re.sub(r'\s|\xa0|\u3000', ' ', decoded)

def get_sub_technique(sub_tech, mitre_url):
    """Scrape one sub-technique page and return its details as a dict.

    Args:
        sub_tech: The ``<a>`` tag of the sub-technique. Its text becomes the
            name; its ``data-subtechnique_id`` and ``href`` attributes are
            read.
        mitre_url: Base URL of the site; must end with ``/`` because the
            leading ``/`` of the href is dropped before joining.

    Returns:
        A dict with the keys ``name``, ``description``,
        ``detailed_description``, ``id`` and ``examples``. If the page
        answers 404 or lacks the expected markup, the affected fields keep
        their empty defaults instead of raising.
    """
    sub_technique = {
        'name': sub_tech.text.strip(),
        'description': '',
        'detailed_description': '',
        'id': sub_tech['data-subtechnique_id'].strip(),
        'examples': []
    }
    sub_tech_href = sub_tech['href'][1:]  # remove the first '/'

    # request for sub technique
    sub_tech_html = get_request(mitre_url + sub_tech_href)
    if sub_tech_html is None:
        # get_request returns None on HTTP 404; there is nothing to parse.
        return sub_technique
    soup = BeautifulSoup(sub_tech_html, 'html.parser')

    h2_titles = soup.find_all('h2', attrs={'class': 'pt-3', 'id': 'examples'})
    if h2_titles and h2_titles[0].text == 'Procedure Examples':
        # On the MITRE page the examples table is the first table whenever
        # the examples heading exists.
        container = soup.find('div', {'class': 'tables-mobile'})
        table = container.table if container is not None else None
        tbody = table.tbody if table is not None else None
        rows = tbody.find_all('tr') if tbody is not None else []
        for example in rows:
            cells = example.find_all('td')
            if len(cells) < 3:
                # Not an id / name / description row; skip it.
                continue
            sub_technique['examples'].append({
                'id': cells[0].text.strip(),
                'name': cells[1].text.strip(),
                'description': clean_text(cells[2].text.strip())
            })

    dd_div = soup.find('div', attrs={'class': 'description-body'})  # detailed_description
    ps = dd_div.find_all('p') if dd_div is not None else []
    if ps:
        # The first paragraph doubles as the short description.
        summary = clean_text(ps[0].text.strip())
        remaining = [clean_text(p.text).strip() for p in ps[1:]]
        sub_technique['description'] = summary
        sub_technique['detailed_description'] = '\n'.join([summary, *remaining])

    return sub_technique


def get_technique(tech, mitre_url):
    """Scrape one technique row of a tactic page, including its sub-techniques.

    Args:
        tech: A ``<tr>`` tag from a tactic page's technique table. Rows whose
            ``class`` attribute holds more than one value (the
            ``['sub', 'technique']`` rows) are skipped.
        mitre_url: Base URL of the site; must end with ``/`` because the
            leading ``/`` of each href is dropped before joining.

    Returns:
        ``None`` for a sub-technique row. Otherwise a dict with the keys
        ``name``, ``description``, ``detailed_description``, ``id``,
        ``examples`` and ``sub_techniques``. If the technique page answers
        404 or lacks the expected markup, the affected fields keep their
        empty defaults instead of raising.
    """
    if len(tech['class']) > 1:
        # skip the ['sub', 'technique']
        return None

    tds = tech.find_all('td')
    tech_href = tds[1].a['href'][1:]  # remove the first '/'
    technique = {
        'name': tds[1].text.strip(),
        'description': clean_text(tds[2].text.strip()),
        'detailed_description': '',
        'id': tds[0].text.strip(),
        'examples': [],
        'sub_techniques': []
    }

    # request for technique
    technique_html = get_request(mitre_url + tech_href)
    if technique_html is None:
        # get_request returns None on HTTP 404; keep what the row provided.
        return technique
    soup = BeautifulSoup(technique_html, 'html.parser')

    dd_div = soup.find('div', attrs={'class': 'description-body'})  # detailed_description
    ps = dd_div.find_all('p') if dd_div is not None else []
    if ps:
        first = clean_text(ps[0].text.strip())
        remaining = [clean_text(p.text).strip() for p in ps[1:]]
        technique['detailed_description'] = '\n'.join([first, *remaining])

    h2_titles = soup.find_all('h2', attrs={'class': 'pt-3', 'id': 'examples'})
    if h2_titles and h2_titles[0].text == 'Procedure Examples':
        # On the MITRE page the examples table is the first table whenever
        # the examples heading exists.
        container = soup.find('div', {'class': 'tables-mobile'})
        table = container.table if container is not None else None
        tbody = table.tbody if table is not None else None
        rows = tbody.find_all('tr') if tbody is not None else []
        for example in rows:
            cells = example.find_all('td')
            if len(cells) < 3:
                # Not an id / name / description row; skip it.
                continue
            technique['examples'].append({
                'id': cells[0].text.strip(),
                'name': cells[1].text.strip(),
                'description': clean_text(cells[2].text.strip())
            })

    # sub techniques
    # attrs must be a dict; the class filter is spelled out explicitly.
    sub_techniques = soup.find_all('a', attrs={'class': 'subtechnique-table-item'})
    # The anchors alternate between the ID link and the name link; only the
    # name links (odd positions) are processed.
    for count, sub_tech in enumerate(sub_techniques[1::2], start=1):
        sub_technique = get_sub_technique(sub_tech, mitre_url)
        technique['sub_techniques'].append(sub_technique)
        print('    - sub technique {} finished : {}'.format(count, sub_technique['name']))

    return technique

def get_tactic(tact, mitre_url):
    """Scrape one tactic and every top-level technique listed on its page.

    Args:
        tact: A tactic cell from the landing-page matrix. The text, ``title``
            and ``href`` of its first ``<a>`` give the name, id and page
            link.
        mitre_url: Base URL of the site; must end with ``/`` because the
            leading ``/`` of the href is dropped before joining.

    Returns:
        A dict with the keys ``name``, ``description``, ``id`` and
        ``techniques``. If the tactic page answers 404, or has no
        ``description-body`` div, the description stays empty instead of
        raising; on 404 the technique list stays empty as well.
    """
    tactic = {
        'name': tact.a.text.strip(),
        'description': '',
        'id': tact.a['title'].strip(),
        'techniques': []
    }
    tactic_href = tact.a['href'][1:]  # remove the first '/'

    # request for tactic
    tactic_html = get_request(mitre_url + tactic_href)
    if tactic_html is None:
        # get_request returns None on HTTP 404; there is nothing to parse.
        return tactic
    soup = BeautifulSoup(tactic_html, 'html.parser')

    description = soup.find('div', attrs={'class': 'description-body'})
    if description is not None:
        tactic['description'] = description.text.strip()

    # techniques
    for tech in soup.find_all('tr', attrs={'class': 'technique'}):
        technique = get_technique(tech, mitre_url)
        if technique is None:
            # skip the sub technique
            continue
        tactic['techniques'].append(technique)
        # The list length doubles as the running 1-based counter.
        print(' - technique {} finished : {}'.format(len(tactic['techniques']), technique['name']))

    return tactic

def get_mitre():
    """Scrape the whole matrix starting from the MITRE ATT&CK landing page.

    The tactic cells are taken from the first table matched by the class
    filter ``['matrix', 'side']``; each one is handed to ``get_tactic`` and
    a progress line is printed once it is done.

    Returns:
        A dict with the single key ``tactics`` holding the tactic dicts in
        page order.
    """
    base_url = 'https://attack.mitre.org/'
    landing_page = BeautifulSoup(get_request(base_url), 'html.parser')

    matrix = landing_page.find('table', attrs={'class': ['matrix', 'side']})
    tactic_cells = matrix.find_all('td', attrs={'class' : 'tactic name'})

    collected = []
    for position, cell in enumerate(tactic_cells, start=1):
        entry = get_tactic(cell, base_url)
        collected.append(entry)
        print('tactic {} finished : {}'.format(position, entry['name']))

    return {'tactics': collected}

# Run the full scrape. Progress lines are printed as each sub-technique,
# technique and tactic finishes.
mitre_dict = get_mitre()



    - sub technique 1 finished : Scanning IP Blocks
    - sub technique 2 finished : Vulnerability Scanning
    - sub technique 3 finished : Wordlist Scanning
 - technique 1 finished : Active Scanning
    - sub technique 1 finished : Hardware
    - sub technique 2 finished : Software
    - sub technique 3 finished : Firmware
    - sub technique 4 finished : Client Configurations
 - technique 2 finished : Gather Victim Host Information
    - sub technique 1 finished : Credentials
    - sub technique 2 finished : Email Addresses
    - sub technique 3 finished : Employee Names
 - technique 3 finished : Gather Victim Identity Information
    - sub technique 1 finished : Domain Properties
    - sub technique 2 finished : DNS
    - sub technique 3 finished : Network Trust Dependencies
    - sub technique 4 finished : Network Topology
    - sub technique 5 finished : IP Addresses
    - sub technique 6 finished : Network Security Appliances
 - technique 4 finished : Gather Victim Network Infor

In [2]:
# Persist the scraped matrix next to the notebook as JSON.
with open('./mitre.json', 'w') as out_file:
    json.dump(obj=mitre_dict, fp=out_file)

In [48]:
# Exploratory walk over the matrix: for every tactic cell on the landing
# page, fetch the tactic page, record its description and print the link of
# each top-level technique.
mitre_url = 'https://attack.mitre.org/'
mitre_main = get_request(mitre_url)
soup = BeautifulSoup(mitre_main, 'html.parser')

tactics = soup.find_all('td', attrs={'class' : 'tactic name'})

mitre_dict = {'tactics': []}
# add tactics
for tact in tactics:
    tactic = {
        'name': tact.a.text,
        'href': tact.a['href'][1:],  # remove the first '/'
        'description': '',
        'techniques': []
    }

    # request for tactic
    tactic_url = mitre_url + tactic['href']
    tactic_html = get_request(tactic_url)
    if tactic_html is None:
        # get_request returns None on HTTP 404: keep the bare entry, move on.
        mitre_dict['tactics'].append(tactic)
        continue
    # NOTE: this rebinds `soup` from the landing page to the tactic page.
    soup = BeautifulSoup(tactic_html, 'html.parser')
    description = soup.find('div', attrs={'class': 'description-body'})
    if description is not None:
        tactic['description'] = description.text  # set 'description'

    # techniques
    techniques = soup.find_all('tr', attrs={'class': 'technique'})
    for tech in techniques:
        if len(tech['class']) > 1:
            # skip the ['sub', 'technique']
            continue
        tds = tech.find_all('td')
        print(tds[1].a['href'])
    mitre_dict['tactics'].append(tactic)


/techniques/T1595
/techniques/T1592
/techniques/T1589
/techniques/T1590
/techniques/T1591
/techniques/T1598
/techniques/T1597
/techniques/T1596
/techniques/T1593
/techniques/T1594
/techniques/T1650
/techniques/T1583
/techniques/T1586
/techniques/T1584
/techniques/T1587
/techniques/T1585
/techniques/T1588
/techniques/T1608
/techniques/T1659
/techniques/T1189
/techniques/T1190
/techniques/T1133
/techniques/T1200
/techniques/T1566
/techniques/T1091
/techniques/T1195
/techniques/T1199
/techniques/T1078
/techniques/T1651
/techniques/T1059
/techniques/T1609
/techniques/T1610
/techniques/T1203
/techniques/T1559
/techniques/T1106
/techniques/T1053
/techniques/T1648
/techniques/T1129
/techniques/T1072
/techniques/T1569
/techniques/T1204
/techniques/T1047
/techniques/T1098
/techniques/T1197
/techniques/T1547
/techniques/T1037
/techniques/T1176
/techniques/T1554
/techniques/T1136
/techniques/T1543
/techniques/T1546
/techniques/T1133
/techniques/T1574
/techniques/T1525
/techniques/T1556
/technique

In [49]:
# test for tactic: fetch the first tactic page, print its description and
# the link of every top-level technique it lists
tactic = tactics[0]
# mitre_url already ends with '/', so drop the href's leading '/' to avoid
# a '//' in the request path.
t_url = mitre_url + tactic.a['href'][1:]
tactic_html = get_request(t_url)
soup = BeautifulSoup(tactic_html, 'html.parser')

description = soup.find('div', attrs={'class': 'description-body'})
print(description.text)

# techniques
techniques = soup.find_all('tr', attrs={'class': 'technique'})
for tech in techniques:
    if len(tech['class']) > 1:
        # rows with more than one class value are sub-technique rows
        continue
    tds = tech.find_all('td')
    print(tds[1].a['href'])



The adversary is trying to gather information they can use to plan future operations.Reconnaissance consists of techniques that involve adversaries actively or passively gathering information that can be used to support targeting. Such information may include details of the victim organization, infrastructure, or staff/personnel. This information can be leveraged by the adversary to aid in other phases of the adversary lifecycle, such as using gathered information to plan and execute Initial Access, to scope and prioritize post-compromise objectives, or to drive and lead further Reconnaissance efforts.

/techniques/T1595
/techniques/T1592
/techniques/T1589
/techniques/T1590
/techniques/T1591
/techniques/T1598
/techniques/T1597
/techniques/T1596
/techniques/T1593
/techniques/T1594


In [6]:
import os
import json

# Reload the saved matrix, print the first character of every tactic
# description as a quick sanity check, strip surrounding whitespace from the
# descriptions and write the result back to the same file.

with open('./mitre.json', 'r') as f:
    mitre_dict = json.load(f)

for tact in mitre_dict['tactics']:
    # Slice rather than index: an empty description prints a blank line
    # instead of raising IndexError.
    print(tact['description'][:1])
    tact['description'] = tact['description'].strip()

with open('./mitre.json', 'w') as f:
    json.dump(mitre_dict, f)

T
T
T
T
T
T
T
T
T
T
T
T
T
T
