# Reports analysis

Understand what should be taken from the reports.

In [246]:
# Test with some samples
from IPython.display import display
from lxml import etree
import gzip
import json

data_dir = 'data/analyses_gz/'
samples = ['MjhkYjM1ZjY1YWNkNDQ1Y2FiYmM1M2MwNTI0OGEzYjA',
           'MTRiZjc0YTA2MmYwNDA2NDk4MDA1YzU2NzJkY2ZkYjc',
           'MzIwZDgzMjY0YjQ5NGQxMjhkZjk1YjE0YTlkNGQ1OTE',
           'MzMxZjlkMDljZDA3NDViMThmNzMwOWYwYWNhMGY1MmY',
           'YjYxNGFhZTNlZDFkNGI5Yjk1NmEzM2ZlM2EwZWU0YmQ',
           'MzMxZjlkMDljZDA3NDViMThmNzMwOWYwYWNhMGY1MmY',
           'YTU5ZmNiNzM0OTgyNDQxOTgyMGU5N2MwMGFiYjQzMmI',
           'MzYwNjVjZjA4ODgzNGUyOGJiMDMzZWRjNTFlNTcyMDg',
           'Mzg3NTIyMTAzZWVmNDQ3OTgzZTM3MDE5NGJlNzQwNzA',
           'MzExMzhlYzIyMjZlNGI5ZGE1M2E5MTU3NzdkZTNmOTk',
           'NWExYTBjZGVjMzQzNDBlYzg2MjBkM2NjODlhYTcwZWI',
           'Zjk1YzcxNDUzNGFiNDliZWFiOWM3NTE3NDE2YThlM2I',
           'MTkwZTNmNDRlNWU4NDVlZDllNzY1NWQ0NWE1NjQ3OTk',
           'M2Q0OWI2ODJmMTA3NDU5Y2E2MDM3OWNmODlkOWE3OTI',
           'OTY0ODgzZGQyNGVmNGEwNTgxMjlkYTA5MzcwMjBlYjI',
           'ODg3MzI1MzVkMDBmNGMyNTk0ZTgxM2I1MzMzODMxNGQ',
           'NjEwYzNmZDRmYTBiNGQ2NDhmOTcyMjYzY2I3NDdhZWM',
           'YjY2YWMxYTdkN2IwNDc5N2E4MDc4YzFlMjM5YzRlNzE',]


def extract_pe_imphash(doc):
    """
        Returns the PE imphash or empty
    """
    # imphash is always the first on the list
    return list(map(lambda x: x.strip(), doc.xpath('//section[@id="static_analysis"]/div/div[@class="well"]/text()'))) or None

def extract_signatures(doc):
    """
        Returns the matched signatures or empty
    """
    # Lambda to remove the #
    return list(map(lambda x: x[1:], doc.xpath('//section[@id="signatures"]/a/@href'))) or None

def extract_hosts(doc):
    """
        Returns the hosts or empty
    """
    # Lambda to remove whitespaces
    return list(map(lambda x: x.strip(), doc.xpath('//div[@id="network_hosts_tab"]/section[@id="hosts"]/table/tr/td/text()'))) or None

def extract_domains(doc):
    """
        Returns the domains + host dict or empty
    """
    # Cannot use zip since some domains have no matching host
    domains = dict()
    for row in doc.xpath('//div[@id="network_domains_tab"]/section[@id="domains"]/table/tr[position()>1]'):
        host = row.xpath('td[2]/text()')
        if not host:
            host = ''
        else:
            host = host[0]
        domains[row.xpath('td[1]/text()')[0]] = host
    return domains or None

def extract_files(doc):
    """
        Returns the touched files or empty
    """
    # Lambda to remove whitespaces, filter to remove empty values
    return list(filter(None, map(lambda x: x.strip(), doc.xpath('//section[@id="summary"]//div[@id="summary_files"]/div/text()')))) or None

def extract_keys(doc):
    """
        Returns the touched registry keys or empty
    """
    # Lambda to remove whitespaces, filter to remove empty values
    return list(filter(None, map(lambda x: x.strip(), doc.xpath('//section[@id="summary"]//div[@id="summary_keys"]/div/text()')))) or None

def extract_mutexes(doc):
    """
        Returns the mutexes or empty
    """
    # Lambda to remove whitespaces, filter to remove empty values
    return list(filter(None, map(lambda x: x.strip(), doc.xpath('//section[@id="summary"]//div[@id="summary_mutexes"]/div/text()')))) or None

def extract_peversioninfo(doc):
    """
        Returns PE version info as a dict or empty
    """
    info = dict()
    for tr in doc.xpath('//section[@id="static_analysis"]//div[@id="pe_versioninfo"]/table/tr'):
        val = tr.xpath('td/span/text()')
        # Skip rows without value
        if not val:
            continue
        info[tr.xpath('th/text()')[0]] = tr.xpath('td/span/text()')[0]
    return info or None

def extract_pesections(doc):
    """
        Returns PE sections as a dict or empty
    """
    sections = []
    # Use the table header as keys
    headers = map(lambda x: x.lower(),
                  doc.xpath('//section[@id="static_analysis"]//div[@id="pe_sections"]/table/tr[1]/th/text()'))
    for tr in doc.xpath('//section[@id="static_analysis"]//div[@id="pe_sections"]/table/tr[position()>1]'):
        sections.append(dict(zip(headers, tr.xpath('td/text()'))))
    return sections or None

def extract_peresources(doc):
    """
        Returns PE resources as a dict or empty
    """
    sections = []
    headers = map(lambda x: x.lower(),
                  doc.xpath('//section[@id="static_analysis"]//div[@id="pe_resources"]/table/tr[1]/th/text()'))
    for tr in doc.xpath('//section[@id="static_analysis"]//div[@id="pe_resources"]/table/tr[position()>1]'):
        sections.append(dict(zip(headers, tr.xpath('td/text()'))))
    return sections or None

def extract_peimports(doc):
    """
        Returns PE imports as a dict or empty
    """
    imports = dict()
    # Imports from each dll are inside a div
    for well in doc.xpath('//section[@id="static_analysis"]//div[@id="pe_imports"]/div[@class="well"]'):
        dll = well.xpath('div[1]/strong/text()')[0].lower().replace('library ', '')
        functions = well.xpath('div[position()>1]/span/a/text()')
        imports[dll] = functions
    return imports or None

def extract_strings(doc):
    """
        Returns strings or empty
    """
    return doc.xpath('//section[@id="static_strings"]/div[@class="well"]/div/text()') or None
    
    
def extract_antivirus(doc):
    """
        Returns the antivirus as a dict or empty
    """
    av = doc.xpath('//section[@id="static_antivirus"]/table/tr[position()>1]/td[1]/text()')
    clss = doc.xpath('//section[@id="static_antivirus"]/table/tr[position()>1]/td[2]/span/text()')
    return dict(zip(av, clss)) or None

def extract_dynamic(doc):
    """
        Returns the dynamic calls as a dict or empty
    """
    data = doc.xpath('//div[@id="behavior"]//script[@type="text/javascript" and contains(., "graph_raw_data")]/text()')
    # If there's no match
    if not data:
        return None
    data = data[0].strip().replace('var graph_raw_data = ', '')[:-1]
    return json.loads(data)

def extract_http(doc):
    """
        Returns the HTTP requests as dict or empty
    """
    http = []
    for row in doc.xpath('//div[@id="network_http_tab"]/table/tr[position()>1]'):
        http.append({row.xpath('td[1]/text()')[0]: row.xpath('td[2]/pre/text()')[0]})
    return http or None

def extract_irc(doc):
    """
        Returns the IRC traffic or empty
    """
    return list(map(lambda x: x.strip(), doc.xpath('//div[@id="network_irc_tab"]/pre/text()'))) or None

def extract_smtp(doc):
    """
        Returns pair with number of SMTP requests and example or empty
    """
    example = doc.xpath('//div[@id="network_smtp_tab"]/pre/text()')
    if example:
        example = example[0].strip()
        number = doc.xpath('//div[@id="network_smtp_tab"]/p/text()')[0].split(': ')[1]
        return (number, example)
    return None

def extract_dropped(doc):
    """
        Returns list of dropped files or empty
    """
    files = []
    for drop in doc.xpath('//div[@id="dropped"]//table'):
        keys = map(lambda x: x.lower(), drop.xpath('tr/th/text()'))
        values = map(lambda x: x.strip(), drop.xpath('tr/td/text()|tr/td/b/text()'))
        files.append(dict(zip(keys, values)))
    return files or None


for sample in samples:
    with gzip.open(data_dir + sample) as gz_file:
        content = gz_file.read()

    doc = etree.HTML(content)
    h4 = set()
    th = set()
    for h in doc.xpath('//h4/text()'):
        h4.add(h)
    for t in doc.xpath('//th/text()'):
        th.add(t)
        
    # print(extract_pe_imphash(doc))
    # print(extract_signatures(doc))
    # print(extract_hosts(doc))
    # print(extract_domains(doc))
    # print(extract_files(doc))
    # print(extract_keys(doc))
    # print(extract_mutexes(doc))
    # print(extract_peversioninfo(doc))
    # print(extract_pesections(doc))
    # print(extract_peresources(doc))
    print(extract_peimports(doc))
    # print(extract_strings(doc))
    # print(extract_antivirus(doc))
    # print(extract_dynamic(doc))
    # print(extract_http(doc))
    # print(extract_irc(doc))
    # print(extract_smtp(doc))
    # print(extract_dropped(doc))

display(h4)
display(th)
# Find a sample with IRC traffic
# from os.path import isfile, join
# from os import listdir
# files = [f for f in listdir('data/analyses_gz/') if isfile(join('data/analyses_gz/', f))]
# counter = 0
# for f in files:
#     with gzip.open('data/analyses_gz/' + f) as html:
#         content = html.read().decode('utf8')
#     if 'No SMTP requests performed.' not in content:
#         counter += 1
#         print(f)
#         if counter == 10:
#             break

{'kernel32.dll': ['GetModuleHandleA', 'LoadLibraryA', 'GetProcAddress', 'ExitProcess', 'VirtualAlloc', 'VirtualFree'], 'user32.dll': ['MessageBoxA']}
{'shell32.dll': ['SHGetSpecialFolderPathW', 'SHGetFolderPathW', 'ShellAboutW', 'None', 'ShellExecuteExW'], 'shlwapi.dll': ['None'], 'gdiplus.dll': ['GdipDrawLineI', 'GdipDrawArcI', 'GdipFillRectangleI', 'GdipCloneBrush', 'GdipCloneImage', 'GdipCreateBitmapFromHBITMAP', 'GdipCreateFromHDC', 'GdipDrawImageRectI', 'GdipSetInterpolationMode', 'GdipSetPageUnit', 'GdipCreateSolidFill', 'GdipCreateBitmapFromScan0', 'GdipDisposeImage', 'GdipDeleteGraphics', 'GdipDeletePen', 'GdipCreatePen1', 'GdipDeleteBrush', 'GdipAlloc', 'GdipFree', 'GdiplusStartup', 'GdiplusShutdown', 'GdipGetImageGraphicsContext', 'GdipSetSmoothingMode', 'GdipCloneBitmapAreaI', 'GdipCreateHBITMAPFromBitmap'], 'advapi32.dll': ['RegEnumKeyExW', 'RegOpenKeyExW', 'RegQueryInfoKeyW', 'RegGetValueW', 'RegEnumValueW', 'RegDeleteKeyW', 'RegQueryValueExW', 'RegSetValueExW', 'OpenSCMan

{'Analysis',
 'Domains',
 'File Details',
 'HTTP Requests',
 'Hosts',
 'IRC Traffic',
 'Imports',
 'PE Imphash',
 'Resources',
 'SMTP Requests',
 'Screenshots',
 'Sections',
 'Signatures',
 'Summary'}

{'Antivirus',
 'CRC32',
 'Category',
 'Completed',
 'Data',
 'Domain',
 'Duration',
 'Entropy',
 'File Name',
 'File Size',
 'File Type',
 'File name',
 'File type',
 'IP',
 'Language',
 'MD5',
 'Name',
 'Offset',
 'SHA1',
 'SHA256',
 'SHA512',
 'Signature',
 'Size',
 'Size of Raw Data',
 'Ssdeep',
 'Started',
 'Sub-language',
 'URI',
 'Virtual Address',
 'Virtual Size',
 'Yara'}