In [3]:
import sys
print(sys.version)

3.12.3 (main, Sep 11 2024, 14:17:37) [GCC 13.2.0]


In [18]:
#
# Load and extract the 327k bucket names downloaded from Grayhat Warfare
#
# The provided S3 names contain complete AWS domain information as well, which will be one of the following formats:
# 1) [bucket_name].s3.amazonaws.com
# 2) [bucket_name].s3-[aws_region].amazonaws.com
# 3) [bucket_name].s3.[aws_region].amazonaws.com
# 
#
import json, re

def extract_bucket_name(bucket_url):
    # Define regex pattern to match and capture the bucket name
    pattern = r"^(.+)?\.s3(?:[\.-])(.+)?amazonaws.com(?:\.[a-z]{2})?$"
    
    # Search for the pattern in the given URL
    match = re.match(pattern, bucket_url)
    
    # Return the bucket name if found, otherwise None
    return match.group(1) if match else None


with open('buckets.json', 'r') as f:
    gh_data = json.load(f)

bucket_names = [x['bucket'] for x in gh_data]
bucket_names = [extract_bucket_name(x) for x in bucket_names]
bucket_names = [b for b in bucket_names if b is not None]

# Save point, all bucket names minus the AWS domains
with open('parsed_buckets.json', 'w') as f:
    json.dump(bucket_names, f, indent=2)

In [104]:
#
# Let's attempt to tokenize
#
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

##
## Due to weird behavior, I ended up downloading *all* corpora and models to get the
## punkt tokenizer operational.
##
#nltk.download()

import json

with open('parsed_buckets.json') as f:
    buckets = json.load(f)

# Start with a simple Regex based tokenizer for the light work.
custom_tokenizer = RegexpTokenizer(r'\d+|[a-zA-Z]+')
tokenized_buckets = []
for b in buckets:
   tokenized_buckets.append(custom_tokenizer.tokenize(b.lower()))

# Move onto a dictionary based tokenizer
from wordfreq import top_n_list

# Approx 235k English words, we need to remove single letters
english_words = top_n_list("en", 3000)
english_words = set([word for word in english_words if len(word) > 2])

def dictionary_split(input_word) -> list:
    # print(f"Testing {input_word}")
    splits = []
    for i in range(1, len(input_word)):
        prefix, suffix = input_word[:i], input_word[i:]    
        if prefix in english_words:
            # We've matched a word based on the prefix. Recurse for the suffix, and
            # add the combination as a contender
            current_split = [prefix]
            current_split.extend(dictionary_split(suffix))
            splits.append(current_split)            
        if suffix in english_words:
            # We've matched a word based on the suffix. Recurse for the suffix, and
            # add the combination as a contender
            current_split = dictionary_split(prefix)
            current_split.append(suffix)
            splits.append(current_split)
            
    # After iteration, if splits is empty then return the full word.  Otherwise return the 
    # Split with the least number of words
    if len(splits) == 0:
        return [input_word]
    else:
        min_splits = splits[0]
        for test_split in splits:
            if len(test_split) < len(min_splits):
                min_splits = test_split
        return min_splits    
    
for bucket_tokens in tokenized_buckets[:20]:
    local_split = []
    for token in bucket_tokens:
        local_split.extend(dictionary_split(token))
    print(local_split)
    
# NOTE: Bert broke words based on tokens with no regard for the english dictionary
# Move on to a BERT based tokenizer to split words such as "websitecontent" and "siliconvalley"
# from transformers import BertTokenizer
# import torch
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# for bucket_tokens in tokenized_buckets[30:50]:
#     for token in bucket_tokens:
#         print(f"Input: {token}, output: {tokenizer.tokenize(token)}")
# print(tokenized_buckets[50:100])




['static', 'test']
['sha', 'red', 'public']
['as', 'sets', 'sha', 'red']
['screen', 'shots', 'test']
['images', 'test']
['cu', 'website']
['media', 'sha', 'red']
['sha', 'red', 'common']
['sf', 'as', 'sets']
['cu', 'test']
['re', 'sources', 'dev']
['dev', 'web']
['devbuilds']
['tempdev']
['sfuploads']
['dev', 'app']
['sf', 'img']
['media', 'sign', 'age']
['net', 'dev']
['sign', 'age', 'staging']


In [101]:
from wordfreq import top_n_list

# Approx 235k English words, we need to remove single letters
english_words = top_n_list("en", 3000)
english_words = set([word for word in english_words if len(word) > 1])

# print("\n".join(english_words))

inputs = [
    "compoundword",
    "myscreenshots",
    "screenshotlibrary",
    "asdfbasdfdsaf",
    "mygarbageghggfgfgf"
]

def dictionary_split(input_word) -> list:
    # print(f"Testing {input_word}")
    splits = []
    for i in range(1, len(input_word)):
        prefix, suffix = input_word[:i], input_word[i:]    
        if prefix in english_words:
            # We've matched a word based on the prefix. Recurse for the suffix, and
            # add the combination as a contender
            current_split = [prefix]
            current_split.extend(dictionary_split(suffix))
            splits.append(current_split)            
        if suffix in english_words:
            # We've matched a word based on the suffix. Recurse for the suffix, and
            # add the combination as a contender
            current_split = dictionary_split(prefix)
            current_split.append(suffix)
            splits.append(current_split)
            
    # After iteration, if splits is empty then return the full word.  Otherwise return the 
    # Split with the least number of words
    if len(splits) == 0:
        return [input_word]
    else:
        min_splits = splits[0]
        for test_split in splits:
            if len(test_split) < len(min_splits):
                min_splits = test_split
        return min_splits    

for input in inputs:
    print(f"Final: {dictionary_split(input)}")

Final: ['co', 'mpound', 'word']
Final: ['my', 'screen', 'shots']
Final: ['screen', 'shot', 'library']
Final: ['as', 'dfbasdfdsaf']
Final: ['my', 'garbageghggfgfgf']
