# 1. Project Configuration

In [1]:
import tensorflow as tf

DATA_FOLDER_DIR = "resources/data/numeric_only/sample_12digit_pattern5" # this is the location of the raw dataset to process
TF_RECORD_OUTPUT_DIR = "resources/tf_records/numeric-only/tf_12digit_pattern5" # directory of the TFRecord files to load

MAX_SEQUENCE_LENGTH = 12 # This should be the max length of the password, 12 default
VOCAB = "0123456789" # This should be the vocabulary of the password, 0123456789 default
VOCAB_SIZE = len(VOCAB)

LIMIT_TF_RECORD_DATASET = True # For testing, might be good to limit the number of TFRecord files to use
TF_RECORD_DATASET_LIMIT = 1 # Number of TFRecord files to be used for training

EPOCHS = 1
BATCH_SIZE = 512
BUFFER_SIZE = 10000
PREFETCH_SIZE = tf.data.AUTOTUNE  # Automatically tune prefetch buffer size

NUM_SAMPLES = 500000  # This should be the number of samples in each dataset: 500k default
NUM_FILES = None  # This should be the number of files in your dataset

# Used to create a generator/discriminiator to verify passwords are being one_hot_encoded correctly
G_OPTIMIZE_LEARNING_RATE = 0.0001
D_OPTIMIZE_LEARNING_RATE = 0.00001

# Model monitor for storoing genreated passwords, not needed, can be taken out in future
FOLDER_PATH = "resources/generated_passwords"

In [2]:
import os

# Check if the current working directory is notebooks, if so change to the root directory
current_dir = os.getcwd()

if current_dir.split("/")[-1] == "notebooks":
    print (f"Changing current working directory: {os.getcwd()}\n")
    os.chdir('../')

# 2. Create TensorFlow Records

In [3]:
import os

# Get all of the files in the folder
def get_all_files_in_folder(folder_path):
    """Return a list of all files in the specified folder."""

    # Get the current working directory
    current_directory = os.getcwd()

    # Print the current working directory
    print(f"Current Directory: {current_directory}")
    print(f"Getting all files in folder: {folder_path}")

    # List all items in the folder
    all_items = os.listdir(folder_path)
    
    return [item for item in all_items if os.path.isfile(os.path.join(folder_path, item))]

In [4]:
import re

# Function to preprocess passwords
def preprocess_password(password):
    # Remove non-numeric characters
    password = re.sub(r'\D', '', password)
    # Ensure password is 12 characters long
    return password if len(password) == 12 else None


In [5]:
def one_hot_encode(password):
    # Define the one-hot encoding for each digit
    encoding = []
    for char in password:
        one_hot = [0]*10
        one_hot[int(char)] = 1
        encoding.extend(one_hot)
    return encoding

In [6]:
import os

# Get all of the password files in the folder
password_files = get_all_files_in_folder(DATA_FOLDER_DIR)
print(f"\nNumber of password files: {len(password_files)}")

print(f"Checking if TFRecord output directory exists: {TF_RECORD_OUTPUT_DIR}")
if not os.path.exists(TF_RECORD_OUTPUT_DIR):
    print(f"Created TFRecord output directory: {TF_RECORD_OUTPUT_DIR}")
    os.makedirs(TF_RECORD_OUTPUT_DIR)

# Loop through each password file
print(f"Looping through each password file in: {DATA_FOLDER_DIR}")
for count, file_name in enumerate(password_files):
    print(f"  - Processing file: {count+1} of {len(password_files)} - {file_name}")
    
    # Reading and preprocessing passwords
    passwords = []
     
    password_file = os.path.join(DATA_FOLDER_DIR, file_name)
    with open(password_file, 'r') as file:
        for line in file:
            processed = preprocess_password(line.strip())
            if processed:
                passwords.append(processed)
            else:
                print(f"***** Invalid password format: {line.strip()} *****")

    # Creating a TFRecord file
    tf_record_file = os.path.join(TF_RECORD_OUTPUT_DIR, os.path.splitext(file_name)[0] + ".tfrecord")

    with tf.io.TFRecordWriter(tf_record_file) as writer:
        
        for password in passwords:
            encoded_password = one_hot_encode(password) # One-hot encode the password
            
            # Create a tf.train.Example
            feature = {
                'password': tf.train.Feature(float_list=tf.train.FloatList(value=encoded_password))
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            
            writer.write(example.SerializeToString()) # Serialize and write the example

Current Directory: /Users/bambrick/DevCenter/Juypter/PasswordGAN/notebooks/demo
Getting all files in folder: resources/data/numeric_only/sample_12digit_pattern5


FileNotFoundError: [Errno 2] No such file or directory: 'resources/data/numeric_only/sample_12digit_pattern5'