In [3]:
!pip install tensorflow==2.6.0
!pip install tensorflow-datasets==4.4.0

import tensorflow as tf
import tensorflow_datasets as tfds

print("tensorflow version: " + tf.__version__)
# !pip freeze

Collecting tensorflow==2.6.0
  Downloading tensorflow-2.6.0-cp37-cp37m-manylinux2010_x86_64.whl (458.3 MB)
[K     |████████████████████████████████| 458.3 MB 2.4 kB/s 
[?25hCollecting clang~=5.0
  Downloading clang-5.0.tar.gz (30 kB)
Collecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting typing-extensions~=3.7.4
  Downloading typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)
Collecting wrapt~=1.12.1
  Downloading wrapt-1.12.1.tar.gz (27 kB)
Building wheels for collected packages: clang, wrapt
  Building wheel for clang (setup.py) ... [?25l[?25hdone
  Created wheel for clang: filename=clang-5.0-py3-none-any.whl size=30692 sha256=45d854390bede74c89e192a562426efdf613efb695e64018ac32384d1cdcb2fb
  Stored in directory: /root/.cache/pip/wheels/98/91/04/971b4c587cf47ae952b108949b46926f426c02832d120a082a
  Building wheel for wrapt (setup.py) ... [?25l[?25hdone
  Created wheel for wrapt: filename=wrapt-1.12.1-cp37-cp37m-linux_x86_64.whl size

In [4]:
# Ask TensorFlow which physical GPUs it can see; the bare name on the last line
# makes the notebook display the resulting list (an empty list means CPU only).
gpus = tf.config.list_physical_devices(device_type='GPU')
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# Enable on-demand GPU memory allocation. Nothing happens when no GPU was found.
if gpus:
    try:
        # The setting has to be identical on every GPU, so apply it to all of them.
        for physical_gpu in gpus:
            tf.config.experimental.set_memory_growth(physical_gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as error:
        # Raised when the GPUs were already initialised before this cell ran.
        print(error)

1 Physical GPUs, 1 Logical GPUs


In [6]:
class GANConfig:
    """Central place for every tunable value used by the notebook.

    All members are plain class attributes, so they are read as
    ``GANConfig.NAME`` without instantiating the class.
    """

    DATASET_NAME = "rock_you"
    NOISE_INPUT_SIZE = 128  # noise input size
    BACH_SIZE = 128  # (sic) misspelled name kept because other cells reference it
    BATCH_SIZE = BACH_SIZE  # correctly spelled alias for new code
    EPOCHS = 10
    LAYER_DIM = 128
    GRADIENT_PENALTY = 10
    OUTPUT_SEQ_LENGTH = 10
    DISC_ITERATIONS_PER_GEN_ITERATIONS = 10  # How many discriminator iterations per generator iteration

    INITIAL_TRAIN_SIZE = 1000  # Train size for starting training in local environment
    INITIAL_TRAINING = True  # specify loading initial training or 2.5M passwords for actual training

    # Adam optimizer's hyper-parameters
    LEARNING_RATE = 0.0001
    BETA_1 = 0.5
    BETA_2 = 0.9

    # Output locations read by the training code. They were referenced but never
    # defined, which made training fail with AttributeError. The directories must
    # exist before files are written into them (e.g. os.makedirs(path, exist_ok=True)).
    PROBABILITY_DIR = "probabilities"  # generator soft-max dumps (.npy)
    LOSSES_DIR = "losses"  # per-epoch loss logs (.txt)
    GENERATED_DIR = "generated"  # decoded generated passwords (.txt)


In [7]:
# Size of the "real" training subset; the config comment on INITIAL_TRAINING
# mentions 2.5M passwords for actual training.
FULL_TRAIN_SIZE = 2_500_000

# Honour the config switches instead of a hard-coded 'train[:1000]'.
train_size = GANConfig.INITIAL_TRAIN_SIZE if GANConfig.INITIAL_TRAINING else FULL_TRAIN_SIZE
ds, ds_info = tfds.load(name=GANConfig.DATASET_NAME, split=f'train[:{train_size}]', with_info=True)

print(ds_info)
print(f"dataset {GANConfig.DATASET_NAME} loaded train size: {len(ds)}")

# Keep only the raw password bytes of every example.
ds_train = ds.map(lambda example: example['password'])
print("10 passwords from train set: -------------------- ")

for data in ds_train.take(10):
    # errors="replace" so that a single badly encoded sample cannot abort the preview
    print(data.numpy().decode("utf-8", errors="replace"))

# Keep passwords of at most OUTPUT_SEQ_LENGTH characters and right-pad them with
# spaces so that every training sample has exactly that length.
max_length = GANConfig.OUTPUT_SEQ_LENGTH
dataset = []
vocabulary = set(" ")  # the padding character is always part of the vocabulary
skipped_undecodable = 0
for data in ds_train:
    try:
        word: str = data.numpy().decode("utf-8")
    except UnicodeDecodeError:
        # Only decoding problems are expected here; anything else should surface.
        skipped_undecodable += 1
        continue
    if len(word) <= max_length:
        dataset.append(word.ljust(max_length))
        vocabulary |= set(word)

if skipped_undecodable:
    print(f"skipped {skipped_undecodable} passwords that are not valid utf-8")

# Sort before enumerating: iteration order of a set of strings changes between
# interpreter runs, which would silently change the character <-> id mapping.
char2id = {char: index for index, char in enumerate(sorted(vocabulary))}
ds = tf.data.Dataset.from_tensor_slices(dataset)

# Batch (dropping the incomplete last batch) and cache the dataset for future use.
ds = ds.batch(GANConfig.BACH_SIZE, drop_remainder=True)
ds = ds.cache()

print(f"train dataset size with passwords lengh <= {max_length} characters: {len(ds) * GANConfig.BACH_SIZE}")
print(f"vocabulary size: {len(char2id)}")


[1mDownloading and preparing dataset 133.44 MiB (download: 133.44 MiB, generated: 393.36 MiB, total: 526.80 MiB) to /root/tensorflow_datasets/rock_you/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/14344391 [00:00<?, ? examples/s]

Shuffling rock_you-train.tfrecord...:   0%|          | 0/14344391 [00:00<?, ? examples/s]

[1mDataset rock_you downloaded and prepared to /root/tensorflow_datasets/rock_you/1.0.0. Subsequent calls will reuse this data.[0m
tfds.core.DatasetInfo(
    name='rock_you',
    full_name='rock_you/1.0.0',
    description="""
    This dataset contains 14,344,391 passwords that were leaked or stolen from
    various sites. The author of this dataset states that "I'm hosting them because
    it seems like nobody else does (hopefully it isn't because hosting them is
    illegal :)). Naturally, I'm not the one who stole these; I simply found them
    online, removed any names/email addresses/etc.".
    
    This dataset is used to train Machine Learning models for password guessing
    and cracking.
    """,
    homepage='https://wiki.skullsecurity.org/Passwords',
    data_path='/root/tensorflow_datasets/rock_you/1.0.0',
    download_size=133.44 MiB,
    dataset_size=393.36 MiB,
    features=FeaturesDict({
        'password': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=

In [8]:
!pip install keras==2.6.0
class GANLoss:
    """Binary cross-entropy losses for the generator and the discriminator."""

    def __init__(self):
        # The discriminator in this notebook ends in a Dense layer with a linear
        # activation, i.e. it emits unbounded logits rather than probabilities.
        # from_logits=True makes the loss apply the sigmoid itself; without it the
        # raw values are clipped into (0, 1) and treated as probabilities.
        self.cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    def discriminator_loss(self, real_output, fake_output):
        """Loss that rewards scoring real samples as 1 and generated samples as 0.

        :param real_output: discriminator logits for real passwords
        :param fake_output: discriminator logits for generated passwords
        :return: scalar tensor, sum of the real and the fake cross-entropy terms
        """
        real_loss = self.cross_entropy(tf.ones_like(real_output), real_output)
        fake_loss = self.cross_entropy(tf.zeros_like(fake_output), fake_output)
        total_loss = real_loss + fake_loss
        return total_loss

    def generator_loss(self, fake_output):
        """Loss that rewards the generator when its samples are scored as real (1).

        :param fake_output: discriminator logits for generated passwords
        :return: scalar cross-entropy tensor
        """
        return self.cross_entropy(tf.ones_like(fake_output), fake_output)

class ResidualBlock(tf.keras.Model):
    """Residual block: ReLU -> Conv1D -> ReLU -> Conv1D plus a scaled skip connection.

    :param dim: number of filters of both convolutions. The channel (last) axis of
        the input must have the same size so that the skip connection can be added.
    """

    def __init__(self, dim):
        super().__init__()
        self.res_block = tf.keras.Sequential([
            # Keras' ReLU takes `max_value` as first argument (not an "in-place"
            # flag), so ReLU(True) capped activations at 1. Use the plain ReLU.
            tf.keras.layers.ReLU(),
            # Keras' signature is Conv1D(filters, kernel_size, strides, ...).
            # The former Conv1D(dim, dim, 5) therefore meant kernel_size=dim and
            # strides=5, which shortens the sequence and only "worked" through
            # broadcasting in the residual sum. Intended: `dim` filters, kernel 5.
            tf.keras.layers.Conv1D(filters=dim, kernel_size=5, padding='same'),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Conv1D(filters=dim, kernel_size=5, padding='same'),
        ])

    def call(self, input_data, **kwargs):
        """Return ``input_data + 0.3 * res_block(input_data)``.

        With stride 1 and 'same' padding the convolution output keeps the length
        of ``input_data``, so the sum is element-wise.
        """
        output = self.res_block(input_data)
        return input_data + (0.3 * output)


class GeneratorNetwork(tf.keras.Model):
    """Generator: Dense -> reshape -> 5 residual blocks -> 1x1 Conv1D -> transpose -> soft-max.

    NOTE(review): the reshape below uses a literal 2 instead of ``pass_length``,
    ``tf.transpose`` is called without ``perm`` (which reverses ALL axes, batch
    included) and the final convolution emits ``dim`` channels, unrelated to any
    vocabulary size. The training code reshapes the result purely by element
    count, so these shapes cannot be changed here in isolation - confirm the
    intended architecture before relying on the output layout.
    """

    def __init__(self, dim, pass_length):
        super().__init__()

        self.dim = dim
        self.pass_length = pass_length

        # Container for the residual stack; it is populated a few lines below.
        self.generator_res_block_model = tf.keras.Sequential()

        # Linear projection of the last input axis onto `pass_length` units.
        self.first_linear_layer = tf.keras.layers.Dense(
            pass_length, activation='linear', input_shape=[dim * pass_length, ])

        # Five residual blocks chained one after another.
        for _ in range(5):
            self.generator_res_block_model.add(ResidualBlock(dim=dim))

        # Point-wise (kernel size 1) convolution with `dim` filters.
        self.conv_1d_layer = tf.keras.layers.Conv1D(dim, 1, padding='valid')

        # Soft-max normalising over axis 1 of whatever tensor it receives.
        self.softmax_layer = tf.keras.layers.Softmax(axis=1)

    def call(self, input_noise, **kwargs):
        """Run the generator on a noise tensor.

        :param input_noise: noise tensor; only its last axis is consumed by the
            first Dense layer
        :param kwargs: accepted for Keras compatibility, not used
        :return: soft-max output of the network (see the class-level review note
            regarding its axis layout)
        """
        hidden = self.first_linear_layer(input_noise)

        # Regroup everything into rows of length 2 with `dim` channels.
        hidden = tf.reshape(hidden, [-1, 2, self.dim])

        hidden = self.generator_res_block_model(hidden)
        hidden = self.conv_1d_layer(hidden)

        # Without `perm` this reverses the order of all axes.
        hidden = tf.transpose(hidden)

        return self.softmax_layer(hidden)


class DiscriminatorNetwork(tf.keras.Model):
    """Discriminator: transpose -> 1x1 Conv1D -> 5 residual blocks -> reshape -> Dense.

    NOTE(review): ``tf.transpose`` without ``perm`` reverses all axes (batch
    included), the reshape target ``(-1, 64, 4)`` is hard-coded and the final
    Dense layer emits ``dim`` values per position rather than a single score -
    confirm the intended architecture. The layer applies no activation, so the
    returned values are unbounded.
    """

    def __init__(self, dim, pass_length):
        super().__init__()
        self.dim = dim
        self.pass_length = pass_length

        # Stack of five residual blocks applied after the point-wise convolution.
        self.block = tf.keras.Sequential([ResidualBlock(dim=dim) for _ in range(5)])
        # Point-wise (kernel size 1) convolution with `dim` filters.
        self.conv1d = tf.keras.layers.Conv1D(dim, 1, padding='valid')
        # Final linear projection acting on the last axis.
        self.linear = tf.keras.layers.Dense(
            dim, activation='linear', input_shape=(dim * pass_length, ))

    def call(self, input_data, **kwargs):
        """Score a batch of (one-hot or soft-max) password tensors.

        :param input_data: tensor to score
        :param kwargs: accepted for Keras compatibility, not used
        :return: output of the final linear layer
        """
        # Without `perm` this reverses the order of all axes.
        features = tf.transpose(input_data)
        features = self.conv1d(features)
        features = self.block(features)
        features = tf.reshape(features, (-1, 64, 4))
        return self.linear(features)

class GANOpt:
    """Factory for the Adam optimizers of both networks.

    Every call returns a NEW optimizer instance, so generator and discriminator
    never share optimizer state. All hyper-parameters come from ``GANConfig``.
    """

    def __init__(self):
        pass

    @staticmethod
    def _build_adam():
        """Create an Adam optimizer from the values configured in GANConfig."""
        # LEARNING_RATE replaces a hard-coded 1e-4 that duplicated the config value.
        return tf.keras.optimizers.Adam(
            GANConfig.LEARNING_RATE, beta_1=GANConfig.BETA_1, beta_2=GANConfig.BETA_2)

    def get_generator_opt(self):
        """Return a fresh Adam optimizer for the generator."""
        return self._build_adam()

    def get_discriminator_opt(self):
        """Return a fresh Adam optimizer for the discriminator."""
        return self._build_adam()

    def get_discriminator(self):
        """Backward-compatible name of :meth:`get_discriminator_opt`."""
        return self.get_discriminator_opt()


Collecting keras==2.6.0
  Downloading keras-2.6.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.4 MB/s 
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.7.0
    Uninstalling keras-2.7.0:
      Successfully uninstalled keras-2.7.0
Successfully installed keras-2.6.0


In [12]:
import datetime
import string
import time
from math import log

import tensorflow as tf
from keras.layers import TextVectorization
from keras.utils.np_utils import to_categorical

import numpy as np
from numpy import save


class TrainGAN:
    """Adversarial training loop for the password generator / discriminator pair.

    Each batch triggers ``GANConfig.DISC_ITERATIONS_PER_GEN_ITERATIONS``
    discriminator updates followed by one generator update. Loss logs, generator
    probability dumps and decoded sample passwords are written to the directories
    named by ``GANConfig.LOSSES_DIR``, ``PROBABILITY_DIR`` and ``GENERATED_DIR``
    (falling back to "losses", "probabilities" and "generated" when the config
    does not define them); missing directories are created.
    """

    def __init__(self):
        self.generator = GeneratorNetwork(dim=GANConfig.LAYER_DIM, pass_length=GANConfig.OUTPUT_SEQ_LENGTH)
        self.discriminator = DiscriminatorNetwork(dim=GANConfig.LAYER_DIM, pass_length=GANConfig.OUTPUT_SEQ_LENGTH)
        self.gan_loss = GANLoss()
        self.gan_opt = GANOpt()
        self.generator_opt: tf.keras.optimizers.Adam = self.gan_opt.get_generator_opt()
        # The discriminator optimizer used to be requested from get_generator_opt().
        self.discriminator_opt: tf.keras.optimizers.Adam = self.gan_opt.get_discriminator()
        self.vocab_size = None  # set by train()
        self.char2id = None  # set by train()

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _output_dir(config_attr, default):
        """Return the directory configured under ``GANConfig.<config_attr>``, creating it if needed."""
        import os  # local import: keeps this class independent of the notebook's import cell

        directory = getattr(GANConfig, config_attr, default)
        os.makedirs(directory, exist_ok=True)
        return directory

    def _encode_passwords(self, passwords):
        """One-hot encode a batch of fixed-length byte-string passwords.

        :param passwords: eager string tensor holding one password per entry
        :return: float32 tensor of shape (batch, password length, vocab_size)
        :raises KeyError: if a character is missing from ``char2id``
        """
        char_ids = [[self.char2id[char] for char in raw.decode('utf-8')] for raw in passwords.numpy()]
        return tf.one_hot(char_ids, depth=self.vocab_size, dtype=tf.float32)

    def _sample_noise(self):
        """Draw a fresh uniform [0, 1) noise tensor for the generator."""
        return tf.random.uniform(
            shape=[GANConfig.NOISE_INPUT_SIZE, self.vocab_size, GANConfig.OUTPUT_SEQ_LENGTH],
            minval=0, maxval=1, dtype=tf.float32)

    def _generate(self, noise):
        """Run the generator; return its raw output and the same data reshaped to
        (NOISE_INPUT_SIZE, OUTPUT_SEQ_LENGTH, vocab_size) as fed to the discriminator."""
        raw_output = self.generator(noise)
        reshaped = tf.reshape(
            raw_output, [GANConfig.NOISE_INPUT_SIZE, GANConfig.OUTPUT_SEQ_LENGTH, self.vocab_size])
        return raw_output, reshaped

    def _discriminator_step(self, real_batch):
        """Perform one discriminator update and return its loss."""
        # Generated samples are produced outside the tape: the generator is not
        # being trained in this step.
        _, fake_batch = self._generate(self._sample_noise())
        with tf.GradientTape() as disc_tape:
            real_output = self.discriminator(real_batch)
            fake_output = self.discriminator(fake_batch)
            disc_loss = self.gan_loss.discriminator_loss(real_output, fake_output)
        # Gradients are taken after leaving the tape context so that the
        # gradient computation itself is not recorded.
        variables = self.discriminator.trainable_variables
        gradients = disc_tape.gradient(disc_loss, variables)
        self.discriminator_opt.apply_gradients(zip(gradients, variables))
        return disc_loss

    def _generator_step(self):
        """Perform one generator update.

        :return: (generator loss, raw generator output, output reshaped for the discriminator)
        """
        with tf.GradientTape() as gen_tape:
            raw_output, fake_batch = self._generate(self._sample_noise())
            fake_output = self.discriminator(fake_batch)
            gen_loss = self.gan_loss.generator_loss(fake_output)
        variables = self.generator.trainable_variables
        gradients = gen_tape.gradient(gen_loss, variables)
        self.generator_opt.apply_gradients(zip(gradients, variables))
        return gen_loss, raw_output, fake_batch

    # ----------------------------------------------------------------- training
    # @tf.function
    def train_step(self, passwords, epoch):
        """Train both networks on one batch of real passwords.

        :param passwords: eager string tensor with one batch of padded passwords
        :param epoch: current epoch index; every 100th epoch the raw generator
            output is dumped to ``<PROBABILITY_DIR>/epoch_<epoch>_prod.npy``
        :return: (generator loss, last discriminator loss, integer array of shape
            (NOISE_INPUT_SIZE, OUTPUT_SEQ_LENGTH) with the arg-max character id
            of every generated position)
        """
        # The encoding does not change between discriminator iterations: do it once.
        real_batch = self._encode_passwords(passwords)

        # Several discriminator updates per generator update. Previously this loop
        # merely repeated the identical forward pass without any weight update.
        disc_loss = None
        for _ in range(max(1, GANConfig.DISC_ITERATIONS_PER_GEN_ITERATIONS)):
            disc_loss = self._discriminator_step(real_batch)

        gen_loss, raw_output, fake_batch = self._generator_step()

        if epoch % 100 == 0:
            probability_dir = self._output_dir("PROBABILITY_DIR", "probabilities")
            np.save(f"{probability_dir}/epoch_{epoch}_prod.npy", raw_output.numpy())

        generated_argmax = np.argmax(fake_batch.numpy(), axis=-1)
        return gen_loss, disc_loss, generated_argmax

    def train(self, dataset, char2id, epochs):
        """Run the full training loop.

        :param dataset: iterable of batches accepted by :meth:`train_step`
        :param char2id: mapping character -> integer id used for one-hot encoding
        :param epochs: number of passes over ``dataset``
        """
        self.char2id = char2id
        self.vocab_size = len(char2id)

        # Seed ONCE for reproducibility. Re-seeding inside every step (as before)
        # resets TensorFlow's random state and yields the same noise each time.
        tf.random.set_seed(5)

        generated = None
        start = time.time()

        for epoch in range(epochs):
            epoch_start = time.time()

            start_time_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
            print(f"epoch {epoch} started at {start_time_str}")

            gen_loss_list = []
            disc_loss_list = []
            for batch in dataset:
                gen_loss, disc_loss, generated = self.train_step(batch, epoch)
                # Store plain floats so that the averages log as numbers.
                gen_loss_list.append(float(gen_loss))
                disc_loss_list.append(float(disc_loss))

            if not gen_loss_list:
                # Avoids a ZeroDivisionError below and pointless further epochs.
                print(f"epoch {epoch}: the dataset yielded no batches, stopping training")
                break

            g_loss = sum(gen_loss_list) / len(gen_loss_list)
            d_loss = sum(disc_loss_list) / len(disc_loss_list)

            epoch_elapsed = time.time() - epoch_start
            print(f"saving losses and generated data for epoch {epoch}")
            losses_string = f'Epoch {epoch}, gen loss={g_loss},disc loss={d_loss}, {self.hms_string(epoch_elapsed)}'
            self.save_losses(file_name=f"epoch_{epoch}_losses", losses_string=losses_string)

            # Every 10th epoch decode the last generated batch and store it as text.
            current_time_str = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            if epoch % 10 == 0:
                self.save_generated_passwords(generated, f"generated-password_epoch-{str(epoch)}_{current_time_str}")

        elapsed = time.time() - start
        print(f'Training time: {elapsed}')

    @staticmethod
    def hms_string(sec_elapsed):
        """Format a duration in seconds as ``H:MM:SS.ss``."""
        h = int(sec_elapsed / (60 * 60))
        m = int((sec_elapsed % (60 * 60)) / 60)
        s = sec_elapsed % 60
        return "{}:{:>02}:{:>05.2f}".format(h, m, s)

    def _get_vocabulary(self):
        """Return the character -> id mapping handed to :meth:`train`."""
        return self.char2id

    def _convert_password_float_vector_to_string(self, generated_password_vector, id2char=None):
        """Decode a sequence of character ids into a password string.

        :param generated_password_vector: iterable of integer character ids
        :param id2char: optional pre-built id -> character mapping; built from
            ``self.char2id`` when omitted (pass it when decoding many vectors)
        :return: decoded string; ids without a character become a space
        """
        if id2char is None:
            id2char = {index: char for char, index in self.char2id.items()}
        return ''.join(str(id2char.get(char_id) or " ") for char_id in generated_password_vector)

    def save_losses(self, file_name, losses_string):
        """Append ``losses_string`` as one line to ``<LOSSES_DIR>/<file_name>.txt``."""
        losses_dir = self._output_dir("LOSSES_DIR", "losses")
        # `with` guarantees the file is closed even if the write fails.
        with open(f'{losses_dir}/{file_name}.txt', 'a') as file:
            file.write(losses_string)
            file.write("\n")

    def save_generated_passwords(self, passwords, file_name):
        """Write decoded passwords, one per line, to ``<GENERATED_DIR>/<file_name>.txt``.

        :param passwords: iterable of character-id sequences, or None (treated as empty)
        :param file_name: file name without extension
        """
        generated_dir = self._output_dir("GENERATED_DIR", "generated")
        # Invert the vocabulary once instead of once per password.
        id2char = {index: char for char, index in self.char2id.items()}
        with open(f'{generated_dir}/{file_name}.txt', 'w') as file:
            for password in (passwords if passwords is not None else []):
                file.write(self._convert_password_float_vector_to_string(password, id2char))
                file.write("\n")

    def _get_probability_of_character(self, character):
        """
            The soft-max output of PassGAN
            acts as posterior distribution over character set.

            Placeholder: the lookup in the posterior distribution is not
            implemented yet, every character gets the constant probability 0.5.
        :param character: character whose probability is requested
        :return: probability of the character (currently always 0.5)
        """
        return 0.5

    def get_prediction_probability(self, password: str, charset: list):
        """
            Algorithm 1: Get prediction probability
            Result: score
            Input : password, model, charmap,;
            prob = 1;
            for char in password do
                char_prob = model.getProbability(char);
                prob = prob × char_prob
            end
            score = -log(prob)

            The score is accumulated as a sum of logarithms, which is the same
            quantity but does not underflow to log(0) for long passwords.

        :param password: password to score
        :param charset: currently unused
        :return: -log of the product of the per-character probabilities
            (0.0 for an empty password, inf if any character has probability 0)
        """
        score = 0.0
        for char in password:
            char_prob = self._get_probability_of_character(character=char)
            if char_prob <= 0:
                # log(0) is undefined; an impossible character makes the password impossible.
                return float("inf")
            score -= log(char_prob)
        return score



In [None]:
# Build the trainer (this instantiates both networks and their optimizers).
training = TrainGAN()

# Train on the batched password dataset and the character -> id mapping prepared
# by the data-loading cell, for the number of epochs set in GANConfig.
training.train(dataset=ds, char2id=char2id, epochs=GANConfig.EPOCHS)