# Password Strength Estimation Model (Training)

In [None]:
!pip install torch transformers datasets tokenizers matplotlib

In [None]:
import os
import torch
import random
import time
import yaml
import shutil
import torch
import numpy as np
from zxcvbn import zxcvbn
from transformers import GPT2LMHeadModel, GPT2Config, RobertaTokenizerFast, TrainingArguments, Trainer
from transformers.data.data_collator import DataCollatorForLanguageModeling, _torch_collate_batch
from tokenizers import ByteLevelBPETokenizer, trainers
from dataclasses import dataclass
from datasets import load_dataset, disable_caching
from pathlib import Path
from datetime import timedelta
import matplotlib.pyplot as plt

# Train Custom Tokenizer

Implement a character-level tokenizer specifically for passwords. This approach ensures each character is treated as a separate token, unlike typical NLP tokenizers that group letters into words. This method maintains a more accurate probability distribution for password modeling.

In [None]:
class PassTokenizer(ByteLevelBPETokenizer):
    """Byte-level BPE tokenizer with a custom training routine for passwords.

    ``train_from_iterator`` builds its ``BpeTrainer`` with an empty
    ``initial_alphabet``, so no symbols are seeded into the vocabulary by
    this class itself.
    """

    def train_from_iterator(
        self,
        iterator,
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens=None,
        length=None,
    ):
        """Train the model using the given iterator.

        Args:
            iterator: Iterable of training strings (one password per item).
            vocab_size: Target vocabulary size handed to the BPE trainer.
            min_frequency: Minimum frequency handed to the BPE trainer.
            show_progress: Whether the trainer displays a progress bar.
            special_tokens: Special tokens to register; ``None`` means none.
            length: Optional number of items in ``iterator``, forwarded to
                the underlying tokenizer.
        """
        # A ``None`` default avoids sharing one mutable list across calls.
        tokens = [] if special_tokens is None else list(special_tokens)

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=tokens,
            initial_alphabet=[],
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )

# Where the raw password list lives and where the tokenizer files are exported
train_path = "../bruteforce-database/uniqpass-v16-passwords.txt"  # Replace with your actual training dataset path
output_path = "./uniqpass-v16-passwords-tokenizer/"  # Replace with your actual output directory path

print("🤞 Reading passwords")

with open(train_path, "r", encoding="utf-8") as f:
    lines = f.read().splitlines()

# Keep only passwords whose characters all have code points strictly between 32 and 128
ascii_printable = [p for p in lines if all(32 < ord(c) < 128 for c in p)]

# Count the distinct characters; the vocabulary size below is derived from this count
all_chars = ''.join(ascii_printable)
unique_chars = set(all_chars)
count = len(unique_chars)
print(f"The number of distinct letters in all strings is {count}")

# Special tokens registered with the tokenizer ("</s>" marks the end of a password)
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

print("🤞 Training tokenizer")
tokenizer = PassTokenizer()

# One vocabulary slot per distinct character plus one per special token
tokenizer.train_from_iterator(
    ascii_printable,
    vocab_size=count + len(special_tokens),
    min_frequency=1,
    special_tokens=special_tokens,
)

print("🤞 Tokenizer trained with vocabulary")
vocab = tokenizer.get_vocab()
print(sorted(vocab, key=vocab.get))  # tokens ordered by their id

# Export the trained tokenizer files
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
tokenizer.save_model(str(output_dir))  # save_model is given a plain string path
print("✅ Tokenizer exported successfully")


# Data Pre-process

Some of the data preprocessing steps that can be done are:
1. Remove passwords with length smaller than 8
2. Remove passwords with length greater than 64
3. Remove passwords with non-ASCII characters
4. Remove passwords with non-printable characters
5. Remove passwords with non-alphanumeric characters
6. Remove passwords with repeating characters
7. Remove passwords with repeating patterns
8. Remove duplicates.

In [None]:
# Load uniqpass-v16-passwords and keep only the entries that are shorter than 16 characters
# and whose characters all have code points strictly between 32 and 128
print("🤞 Reading passwords")
with open("../bruteforce-database/uniqpass-v16-passwords.txt", "r", encoding="utf-8") as f:
    lines = f.read().splitlines()

# Single pass: the cheap length test runs first, then the per-character test
ascii_printable = [
    p
    for p in lines
    if len(p) < 16 and all(32 < ord(c) < 128 for c in p)
]

# Report how many passwords survived the filters
count = len(ascii_printable)
print(f"The number of passwords in ascii_printable is {count}")

# Report how many of them are distinct
myset = set(ascii_printable)
setcount = len(myset)
print(f"The number of passwords in myset is {setcount}")


# Data Loader and Configuration

Password lists are generally line-separated and can be loaded as plain text. Our custom data collator, `PasswordDataCollator`, then batches the tokenized passwords for training.

Configuration can be loaded in via the `config.yaml` file.

In [None]:
%%writefile config.yaml
# Details for model architecture. Set parameters directly for GPT2Config (https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config)
model_args:
    n_head: 12 # Number of attention heads for each attention layer in the Transformer encoder.
    n_layer: 8 # Number of transformer layers

# Execution-wide parameters
config_args:
    seed: 14
    maxchars: 16 # Maximum characters to be considered in your passwords
    subsample: -1 # -1 means no subsampling training data
    tokenizer_path: './uniqpass-v16-passwords-tokenizer' # Introduce the path or Hugging Face name for your tokenizer
    train_data_path: '../bruteforce-database/uniqpass-v16-passwords.txt' # Path to your training data

# Set parameters directly for TrainingArguments (https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)
training_args:
    per_device_train_batch_size: 512 # Batch size per GPU/CPU for training
    gradient_accumulation_steps: 4 # Number of updates steps to accumulate before performing a backward/update pass.
    logging_steps: 50 # Number of steps between logging
    save_total_limit: 1 # Limit the total amount of checkpoints. Deletes the older checkpoints.
    num_train_epochs: 1 # Number of training epochs
    overwrite_output_dir: true # Overwrite the content of the output directory
    fp16: false # Whether to use 16-bit (mixed) precision training instead of 32-bit training (Does not work on MPS devices)
    output_dir: './uniqpass-v16-passwords-trained' # Where to store your checkpoints
    report_to: "wandb" # options are "wandb", "tensorboard", "mlflow", "none"
    save_steps: 200  # Number of updates steps before two checkpoint saves.

In [None]:
@dataclass
class PasswordDataCollator(DataCollatorForLanguageModeling):
    """
    Custom data collator for this task.

    Any ``special_tokens_mask`` supplied with the examples is discarded. When masking is
    needed, a mask that flags only padding positions is built instead, so that the end of
    password token is not ignored (should also be predicted).
    """
    def torch_call(self, examples):
        """
        Collate ``examples`` into a batch with ``input_ids`` and ``labels``.

        Args:
            examples: A list of mappings (dicts / ``BatchEncoding`` objects) or a list of
                token-id sequences.

        Returns:
            The padded batch. With ``mlm=False`` the labels are a copy of ``input_ids``
            in which padding positions are replaced by ``-100``.
        """
        # Local import: ``Mapping`` covers both ``dict`` and ``BatchEncoding`` (a ``UserDict``).
        # The original check referenced ``BatchEncoding`` without importing it, which raised
        # ``NameError`` on every call.
        from collections.abc import Mapping

        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], Mapping):
            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {
                "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        # Drop any mask delivered with the examples; it is never used.
        batch.pop("special_tokens_mask", None)

        if self.mlm:
            # Custom special tokens mask: 1 on padding positions, 0 everywhere else.
            special_tokens_mask = torch.where(batch['input_ids'] != self.tokenizer.pad_token_id, 0, 1)
            batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                # Padding positions get -100 so that they are excluded from the labels.
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        return batch

    
class dotdict(dict):
    """Dictionary whose entries can also be read, written and deleted as attributes."""

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; a missing key yields None.
        return dict.get(self, name)

    def __setattr__(self, name, value):
        # Attribute assignment stores the value under the key of the same name.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the key; a missing key raises KeyError.
        dict.__delitem__(self, name)

# Load config from file
config_path = "./config.yaml"  # Replace with your actual config path

# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")  # Fallback when no accelerator is available
print(f"Using device: {device}")

# safe_load only builds plain Python objects, which is all this configuration needs
with open(config_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Expose the three configuration sections with attribute access
args = dotdict(config["config_args"])
model_args = dotdict(config["model_args"])
training_args = dotdict(config["training_args"])
training_args["seed"] = args.seed  # the trainer uses the same seed as the rest of the run

# Init random seeds
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Refuse to reuse an existing output directory. An explicit raise is used because
# assert statements are removed when Python runs with -O.
if os.path.exists(training_args.output_dir):
    raise FileExistsError("The provided output path already exists, please provide a unique path.")
Path(training_args.output_dir).mkdir(parents=True, exist_ok=True)


# Tokenize the Data

Here we use the custom tokenizer to tokenize the passwords into our training set.

In [None]:

# Sequence length seen by the model: the password characters plus the start and end tokens
TOKENIZER_MAX_LEN = args.maxchars + 2

print("🤞 Loading tokenizer")

# Keyword arguments forwarded unchanged to RobertaTokenizerFast.from_pretrained
tokenizer_kwargs = dict(
    max_len=TOKENIZER_MAX_LEN,
    padding="max_length",
    truncation=True,
    do_lower_case=False,
    strip_accents=False,
    mask_token="<mask>",
    unk_token="<unk>",
    pad_token="<pad>",
    truncation_side="right",
)
tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer_path, **tokenizer_kwargs)

# The dataset itself is loaded and tokenized further below
print("🤞 Loading data")

def preprocess_function(entries):
    """
    Tokenize a batch of passwords.

    Every password in ``entries['text']`` is cut to ``args.maxchars`` characters and wrapped
    in the literal ``<s>`` / ``</s>`` markers. The resulting strings are tokenized without
    automatically added special tokens, and padded/truncated to ``TOKENIZER_MAX_LEN``.
    """
    char_limit = args.maxchars
    wrapped = []
    for password in entries['text']:
        wrapped.append(f"<s>{password[:char_limit]}</s>")

    encoded = tokenizer(
        wrapped,
        truncation=True,
        padding="max_length",
        max_length=TOKENIZER_MAX_LEN,
        add_special_tokens=False,
        return_special_tokens_mask=False,
    )
    return encoded

# Load the raw password list as a text dataset
data_files = {'train': [args.train_data_path]}
dataset = load_dataset('text', data_files=data_files)
print("Dataset loaded with {} entries".format(len(dataset["train"])))

if args.subsample > 0:
    print("Subsampling dataset to {} random entries".format(args.subsample))
    # Shuffle before selecting so the subset is actually random (the previous code took the
    # first N rows), and cap N so a request larger than the dataset does not raise.
    n_keep = min(args.subsample, len(dataset['train']))
    dataset['train'] = dataset['train'].shuffle(seed=args.seed).select(range(n_keep))

# Process data
print("🤞 Processing data")
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_datasets = tokenized_datasets.shuffle(seed=args.seed)

# Format data
tokenized_datasets.set_format(type="torch")

print("🤞 Initializing model")

# Model architecture: vocabulary and special token ids come from the tokenizer,
# everything else from the `model_args` section of the configuration
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    **model_args
)

model = GPT2LMHeadModel(config).to(device)
print("Model initialized with {} parameters".format(sum(t.numel() for t in model.parameters())))

print("🤞 Preparing training")
# Define the data collator. mlm=False selects the branch that copies input_ids into labels.
data_collator = PasswordDataCollator(
    tokenizer=tokenizer, mlm=False
)

train_args = TrainingArguments(**training_args)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=train_args,
    train_dataset=tokenized_datasets["train"]
)

print("🚀 Launching training")
start = time.time()
trainer.train()
end = time.time()

print("👍 Training completed after {}. Storing last version.".format(str(timedelta(seconds=end-start))))
model.save_pretrained(os.path.join(training_args.output_dir, "last"))

# Comment out next lines if you want to keep several checkpoints.
print("🗑️ Deleting previous checkpoints")
checkpoints = [i for i in os.listdir(training_args.output_dir) if i.startswith("checkpoint")]
for c in checkpoints:
    shutil.rmtree(os.path.join(training_args.output_dir, c))

print("✅ Training finished successfully :)")


# Test The Model

This model can now be used for single or conditional generation of passwords. The model can be used to generate passwords from scratch or to generate passwords based on a prefix.

## Single Generation of Password

In [None]:
import torch
from transformers import GPT2LMHeadModel, RobertaTokenizerFast

NUM_GENERATIONS = 1
MAX_CHARS = 10

# Every generated sequence holds at most MAX_CHARS characters plus the start and end tokens
sequence_length = MAX_CHARS + 2

tokenizer = RobertaTokenizerFast.from_pretrained(
    "./uniqpass-v16-passwords-tokenizer/",
    max_len=sequence_length,
    padding="max_length",
    truncation=True,
    do_lower_case=False,
    strip_accents=False,
    mask_token="<mask>",
    unk_token="<unk>",
    pad_token="<pad>",
    truncation_side="right",
)

model = GPT2LMHeadModel.from_pretrained("./uniqpass-v16-passwords-trained/last")
model.eval()

# Sample starting from a prompt that contains only the beginning of password token
prompt = torch.tensor([[tokenizer.bos_token_id]])
g = model.generate(
    prompt,
    do_sample=True,
    num_return_sequences=NUM_GENERATIONS,
    max_length=sequence_length,
    pad_token_id=tokenizer.pad_token_id,
    bad_words_ids=[[tokenizer.bos_token_id]],  # the start token must not be generated again
)

# Drop the leading start token from every sequence
g = g[:, 1:]

decoded = tokenizer.batch_decode(g.tolist())
# Keep only the text in front of the first end of password token
decoded_clean = [text.partition("</s>")[0] for text in decoded]

# Print your sampled password
print(decoded_clean)

## Multi Generation of Passwords

Here we can generate multiple passwords at once. This can be used to generate a large number of passwords for a password list for log evaluation to determine the strength of the password generation abilities of the model.

In [None]:
import os
import torch
from transformers import GPT2LMHeadModel, RobertaTokenizerFast
from datasets import load_dataset
from pathlib import Path
import numpy as np
import random
from tqdm.auto import tqdm

# Set your parameters here
model_path = "./uniqpass-v16-passwords-trained/last"  # Replace with your model path
tokenizer_path = "./uniqpass-v16-passwords-tokenizer/"  # Assuming tokenizer is saved at model path
train_data_path = "../bruteforce-database/uniqpass-v16-passwords.txt"  # Replace with your training data path
eval_data_path = "../bruteforce-database/38650-password-sktorrent.txt"  # Replace with your evaluation data path
out_path = "./generated_passwords_uniqpass-v16-passwords_trained_1000000/"  # Replace with your output path
filename = "passwords.txt"
maxchars = 16  # Set your max characters
num_generate = 1000000  # Number of passwords to generate
batch_size = 512  # Batch size for generation
num_beams = 1
top_p = 95  # Given in percent; divided by 100 when passed to generate()
top_k = None
temperature = 1.2
seed_offset = 42  # Seed for randomness

# Set device to MPS if available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# device = "cpu"

# Init random seeds
random.seed(seed_offset)
np.random.seed(seed_offset)
torch.manual_seed(seed_offset)

# Ensure output path exists, and refuse to overwrite an existing password file.
# An explicit raise is used because assert statements are removed when Python runs with -O.
os.makedirs(out_path, exist_ok=True)
if os.path.isfile(os.path.join(out_path, filename)):
    raise FileExistsError("The provided output path already exists, please provide a unique path.")

# Load tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path,
                                                 max_len=maxchars+2,
                                                 padding="max_length",
                                                 truncation=True,
                                                 do_lower_case=False,
                                                 strip_accents=False,
                                                 mask_token="<mask>",
                                                 unk_token="<unk>",
                                                 pad_token="<pad>",
                                                 truncation_side="right")

# Load model
model = GPT2LMHeadModel.from_pretrained(model_path).eval().to(device)

# Passwords generation
generations = []

# The prompt is the same for every batch: a single beginning of password token
prompt = torch.tensor([[tokenizer.bos_token_id]]).to(device)

# Step through the request in batch_size chunks. The last chunk may be smaller, so exactly
# num_generate passwords are produced (integer division used to drop the remainder).
batch_starts = range(0, num_generate, batch_size)
for i, start in enumerate(tqdm(batch_starts, desc="Generating passwords")):
    # Set seed for reproducibility
    torch.manual_seed(seed_offset + i)
    current_batch_size = min(batch_size, num_generate - start)

    with torch.no_grad():
        # Generate tokens sampling from the distribution of codebook indices
        g = model.generate(prompt,
                           do_sample=True,
                           max_length=maxchars+2,
                           pad_token_id=tokenizer.pad_token_id,
                           bad_words_ids=[[tokenizer.bos_token_id]],
                           num_return_sequences=current_batch_size,
                           num_beams=num_beams,
                           top_p=top_p / 100,
                           top_k=top_k,
                           temperature=temperature)

        # Remove start of sentence token
        g = g[:, 1:]

    decoded = tokenizer.batch_decode(g.tolist())
    decoded_clean = [text.split("</s>")[0] for text in decoded]  # Get content before end of password token

    generations += decoded_clean

# Store passwords
with open(os.path.join(out_path, filename), 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in generations)

# Log information
num_generated = len(generations)
unique_generations = set(generations)  # built once and reused for every intersection below
num_unique = len(unique_generations)
perc_unique = num_unique / num_generated * 100 if num_generated else 0.0

data_files = {}

if train_data_path:
    data_files["train"] = [train_data_path]

if eval_data_path:
    data_files["eval"] = [eval_data_path]

if data_files:
    # latin-1 maps every byte to a character, so decoding cannot fail on unusual bytes
    dataset = load_dataset('text', data_files=data_files, encoding="latin-1")

    if train_data_path:
        train_passwords = set(dataset["train"]["text"])
        inter_with_train = len(train_passwords & unique_generations)

    if eval_data_path:
        eval_passwords = set(dataset["eval"]["text"])
        inter_with_eval = len(eval_passwords & unique_generations)

# Log details
with open(os.path.join(out_path, f"log_{filename}"), 'w', encoding="utf-8") as f:
    f.write(f"Passwords generated using model at: {model_path}\n")
    f.write(f"Number of passwords generated: {num_generated}\n")
    f.write(f"{num_unique} unique passwords generated => {perc_unique:.2f}%\n")
    if train_data_path:
        # max(..., 1) keeps the percentage defined when the reference set is empty
        f.write(f"{inter_with_train} passwords were found in the training set. {100 * inter_with_train / max(len(train_passwords), 1):.5f}% of the train set guessed.\n")
    if eval_data_path:
        f.write(f"{inter_with_eval} passwords were found in the test set. {100 * inter_with_eval / max(len(eval_passwords), 1):.5f}% of the test set guessed.\n")


# Visualize the results of batching process over time

In [None]:
# Guess budgets from 10^3 to 10^6
number_of_guesses = [1000, 10000, 100000, 1000000]  # Number of guesses

# Extracted from the log files
unique_passwords_percent = [100.00, 100.00, 99.85, 98.95]  # Corresponding percentage of unique passwords

# Create Chart to Show Percentage of Unique Passwords vs. Number of Guesses
plt.figure(figsize=(10, 6))

# Plotting the data with a label for the legend
plt.plot(number_of_guesses, unique_passwords_percent, marker='o', linestyle='-', color='blue', label='GPTCracker')

# Setting the x-axis to logarithmic scale
plt.xscale('log')

# Span exactly the measured range so that no data point is cropped
# (the previous lower limit of 10^4 hid the 10^3 measurement)
plt.xlim(min(number_of_guesses), max(number_of_guesses))

# Whole-number y ticks from 5 points below the lowest value up to and including 100
lowest_tick = np.floor(min(unique_passwords_percent)) - 5
plt.yticks(np.arange(lowest_tick, 101, 1))

# Adding labels and title
plt.xlabel('Number of Guesses (Log Scale)')
plt.ylabel('Percentage of Unique Passwords (%)')
plt.title('Percentage of Unique Passwords vs. Number of Guesses')

# Adding the legend to the top right
plt.legend(loc='upper right')

# Show the plot with a grid
plt.grid(True)
plt.show()


## Password Strength Classification

Here we can utilize the model to classify the strength of a password by extracting the log likelihood and determine if it is high or low to numerically understand the strength of the password. 

We can then compare the strength of the password through open-source tools like `zxcvbn` to determine if the model is generating strong passwords.

In [None]:
# Load the trained model and switch it to evaluation mode
model = GPT2LMHeadModel.from_pretrained("./uniqpass-v16-passwords-trained/last")
model.eval()

# Load the tokenizer that was exported alongside it
tokenizer = RobertaTokenizerFast.from_pretrained("./uniqpass-v16-passwords-tokenizer/")

def calculate_password_probability(model, tokenizer, password):
    """Return the probability the model assigns to ``password``.

    The password is encoded with ``tokenizer.encode``; every token after the first is
    scored with the model's prediction at the preceding position, and the per-token
    probabilities are multiplied together.

    Args:
        model: Callable that takes the token ids and returns an object with a ``logits``
            tensor of shape (batch, sequence, vocabulary).
        tokenizer: Object whose ``encode(password, return_tensors="pt")`` returns the
            token ids as a (1, sequence) tensor.
            NOTE(review): assumes the encoding starts with the BOS token - confirm for
            the tokenizer in use.
        password: The password string to score.

    Returns:
        The sequence probability as a Python float.
    """
    # Tokenize the password and convert to tensor format
    input_ids = tokenizer.encode(password, return_tensors="pt")

    # Get the model's logit predictions for the password
    with torch.no_grad():
        logits = model(input_ids).logits

    # Work with log-probabilities: multiplying many small float32 probabilities
    # underflows to 0.0 for unlikely passwords, while adding their logarithms does not.
    log_probs = torch.log_softmax(logits, dim=-1)

    # The prediction at position t scores the token at position t + 1, so the first
    # token is only used as context and never scored itself.
    next_tokens = input_ids[:, 1:].unsqueeze(-1)
    token_log_probs = log_probs[:, :-1].gather(2, next_tokens).squeeze(-1)

    # Sum in float64 on the CPU and exponentiate once at the end
    total_log_prob = token_log_probs.cpu().double().sum()
    return torch.exp(total_log_prob).item()


def assess_password_strength(password, model, tokenizer):
    """Combine the zxcvbn score with the model's probability for ``password``.

    The zxcvbn score is an integer from 0 to 4:
        0 - very weak (might be cracked in a few guesses)
        1 - weak (might be cracked in minutes)
        2 - fair (might be cracked in hours)
        3 - strong (might be cracked in days)
        4 - very strong (might be cracked in centuries)

    Returns:
        A tuple ``(combined_strength, password_probability, zxcvbn_strength)`` where
        ``combined_strength`` is the zxcvbn score multiplied by ``1 - password_probability``.
    """
    score = zxcvbn(password)['score']
    probability = calculate_password_probability(model, tokenizer, password)

    # A password the model finds likely pulls the combined score down
    improbability = 1 - probability
    combined = score * improbability

    return combined, probability, score


import csv

# Input password list and output CSV.
# NOTE(review): the input directory ends in "_10000" while the output directory ends in
# "_1000" - both paths are kept as they were; confirm that this is intended.
passwords_path = "./generated_passwords_uniqpass-v16-passwords_trained_10000/passwords.txt"
strength_csv_path = "./generated_passwords_uniqpass-v16-passwords_trained_1000/passwords_strength.csv"

# Load the generated passwords, one per line
with open(passwords_path, "r", encoding="utf-8") as f:
    passwords = f.read().splitlines()

# passwords = passwords + ["123456", "123456789", "qwerty", "password", "12345", "qwerty123", "1q2w3e", "12345678", "111111", "1234567890"]

# Make sure the output directory exists before appending to the CSV
os.makedirs(os.path.dirname(strength_csv_path), exist_ok=True)

# Write the column names only when the file is new or empty, so that re-running the cell
# does not insert a second header in the middle of the data
needs_header = not os.path.isfile(strength_csv_path) or os.path.getsize(strength_csv_path) == 0

# Open the file once for the whole loop. csv.writer quotes passwords that contain commas
# or quotes, which the hand-built rows did not.
with open(strength_csv_path, "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")

    if needs_header and passwords:
        writer.writerow(["password", "combined_strength", "password_probability", "zxcvbn_strength"])

    # Loop over passwords and get strength
    for password in passwords:
        combined_strength, password_probability, zxcvbn_strength = assess_password_strength(password, model, tokenizer)
        print(f"Combined strength score for password '{password}': {combined_strength} - Prob: {password_probability}, Zxcvbn: {zxcvbn_strength}")
        writer.writerow([password, combined_strength, password_probability, zxcvbn_strength])