In [None]:
import os
import argparse
import torch
from transformers import GPT2LMHeadModel
from pathlib import Path

from transformers import RobertaTokenizerFast

import numpy as np
import random
from tqdm import trange
import string

In [None]:
# CONSTANTS
# Identifier passed to `from_pretrained` for both the tokenizer and the model
MODEL_PATH = "javirandor/passgpt-10characters"
# The tokenizer is configured with max_len = MAXCHARS + 2
# (presumably the password length limit plus BOS/EOS -- TODO confirm)
MAXCHARS = 10
# Use the GPU when one is available; otherwise fall back to CPU so the
# notebook still runs end-to-end on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the tokenizer from the model path.
# The options are collected in one dict so they are easy to inspect and tweak.
tokenizer_kwargs = {
    "max_len": MAXCHARS + 2,
    "padding": "max_length",
    "truncation": True,
    "do_lower_case": False,
    "strip_accents": False,
    "mask_token": "<mask>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "truncation_side": "right",
}
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_PATH, **tokenizer_kwargs)

In [3]:
# Load your PassGPT model, switch it to evaluation mode and move it to DEVICE
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)
model = model.eval()
model = model.to(DEVICE)

# Conditional Password Generation
One of the main advantages of PassGPT over PassGAN is the possibility of generating passwords under arbitrary constraints. In this template code, we have created five different groups of characters that we can sample from at each position:
* `l`: lowercase letters
* `u`: uppercase letters
* `d`: digits
* `p`: punctuation
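* `*`: any token in the vocabulary except the end-of-sequence token (any template character other than `l`, `u`, `d` or `p` is treated the same way)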

You can create any template by combining these. For example, `lllldd` will generate passwords starting with four lowercase letters and finishing with two digits.

Feel free to create your own character groups below.

In [23]:
# Map each of the desired character groups into their corresponding ids (as given by the tokenizer)

# Character groups, one list of single-character strings per group
lowercase, uppercase, digits, punctuation = (
    list(alphabet)
    for alphabet in (
        string.ascii_lowercase,
        string.ascii_uppercase,
        string.digits,
        string.punctuation,
    )
)

# Token ids for each group: the tokenizer is called on the list of characters,
# without special tokens, and returns one id list per character
lowercase_tokens, uppercase_tokens, digits_tokens, punctuation_tokens = (
    tokenizer(group, add_special_tokens=False).input_ids
    for group in (lowercase, uppercase, digits, punctuation)
)

In [54]:
# All possible tokens in our model: every id in range(len(tokenizer)),
# each wrapped in its own one-element list
all_tokens = [[token_id] for token_id in range(len(tokenizer))]

In [121]:
def conditional_generation(template, num_generations=1):
    """Sample passwords whose characters follow a per-position template.

    Each template character selects the set of tokens allowed at that position:

    * ``"l"`` -- tokens in ``lowercase_tokens``
    * ``"u"`` -- tokens in ``uppercase_tokens``
    * ``"d"`` -- tokens in ``digits_tokens``
    * ``"p"`` -- tokens in ``punctuation_tokens``
    * anything else (conventionally ``"*"``) -- every token except EOS

    The sequence is grown one token at a time with ``model.generate``; at each
    step all tokens outside the allowed set are passed as ``bad_words_ids``.

    Args:
        template: String of group symbols, one per password position.
        num_generations: Number of accepted samples to return.

    Returns:
        A tensor of token ids with one row per accepted sample and the leading
        BOS column removed. When ``num_generations <= 0`` an empty tensor with
        zero rows is returned.
    """
    # Nothing requested: return an empty batch instead of failing in torch.cat([]).
    if num_generations <= 0:
        return torch.empty((0, len(template)), dtype=torch.long, device=DEVICE)

    eos_id = tokenizer.eos_token_id

    allowed_by_symbol = {
        "l": lowercase_tokens,
        "u": uppercase_tokens,
        "d": digits_tokens,
        "p": punctuation_tokens,
    }

    # The banned list depends only on the template symbol, so build it once per
    # distinct symbol rather than once per position of every sample (each build
    # scans the whole vocabulary).
    banned_by_symbol = {}
    for symbol in set(template):
        allowed = allowed_by_symbol.get(symbol)
        if allowed is None:
            # Wildcard / unrecognised symbol: only EOS is forbidden.
            banned_by_symbol[symbol] = [[eos_id]]
        else:
            banned_by_symbol[symbol] = [tok for tok in all_tokens if tok not in allowed]

    generations = []
    while len(generations) < num_generations:
        # Every sample starts from a single BOS token, shape (1, 1).
        generation = torch.tensor([[tokenizer.bos_token_id]])

        # target_length counts the BOS token, hence it starts at 2 so that
        # exactly one new token is sampled per template position.
        for target_length, symbol in enumerate(template, start=2):
            generation = model.generate(
                generation.to(DEVICE),
                do_sample=True,
                max_length=target_length,
                pad_token_id=tokenizer.pad_token_id,
                num_return_sequences=1,
                bad_words_ids=banned_by_symbol[symbol],
            )

        # Reject any sample that contains EOS and draw again.
        # NOTE(review): the original compared against the literal 2, presumably
        # the EOS id of this tokenizer -- confirm tokenizer.eos_token_id == 2.
        if eos_id not in generation.flatten():
            generations.append(generation)

    # Stack the samples and drop the BOS column.
    return torch.cat(generations, 0)[:, 1:]

In [136]:
# Four uppercase letters, two unconstrained tokens, then two digits; draw 5 samples
generations = conditional_generation(template="uuuu**dd", num_generations=5)

In [137]:
# Decode the generated token ids back into strings (displayed as the cell output)
tokenizer.batch_decode(generations)

['PARLA198', 'ANTHON64', 'JRWFX786', 'CELAN777', 'QWER1234']