### Configuration
If running with GPT2-generated responses, you must first run `pip install transformers accelerate`.

In [None]:
# When True, `transformers` is imported, the "gpt2" model is loaded, and the
# free-text answers come from get_random_fake_fields_GPT(); when False the
# answers come from get_random_fake_fields() only.
GENERATE_RESPONSES_GPT2 = True
# When True, the submission loop tries the reCAPTCHA through RecaptchaSolver.
SOLVE_CAPTCHAS = False # Seemingly not required for this form

### Imports

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions 
from selenium.webdriver.support.select import Select 
from selenium_recaptcha_solver import RecaptchaSolver
from selenium.common.exceptions import TimeoutException
import time
import os
from pathlib import Path

import pandas as pd
import numpy as np
from faker import Faker

if GENERATE_RESPONSES_GPT2:
    from transformers import AutoTokenizer, AutoModelForCausalLM

### Load data

In [None]:
# Table of pre-generated fake people, read from a CSV file
fake_people = pd.read_csv("fake_details/FakeNameGenerator.com.csv")
print(f"Loaded {len(fake_people)} fake people")

# One entry per '.txt' file in the 'fake_fields' folder: the key is the file
# name without its extension, the value is the file's lines with surrounding
# whitespace stripped.
fields_info: dict[str, list[str]] = {}
for filepath in Path("fake_fields").iterdir():
    if filepath.suffix.lower() != ".txt":
        continue
    with filepath.open() as f:
        fields_info[filepath.stem] = [line.strip() for line in f]
print(f"Loaded {len(fields_info)} fields: {list(fields_info.keys())}")

### Helpers

In [None]:
def get_random_browser_agent() -> str:
    """Return the 'BrowserUserAgent' value of one uniformly chosen row of `fake_people`."""
    row = np.random.choice(len(fake_people))
    agents = fake_people['BrowserUserAgent']
    return agents.iloc[row]

def get_random_fake_fields() -> pd.Series:
    """Return a Series holding one uniformly chosen line for every key of `fields_info`."""
    picked = pd.Series(index = fields_info.keys(), dtype=str)
    for field_name, candidates in fields_info.items():
        choice_idx = np.random.choice(len(candidates))
        picked[field_name] = candidates[choice_idx]
    return picked

def pick_at_least_one(elements: list) -> np.ndarray:
    """
    Return a random, non-empty selection of `elements` without repeats.

    The selection size is drawn uniformly from 1..len(elements).
    """
    subset_size = 1 + np.random.choice(len(elements))
    return np.random.choice(elements, size=subset_size, replace=False)

def random_chance(p: float = 0.5) -> bool:
    """Return True when a fresh `np.random.random()` draw is strictly below `p`."""
    draw = np.random.random()
    return draw < p

# Lower-case words whose presence in an entity name marks it as a school
school_keywords = {'school', 'schools', 'university', 'college', 'academy', 'education', 'educational'}
def is_probably_school(name: str) -> bool:
    """
    Guess whether an entity name refers to a school.

    The name is lower-cased and split on every non-alphanumeric character, so
    a keyword is still recognised when it sits next to punctuation such as
    '.', '(', '-' or '/' (not only ','). Returns True when at least one of
    the resulting words is in `school_keywords`.

    >>> is_probably_school('Wolris Academy, for the beans')
    True
    >>> is_probably_school('Government of Utah')
    False
    """
    normalized = ''.join(ch if ch.isalnum() else ' ' for ch in name.lower())
    return not school_keywords.isdisjoint(normalized.split())

def _get_fake_phone_number():
    """
    Builds a randomly formatted phone number with a Utah area code.

    The number is an area code, a three-digit group whose first digit is 1-9,
    and a four-digit group, joined by one randomly chosen separator
    ('-', '.', '' or ' '). The area code is wrapped in parentheses 40% of the
    time and a leading "+1" group is added 20% of the time.
    """
    area_code = np.random.choice(["801", "385", "435"])  # Utah area codes
    leading_digit = str(np.random.choice(9) + 1) # Can't be zero
    trailing_digits = ''.join(map(str, np.random.choice(10, 6)))
    separator = np.random.choice(['-', '.', '', ' '])
    if random_chance(0.4):
        area_code = f"({area_code})"

    # Small chance of including country code
    parts = []
    if random_chance(0.2):
        parts.append("+1")
    parts += [
        area_code,
        leading_digit + trailing_digits[:2],
        trailing_digits[2:],
    ]

    return separator.join(parts)

faker = Faker()
def _get_fake_address():
    """
    Builds a one-line address from `faker.address()`.

    Returns only the first address line 15% of the time (or whenever Faker
    produced a single line). Otherwise, if the second line contains a comma,
    then 95% of the time the two characters following ", " are overwritten
    with 'UT' and everything after the last space is replaced with a random
    entry of `fields_info['zip_codes']`. The two lines are joined with ", ".
    """
    lines = faker.address().split('\n')
    street = lines[0]

    # Sometimes just return one address line to spice things up a bit
    street_only = random_chance(0.15)
    if street_only or len(lines) == 1:
        return street

    locality = lines[1]
    # Replace state with UT 95% of the time
    if ',' in locality and random_chance(0.95):
        # State abbreviation: assumed to be the two characters right after ", "
        state_start = locality.index(',') + 2
        locality = f"{locality[:state_start]}UT{locality[state_start + 2:]}"

        # Zip code: replaces whatever follows the last space
        zip_code = np.random.choice(fields_info['zip_codes'])
        locality = locality[:locality.rfind(' ') + 1] + zip_code

    return f"{street}, {locality}"

def subsample_string(s: str, max_letters: int = 5, min_letters: int = 3, chance_force_start_zero: float = 0.0) -> str:
    """
    Return a random contiguous slice of `s`.

    The slice length is `min_letters` plus a uniform draw from
    0..max(len(s), max_letters - 1) - 1, capped at len(s). Note that
    `max_letters` therefore only widens the draw range for short strings; it
    is NOT an upper bound on the result length. A `min_letters` of at least
    len(s) always yields the whole string.

    The start offset is drawn from the first half of `s`, clamped so that at
    least `min_letters` characters remain when possible, and is forced to 0
    with probability `chance_force_start_zero`.

    Empty and one-character strings are returned unchanged (previously they
    raised ValueError because np.random.choice(0) is invalid).
    """
    if not s:
        return s
    n_letters = min(len(s), np.random.choice(max(len(s), max_letters - 1)) + min_letters)
    # len(s) // 2 is 0 for a one-character string; clamp so the draw stays valid
    start = np.random.choice(max(1, len(s) // 2))
    start = max(0, min(start, len(s) - min_letters))
    # No draw is needed when forcing is disabled (a chance of 0 can never hit)
    if chance_force_start_zero and random_chance(chance_force_start_zero):
        start = 0
    return s[start : start + n_letters]

def _get_email_from_name(name: str) -> str:
    """
    Derive a randomised e-mail address from a person's name.

    Uses the first two whitespace-separated words of the lower-cased name
    (the first word twice if there is only one). Each word is passed through
    `subsample_string`, then the pieces are optionally swapped, suffixed with
    a number, capitalised, joined with '_', '-' or '', and lightly
    letter-to-digit substituted before a domain is appended.
    """
    # Get first and last name components
    parts = name.lower().split()
    first_name = parts[0]
    last_name = first_name if len(parts) <= 1 else parts[1]

    # 30% of the time min_letters is 99 (presumably longer than any name, so the whole word is kept)
    first_min = 99 if random_chance(0.3) else 3
    first_piece = subsample_string(first_name, min_letters=first_min, chance_force_start_zero=0.6)
    last_min = 99 if random_chance(0.3) else 3
    last_piece = subsample_string(last_name, min_letters=last_min, chance_force_start_zero=0.6)

    # Optionally swap first and last name components
    if random_chance(0.5):
        first_piece, last_piece = last_piece, first_piece

    # Optionally add numbers to last name component
    if random_chance(0.25):
        num = np.random.choice(100)
        # NOTE(review): the `:00` spec appears to apply no zero-padding -- confirm whether `:02` was intended
        last_piece = last_piece + f"{num:00}"

    # Optionally capitalize first and last name components
    if random_chance(0.2):
        first_piece = first_piece.capitalize()
    if random_chance(0.1):
        last_piece = last_piece.capitalize()

    # Get email name (combined first and last name components)
    joiner = np.random.choice(['_', '-', ''], p=[0.4, 0.1, 0.5])
    local_part = joiner.join((first_piece, last_piece))

    # Optionally replace letters with numbers (8% chance per eligible letter)
    mapping = dict(zip("aeoli", "43011"))
    local_part = ''.join(
        mapping[ch] if ch in mapping and random_chance(0.08) else ch
        for ch in local_part
    )

    # Choose domain
    domain = np.random.choice(['gmail.com', 'outlook.com', 'byu.edu', 'utah.gov'], p=[0.75, 0.15, 0.05, 0.05])

    return f"{local_part}@{domain}"

def get_fake_identity() -> pd.Series:
    """
    Return a Series with 'name', 'phone', 'address' and 'email' entries.

    The name comes from Faker; the e-mail is derived from that same name.
    """
    full_name = faker.name()
    identity = {
        'name': full_name,
        'phone': _get_fake_phone_number(),
        'address': _get_fake_address(),
        'email': _get_email_from_name(full_name),
    }
    return pd.Series(identity)

def get_new_webdriver():
    """Start and return a Chrome WebDriver configured with a random user-agent string."""
    chrome_options = Options()
    user_agent = get_random_browser_agent()
    chrome_options.add_argument(f"user-agent={user_agent}")
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

### Instantiate GPT2 Model (Optional)

In [None]:
if GENERATE_RESPONSES_GPT2:
    # Instantiate model
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side='left')
    tokenizer.pad_token = tokenizer.eos_token  # pad with the end-of-text token

    # Basic prompts (can make more complicated); each one is repeated 50 times
    gpt_prompts = 50 * [
        "I saw a blue person enter the wrong bathroom. It was terrible.",
        "There was a mean person using the wrong bathroom. It was so bad.",
    ]

    # Shared pool of generated continuations and a counter of how often it was used
    GPT_RESULTS = []
    gpt_use_counter = 0
    def regenerate_gpt_text():
        """
        Refill `GPT_RESULTS` with one sampled continuation per prompt.

        Resets `gpt_use_counter`, empties the pool, generates for the whole
        prompt batch, then stores each decoded text with the
        '<|endoftext|>' markers and the leading prompt removed.
        """
        global GPT_RESULTS, gpt_use_counter
        gpt_use_counter = 0
        GPT_RESULTS.clear()

        batch = tokenizer(gpt_prompts, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(
            batch['input_ids'],
            attention_mask = batch['attention_mask'],
            do_sample=True,
            temperature=0.8, # 0.9
            max_length=100,
        )
        decoded = tokenizer.batch_decode(generated)
        GPT_RESULTS.extend(
            text.replace('<|endoftext|>', '').removeprefix(prompt)
            for prompt, text in zip(gpt_prompts, decoded)
        )

    # Call once to create initial batch of GPT nonsense
    regenerate_gpt_text()

def get_random_fake_fields_GPT(regen_at: int = 100) -> pd.Series:
    """
    Return random fake fields with the free-text ones replaced by GPT output.

    Starts from `get_random_fake_fields()` and overwrites the 'evidence',
    'how', 'information', 'resolve' and 'who' entries with distinct texts
    drawn from `GPT_RESULTS`. Each text is stripped and cut after its last
    '.' or '!'; a text containing neither is kept whole (previously its final
    character was dropped).

    Parameters:
        regen_at: once this function has been called this many times since
            the last regeneration, `regenerate_gpt_text()` is invoked first.
    """
    # Regenerate tokens after a while
    global gpt_use_counter
    if gpt_use_counter >= regen_at:
        print(" -> Regenerating GPT text")
        regenerate_gpt_text()

    # Get random fake fields and replace relevant ones with GPT results
    gpt_fields = ['evidence', 'how', 'information', 'resolve', 'who']
    gpt_results = [str(x) for x in np.random.choice(GPT_RESULTS, len(gpt_fields), replace=False)]
    ser = get_random_fake_fields()
    for gpt_field, gpt_result in zip(gpt_fields, gpt_results):
        # Clean up the text a bit: cut any trailing partial sentence
        gpt_result = gpt_result.strip()
        final_punct_idx = max(gpt_result.rfind('.'), gpt_result.rfind('!')) + 1
        # rfind returns -1 when nothing is found, giving 0 here: leave the text untouched
        if final_punct_idx > 0:
            gpt_result = gpt_result[:final_punct_idx]
        ser[gpt_field] = gpt_result

    gpt_use_counter += 1
    return ser

### Helpful info about various fields

In [None]:
# XPath of each large free-text box on the form, paired with the key of the
# generated-fields Series whose text is typed into it.
REPORT_MAIN_TEXT_FIELDS = [
    # (field path, field name)
    ('//*[@id="cd_q1"]/div[1]', 'who'),
    ('//*[@id="cd_q2"]/div[1]', 'information'),
    ('//*[@id="cd_q3"]/div[1]', 'resolve'),
    ('//*[@id="cd_q4"]/div[1]', 'how'),
    ('//*[@id="cd_q5"]/div[1]', 'evidence'),
]

# Element ID of each identity input, paired with the key of the identity
# Series whose value is typed into it.
IDENTITY_TEXT_FIELDS = [
    ("00N1K00000fX1ND", "name"),
    ("00N1K00000fXXY3", "address"),
    ("00N1K00000fWywZ", "email"),
    ("00N1K00000fWywe", "phone"),
]

# Numbers n of the 'cb<n>' checkboxes eligible for every entity; at least one
# is always ticked. Commented-out numbers are never selected.
GOVT_CHECKBOXES = [
    1, # Allowed trans people to use the correct facilities (probably the main one to use here)
    2, # Lewdness, voyeurism, etc.
    # 3, # No Privacy compliance plan
    4, # No single occupant facility
    # 5, # Other violation
]

# Numbers n of the 'cb<n>' checkboxes additionally eligible when the chosen
# entity looks like a school. Commented-out numbers are never selected.
SCHOOL_CHECKBOXES = [
    6, # Failed to provide equal opportunity for boys & girls
    7, # Allowed trans people to play on the correct team
    8, # Allowed trans people to use the correct facilities
    # 9, # Didn't tell students / parents about the law's policies
]

# Element IDs of the identity-disclosure checkbox and the entity dropdown
ALLOW_IDENTITY_DISCLOSURE_CHECKBOX_ID = "00N1K00000fXXXy"
GOVERNMENT_ENTITY_DROPDOWN_ID = "00N1K00000fGn13"

### Begin submitting

In [None]:
# Main submission loop: repeatedly loads the form, fills every field with
# generated data, submits it and prints a running count. The loop has no exit
# condition other than the `break` in the CAPTCHA branch or an uncaught exception.
VERBOSE = True
# vprint prints only when VERBOSE is set; otherwise it is a one-argument no-op
vprint = print if VERBOSE else lambda x: None

DRIVER_USES_BEFORE_RESET = 6 # Tends to speed things up a bit because Chrome doesn't have to be restarted each time

score = 0  # number of submissions made so far
driver_use_counter = 0  # page loads performed with the current driver
driver = get_new_webdriver()
while True:
    # Start new webdriver or use one from previous loop iteration (resets every DRIVER_USES_BEFORE_RESET)
    if driver_use_counter >= DRIVER_USES_BEFORE_RESET:
        driver_use_counter = 0
        # NOTE(review): close() closes only the current window; quit() is the call
        # that ends the whole driver session -- confirm no driver processes are left behind.
        driver.close()
        driver = get_new_webdriver()
    driver.get("https://ut-sao-special-prod.web.app/sex_basis_complaint2.html")
    driver.maximize_window()
    driver_use_counter += 1
    solver = RecaptchaSolver(driver=driver)

    # Check if everything's loaded correctly
    # Retries every 0.5 s, without limit, until the first form button can be clicked.
    # NOTE(review): the bare `except:` also swallows KeyboardInterrupt, and
    # `dropdown` receives click()'s return value (None); it is reassigned below.
    fails = 0
    while True:
        try:
            dropdown = driver.find_element(By.XPATH, '//*[@id="form-row"]/form/div[1]/button').click()
            break
        except:
            fails += 1
            vprint(f" -> Failed to load page {fails} time{'s' if fails > 1 else ''}")
            time.sleep(0.5)
    vprint(" -> Page successfully loaded!")

    # Generate random text & govt entity results for this entry
    random_field_entries = get_random_fake_fields_GPT() if GENERATE_RESPONSES_GPT2 else get_random_fake_fields()
    
    # Select a government entity to report
    # The first dropdown option is skipped (presumably a placeholder entry -- confirm).
    dropdown = driver.find_element(By.ID, GOVERNMENT_ENTITY_DROPDOWN_ID)
    dropdown_options = dropdown.get_property("options")
    random_entity = np.random.choice(dropdown_options[1:]).get_property("value")
    Select(dropdown).select_by_value(random_entity)

    # Determine checkboxes to check
    cb_ids_to_check = []
    # Always check at least one school-unrelated checkbox
    cb_ids_to_check.extend([f'cb{n}' for n in pick_at_least_one(GOVT_CHECKBOXES)])
    # If entity is a school, check at least one school-related checkbox
    if is_probably_school(random_entity):
        cb_ids_to_check.extend([f'cb{n}' for n in pick_at_least_one(SCHOOL_CHECKBOXES)])
    # Optionally check identity disclosure checkbox 
    if random_chance():
        cb_ids_to_check.append(ALLOW_IDENTITY_DISCLOSURE_CHECKBOX_ID)

    # Actually check the boxes
    # Each box is first moved to, then clicked, via a fresh ActionChains
    for cb_id in cb_ids_to_check:
        cb_input = driver.find_element(By.ID, cb_id)
        action = ActionChains(driver)
        action.move_to_element(cb_input).perform()
        action.click(cb_input).perform()

    # Fill in all the big text fields
    for field_path, field_name in REPORT_MAIN_TEXT_FIELDS:
        field_box = driver.find_element(By.XPATH, field_path)
        field_box.send_keys(random_field_entries[field_name])

    # Fill in name, email, phone number
    fake_identity = get_fake_identity()
    for field_id, field_name in IDENTITY_TEXT_FIELDS:
        field_box = driver.find_element(By.ID, field_id)
        field_box.send_keys(fake_identity[field_name])

    # Check the final acknowledgement checkboxes
    # NOTE(review): `action` here is the ActionChains left over from the last
    # iteration of the checkbox loop above; it is reused for the rest of this pass.
    ack1 = driver.find_element(By.ID, "check_certify");
    action.move_to_element(ack1).perform()
    action.click(ack1).perform()

    ack2 = driver.find_element(By.ID, "check_certify_2");
    action.move_to_element(ack2).perform()
    action.click(ack2).perform()

    # Solve CAPTCHA (seems slightly buggy; tends to crash)
    if SOLVE_CAPTCHAS:
        try:
            recaptcha_iframe = driver.find_element(By.XPATH, '//*[@id="form-row"]/form/div[30]/div/div/iframe')
            action.move_to_element(recaptcha_iframe).perform()
            solver.click_recaptcha_v2(iframe=recaptcha_iframe)
        except TimeoutException as err:
            print(f" -> CAPTCHA TimeoutException: {err}")
        # NOTE(review): this `break` leaves the outer `while True` loop whenever
        # SOLVE_CAPTCHAS is enabled, so the submit step below is never reached
        # in that mode -- looks like a debugging leftover; confirm.
        break

    # SUBMIT!
    submit = driver.find_element(By.ID, "btn-submit-complaint2");
    action.move_to_element(submit).perform()
    action.click(submit).perform()
    time.sleep(2)  # fixed pause after clicking submit, before the next page load
    score = score + 1
    print(score)