In [2]:
%load_ext autoreload
%autoreload 2

# Find vocabulary nouns whose "N prep N" strings are never attested (count 0)
# for any of our target NPN prepositions.
# Additional experiment added for the CR version after ARR review.


In [6]:
from transformers import AutoTokenizer
import random
from rozlib.libs.api.api_infinigram import check_infinigram

from proj.cxs_are_revealed.paper.data_config import Exp4NPN
from lib.common.joint_vocab import get_vocab
from proj.cxs_are_revealed.paper.proj_common.npn_dataset_generation.gpt_sample import NPNGenerationConfig
from proj.cxs_are_revealed.paper.proj_common.npn_dataset_generation.gpt_sample import is_valid_for_experiment
from lib.common.joint_vocab import VocabItem
from typing import List, Set



In [None]:

# Settings for the zero-frequency NPN experiment. get_all_valid_nouns() reads
# `model`, `min_word_len` and `filter_likely_gerunds` from this object.
npn_config = NPNGenerationConfig(
    model="roberta-large",
    model_require_preceding_space=True,
    gpt_output_file=Exp4NPN.npn_gpt_outputs_v3_zero_freq,
    do_generate=False,
    compute_joint_vocab=False,
    min_word_len=0,
    filter_likely_gerunds=True,
)


In [7]:

# adapted from run() in make_npn_dataset_with_gpt.py
def get_all_valid_nouns():
    """Collect the tokenizer-vocabulary strings usable as NPN nouns.

    Loads the tokenizer named by ``npn_config.model``, takes the
    ``str_rep_no_space`` form of every item returned by ``get_vocab``,
    de-duplicates them, and keeps those accepted by
    ``is_valid_for_experiment`` under the configured ``min_word_len`` and
    ``filter_likely_gerunds`` settings.

    Returns:
        List[str]: the accepted strings in sorted order. The number of
        accepted strings is also printed.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        npn_config.model,
        use_fast=True,
        clean_up_tokenization_spaces=True,
    )
    vocab_items: List[VocabItem] = get_vocab(tokenizer)
    # A set removes vocabulary entries that share the same space-less form.
    unique_strings: Set[str] = {item.str_rep_no_space for item in vocab_items}

    def _accepted(candidate: str) -> bool:
        return is_valid_for_experiment(
            candidate,
            npn_config.min_word_len,
            npn_config.filter_likely_gerunds,
        )

    valid_nouns: List[str] = sorted(filter(_accepted, unique_strings))
    print(f"got possible nouns: {len(valid_nouns)}")
    return valid_nouns


all_nouns = get_all_valid_nouns()



roberta-large
got possible nouns: 7087


In [38]:
# sample infinigram count = 0 nouns
def sample(end_ct=108, max_tries=1000, seed=42, nouns=None, count_fn=None):
    """Randomly draw nouns and keep those whose "N prep N" strings all have count 0.

    For each newly drawn noun, the strings "<noun> <prep> <noun>" are looked
    up for the prepositions "to", "after", "upon" and "by", stopping at the
    first non-zero count. A noun is kept only if all four counts are zero.

    Draws use the same seeded ``choice`` sequence as before, but a noun that
    was already tested is skipped: it is neither queried again nor appended a
    second time, so the returned list contains no repeats.

    Args:
        end_ct: stop once this many zero-count nouns have been collected.
        max_tries: upper bound on the number of random draws (repeat draws
            count towards it).
        seed: seed for a private random generator; the global ``random``
            state is left untouched.
        nouns: candidate nouns; defaults to the notebook-level ``all_nouns``.
        count_fn: callable mapping a query string to its integer count;
            defaults to ``check_infinigram(..., retries=10, sleep_time=0.4).count``.

    Returns:
        List of distinct zero-count nouns in the order they were found
        (at most ``end_ct`` entries; empty if there are no candidates).
    """
    if nouns is None:
        nouns = all_nouns
    nouns = list(nouns)  # rng.choice needs an indexable sequence

    if count_fn is None:
        def _infinigram_count(tgt_str):
            return check_infinigram(tgt_str, retries=10, sleep_time=0.4).count
        count_fn = _infinigram_count

    preps = ("to", "after", "upon", "by")
    rng = random.Random(seed)
    n_unique = len(set(nouns))
    tested = set()
    valid = []

    for i in range(1, max_tries + 1):
        # Stop when enough nouns were found or every candidate was tested
        # (the latter also covers an empty candidate list).
        if len(valid) >= end_ct or len(tested) >= n_unique:
            break

        sampled_noun = rng.choice(nouns)
        print(i, sampled_noun)
        if sampled_noun in tested:
            # Drawn before: skip the redundant queries and avoid duplicates.
            continue
        tested.add(sampled_noun)

        # check if noun has 0 freq for all prepositions
        ct_list = []
        for prep in preps:
            prep_count = count_fn(f"{sampled_noun} {prep} {sampled_noun}")
            ct_list.append(prep_count)
            if prep_count != 0:
                break
        ct_okay = sum(1 for c in ct_list if c == 0)
        print(ct_list, ct_okay)

        if ct_okay == len(preps):
            valid.append(sampled_noun)
        print(len(valid))
    return valid

# Run the sampling loop with its defaults. Every tested noun issues up to four
# check_infinigram queries, and progress is printed per draw.
valid_nouns = sample()




1 presence
[28] 0
0
2 enthusiasm
[36] 0
0
3 frown
[2] 0
0
4 why
[5] 0
0
5 dozen
[19] 0
0
6 verse
[454] 0
0
7 borrower
[33] 0
0
8 execution
[257] 0
0
9 location
[12505] 0
0
10 dossier
[1] 0
0
11 stump
[375] 0
0
12 manufacture
[49] 0
0
13 pip
[61] 0
0
14 psychopath
[5] 0
0
15 runaway
[0, 0, 0, 0] 4
1
16 humidity
[1] 0
1
17 given
[19] 0
1
18 android
[68] 0
1
19 molecule
[1267] 0
1
20 whistleblower
[0, 26] 1
1
21 fundamentalist
[1] 0
1
22 costume
[37] 0
1
23 miracle
[49] 0
1
24 demand
[39] 0
1
25 hail
[4] 0
1
26 toxin
[11] 0
1
27 eldest
[7] 0
1
28 goblin
[15] 0
1
29 exhibit
[384] 0
1
30 principal
[196] 0
1
31 shower
[439] 0
1
32 eviction
[5] 0
1
33 pedestrian
[20] 0
1
34 adept
[16] 0
1
35 noble
[41] 0
1
36 crisis
[4976] 0
1
37 boot
[654] 0
1
38 dome
[107] 0
1
39 solder
[84] 0
1
40 intention
[17] 0
1
41 misery
[107] 0
1
42 peanut
[4] 0
1
43 superiority
[0, 0, 0, 0] 4
2
44 documentary
[2] 0
2
45 serial
[33] 0
2
46 membrane
[181] 0
2
47 reprint
[5] 0
2
48 slavery
[28] 0
2
49 ml
[4] 0
2
50 lit

In [58]:
# these were the outputs of the sampling cell (zero-count nouns), recorded by hand

# tested 1000
N_TESTED_RUN1 = 1000
nouns1 = [
    'amulet', 'judiciary', 'deterioration', 'haste', 'spinach', 'humanitarian',
    'bung', 'halftime', 'resettlement', 'upside', 'insecurity', 'soar',
    'drawback', 'needy', 'bravery', 'astronomy', 'pave', 'tact', 'pseud',
    'prostitution', 'damn', 'diction', 'basil', 'plethora', 'meanwhile',
    'brill', 'campaigner', 'livelihood', 'counterterrorism', 'confidentiality',
    'abstinence', 'compatibility', 'pornography', 'sabot', 'forfeiture',
    'petroleum', 'pul', 'pave', 'durability', 'viol', 'patriotism', 'cooper',
    'unwillingness', 'behest', 'anarchism', 'prosperity', 'vigilante',
    'prostitution', 'unemployed', 'displeasure', 'diction', 'cryptography',
    'despite', 'attrition', 'lieu', 'versatility', 'drib', 'sexist', 'leash',
    'impress', 'ana', 'attrition', 'delinquent', 'fuzz', 'excise', 'mater',
    'resultant', 'drib', 'efficacy', 'conservatism', 'versatility', 'sir',
    'spoil',
]
# tested 715
N_TESTED_RUN2 = 715
nouns2 = [
    'runaway', 'superiority', 'hindsight', 'separatist', 'prosperity', 'lam',
    'biotech', 'endeavour', 'plethora', 'maternity', 'leash', 'comprehensive',
    'ire', 'supremacist', 'folklore', 'patriarchy', 'diction', 'malnutrition',
    'midfield', 'nationalist', 'unwillingness', 'meanwhile', 'hygiene',
    'immortality', 'reconnaissance', 'populist', 'eccentric', 'dug', 'haste',
    'moot', 'brainstorm', 'chilly', 'sax', 'weaponry', 'vigilante',
]

# Keep the raw concatenation under its own name so the de-duplicated set
# below does not rebind a list variable to a different type.
all_nouns_zero_freq_with_repeats = nouns1 + nouns2
print(len(all_nouns_zero_freq_with_repeats))

# 14 were repeated (within a list or across the two lists)
all_nouns_zero_freq = set(all_nouns_zero_freq_with_repeats)
print(len(all_nouns_zero_freq))

# roughly 5% of the tested nouns have count 0 for every "N prep N" string
# (i.e. are unseen in the Pile); the numerator is derived from the data
# instead of being hard-coded.
# NOTE(review): the tested totals may include repeated draws, so treat this
# as an approximate rate.
print(len(all_nouns_zero_freq) / (N_TESTED_RUN1 + N_TESTED_RUN2))

108
94
0.05481049562682216


In [59]:
# Drop nouns we do not want in the generated dataset.
to_remove = [
    "prostitution",
    "pornography",
    "sexist",
    # 'damn' (left in)
]
all_nouns_zero_freq_clean = {
    noun for noun in all_nouns_zero_freq if noun not in to_remove
}
print(len(all_nouns_zero_freq_clean))


91


In [61]:

from proj.cxs_are_revealed.paper.proj_common.npn_dataset_generation.gpt_sample import run_gpt_and_write
import itertools


# adapted from make_npnP_dataset_with_gpt.py
def do_generate(nouns=None, preps=("to", "after", "by", "upon")):
    """Call ``run_gpt_and_write`` once per (noun, preposition) pair.

    Every call is given ``Exp4NPN.npn_gpt_outputs_v3_zero_freq`` as its third
    argument.

    Args:
        nouns: iterable of nouns to generate for; defaults to the
            notebook-level ``all_nouns_zero_freq_clean``.
        preps: prepositions paired with each noun.
    """
    if nouns is None:
        nouns = all_nouns_zero_freq_clean

    # The default noun collection is a set of strings, whose iteration order
    # can differ between interpreter runs (string hash randomization).
    # Sorting makes the order of calls reproducible.
    for n, p in itertools.product(sorted(nouns), preps):
        run_gpt_and_write(n, p, Exp4NPN.npn_gpt_outputs_v3_zero_freq)


do_generate()
