In [87]:
import json
from collections import deque

from lemminflect import getAllLemmas, getAllInflections
from nltk.corpus import wordnet as wn

In [41]:
def print_synsets(word):
    """Print every WordNet synset found for ``word`` and return them.

    For each synset, three lines are printed: its position in the result
    list followed by its name, then its definition, then a blank line.
    The printed position is the index into the returned list.

    Args:
        word: The word passed to ``wn.synsets``.

    Returns:
        The list returned by ``wn.synsets(word)``, in the printed order.
    """
    synsets = wn.synsets(word)
    for index, synset in enumerate(synsets):
        # Go through the public accessors; the underscore-prefixed
        # attributes are private to NLTK and not part of its API.
        print(index, synset.name())
        print(synset.definition())
        print()

    return synsets

In [42]:
# List the WordNet senses of "religion"; the printed indices are what
# get_all_words takes to select senses.
religion_synsets = print_synsets("religion")

0 religion.n.01
a strong belief in a supernatural power or powers that control human destiny

1 religion.n.02
an institution to express belief in a divine power



In [96]:
def get_all_lemmas(lemmas, words):
    """Add every surface form derived from ``lemmas`` to ``words``.

    Each lemma name is lower-cased and split on underscores, so a
    multi-word lemma contributes its individual tokens rather than the
    joined phrase. For every token the following are added to ``words``:

    * the token itself,
    * every form ``getAllLemmas`` reports for the token,
    * every form ``getAllInflections`` reports for the token, and
    * every form ``getAllLemmas`` reports for each of those inflections.

    Args:
        lemmas: Iterable of lemma objects exposing a ``name()`` method.
        words: Set to extend. It is mutated in place.

    Returns:
        The same ``words`` set that was passed in.
    """
    for lemma in lemmas:
        # Public accessor instead of the private ``_name`` attribute.
        for token in lemma.name().lower().split("_"):
            words.add(token)
            for lemma_forms in getAllLemmas(token).values():
                words.update(lemma_forms)
            for inflected_forms in getAllInflections(token).values():
                words.update(inflected_forms)
                # Lemmatise each inflection as well, so forms that are
                # only reachable through an inflected spelling are kept.
                for inflected in inflected_forms:
                    for lemma_forms in getAllLemmas(inflected).values():
                        words.update(lemma_forms)

    return words

def get_all_words(synsets, indices, verbose = True):
    """Collect the vocabulary of the selected synsets and all their hyponyms.

    Starting from each ``synsets[i]`` for ``i`` in ``indices``, the hyponym
    graph is walked breadth-first. The lemmas of every synset reached are
    expanded through ``get_all_lemmas`` into one shared set of words.

    Args:
        synsets: Indexable collection of synset objects exposing
            ``lemmas()`` and ``hyponyms()``.
        indices: Iterable of positions in ``synsets`` to use as roots.
        verbose: When true, print the number of words and ``display`` the
            sorted list.

    Returns:
        The collected words as a sorted list without duplicates.
    """
    words = set()
    # A synset can be reached through more than one hypernym path, and one
    # selected root may be a descendant of another. Expanding each synset
    # once avoids repeating its whole subtree and yields the same word set.
    visited = set()
    for i in indices:
        queue = deque([synsets[i]])
        while queue:
            synset = queue.popleft()  # O(1), unlike list.pop(0)
            if synset in visited:
                continue
            visited.add(synset)
            words = get_all_lemmas(synset.lemmas(), words)
            queue.extend(synset.hyponyms())

    words = sorted(words)
    if verbose:
        print(len(words))
        # ``display`` is not imported in this notebook; it relies on the
        # builtin that IPython/Jupyter injects into the kernel namespace.
        display(words)

    return words

In [97]:
# Indices 0 and 1 are the two senses in the recorded print_synsets output
# (religion.n.01 and religion.n.02), so both are expanded.
religion_words = get_all_words(religion_synsets, [0, 1])

296


['abecedarian',
 'adventism',
 'albigenses',
 'albigensianism',
 'amish',
 'anabaptism',
 'analogies',
 'analogy',
 'anglican',
 'anglicanism',
 'anglo-catholicism',
 'apophatism',
 'apostolic',
 'appear',
 'appearing',
 'armenian',
 'arminianism',
 'asian',
 'augustinian',
 'austin',
 'bahaism',
 'baptistic',
 'belief',
 'beliefs',
 'believer',
 'believers',
 'benedict',
 'benedictine',
 'brahmanism',
 'brahminism',
 'brethren',
 'buddhism',
 'byzantine',
 'calvinism',
 'canon',
 'canons',
 'cargo',
 'cargoes',
 'cargos',
 'carmel',
 'carmelite',
 'carthusian',
 'cataphatism',
 'cathari',
 'catharism',
 'cathars',
 'catholic',
 'catholicism',
 'catholicity',
 'chabad',
 'chasidim',
 'chasidism',
 'chassidim',
 'chassidism',
 'chiao',
 "christ's",
 'christian',
 'christianity',
 'church',
 'churches',
 'congregationalism',
 'consciousness',
 'consciousnesses',
 'conservative',
 'conservatives',
 'coptic',
 'cult',
 'cults',
 'cultus',
 'darsana',
 'doctrine',
 'doctrines',
 'dominican'

In [98]:
# Use a context manager so the file handle is closed (and the JSON flushed
# to disk) as soon as the dump finishes, instead of leaking the handle.
with open("masking_words/religion.json", "w") as fp:
    json.dump(religion_words, fp, indent = 4)

In [99]:
# List the WordNet senses of "race"; the printed indices are what
# get_all_words takes to select senses.
race_synsets = print_synsets("race")

0 race.n.01
any competition

1 race.n.02
a contest of speed

2 race.n.03
people who are believed to belong to the same genetic stock

3 subspecies.n.01
(biology) a taxonomic group that is a division of a species; usually arises as a consequence of geographical isolation within a species

4 slipstream.n.01
the flow of air that is driven backwards by an aircraft propeller

5 raceway.n.01
a canal for a current of water

6 rush.v.01
move fast

7 race.v.02
compete in a race

8 race.v.03
to work as fast as possible towards a goal, sometimes in competition with others

9 race.v.04
cause to move fast or to rush or race



In [100]:
# Index 2 is race.n.03 in the recorded print_synsets output ("people who
# are believed to belong to the same genetic stock"); the competition,
# biology, airflow, waterway and verb senses are left out.
race_words = get_all_words(race_synsets, [2])

51


['amerindian',
 'black',
 'blacked',
 'blacker',
 'blackest',
 'blacking',
 'blacks',
 'caucasian',
 'caucasians',
 'caucasoid',
 'color',
 'colored',
 'coloring',
 'colors',
 'colour',
 'coloured',
 'colouring',
 'colours',
 'herrenvolk',
 'indian',
 'master',
 'mastered',
 'mastering',
 'masters',
 'mongolian',
 'mongoloid',
 'negro',
 'negroid',
 'negroids',
 'of',
 'people',
 'peopled',
 'peoples',
 'peopling',
 'race',
 'raced',
 'races',
 'racing',
 'slavic',
 'white',
 'whited',
 'whiter',
 'whites',
 'whitest',
 'whiting',
 'yellow',
 'yellowed',
 'yellower',
 'yellowest',
 'yellowing',
 'yellows']

In [101]:
# Use a context manager so the file handle is closed (and the JSON flushed
# to disk) as soon as the dump finishes, instead of leaking the handle.
with open("masking_words/race.json", "w") as fp:
    json.dump(race_words, fp, indent = 4)

In [102]:
# List the WordNet senses of "gender"; the printed indices are what
# get_all_words takes to select senses.
gender_synsets = print_synsets("gender")

0 gender.n.01
a grammatical category in inflected languages governing the agreement between nouns and pronouns and adjectives; in some languages it is quite arbitrary but in Indo-European languages it is usually based on sex or animateness

1 sex.n.04
the properties that distinguish organisms on the basis of their reproductive roles



In [103]:
# Indices 0 and 1 are the two senses in the recorded print_synsets output
# (gender.n.01 and sex.n.04), so both are expanded.
# NOTE(review): gender.n.01 is the grammatical-category sense; presumably
# that is where words such as "grammatical" and "neuter" come from --
# confirm it is meant to be included.
gender_words = get_all_words(gender_synsets, [0, 1])

30


['androgyny',
 'bisexuality',
 'femaleness',
 'feminine',
 'feminineness',
 'feminines',
 'gender',
 'gendered',
 'gendering',
 'genders',
 'grammatical',
 'hermaphroditism',
 'maleness',
 'malenesses',
 'masculine',
 'masculines',
 'masculinities',
 'masculinity',
 'neuter',
 'neutered',
 'neutering',
 'neuters',
 'sex',
 'sexed',
 'sexes',
 'sexing',
 'sexualities',
 'sexuality',
 'virilities',
 'virility']

In [104]:
# Use a context manager so the file handle is closed (and the JSON flushed
# to disk) as soon as the dump finishes, instead of leaking the handle.
with open("masking_words/gender.json", "w") as fp:
    json.dump(gender_words, fp, indent = 4)