Generates vocabulary used to build PCFGs.

Most words are taken from the UD French-GSD corpus; additional epicene nouns and adjectives are taken from Wiktionary (via the wiktextract dump).

## Requirements

In [25]:
import json
import random
import tempfile
from collections import defaultdict
from string import punctuation
from urllib.request import urlretrieve

from conllu import parse

## Download UD data

In [3]:
# Download the UD French-GSD training split into a throw-away directory and parse it
# into `sentences`. The temporary copy is deleted when the `with` block exits.
with tempfile.TemporaryDirectory() as tmp_directory:
    conllu_path = f'{tmp_directory}/train.conllu'
    urlretrieve('https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/master/fr_gsd-ud-train.conllu', conllu_path)
    # Read explicitly as UTF-8: without it, open() falls back to the platform locale
    # encoding, which can corrupt or reject the accented French word forms.
    with open(conllu_path, 'r', encoding='utf-8') as f:
        data = f.read()
    sentences = parse(data)

('train.conllu', <http.client.HTTPMessage at 0x7f6b04307a90>)

## Select relevant words

In [16]:
# Harvest word forms from the UD sentences, grouped by POS tag (plus gender when annotated).

# The only value each morphological feature may take; tokens lacking the feature pass too.
required_feature_values = {
    'Number': 'Sing',    # only singular
    'Person': '3',       # only 3rd person
    'Tense': 'Pres',     # only present
    'Mood': 'Ind',       # only indicative
    'VerbForm': 'Fin',   # no gerunds, infinitives, etc.
}

sampled_vocab = defaultdict(set)
my_vocab = {}

for sent in sentences:
    for t in sent:
        # no idioms
        if t['misc'] and t['misc'].get('Idiom'):
            continue
        # no words containing punctuation
        if any(mark in t['form'] for mark in punctuation):
            continue
        # no token carrying a disallowed feature value
        feats = t['feats']
        if feats and any(feats.get(name) not in {wanted, None}
                         for name, wanted in required_feature_values.items()):
            continue

        # Bucket name is the POS tag, suffixed with the gender when one is annotated.
        gender = feats.get('Gender') if feats else None
        bucket = t['upos'] + gender if gender else t['upos']
        sampled_vocab[bucket].add(t['form'].lower())

# Report how many distinct forms each bucket collected.
for bucket, forms in sampled_vocab.items():
    print(f"{bucket}\t\t{len(forms)}")

ADV		518
ADP		59
DETMasc		14
NOUNMasc		4073
PRONMasc		25
VERB		1026
DETFem		13
NOUNFem		3264
AUX		6
_		9
PROPN		11694
PROPNFem		633
ADJMasc		1853
PRONFem		14
ADJFem		1792
CCONJ		15
SCONJ		10
PRON		13
DET		10
PROPNMasc		1014
NOUN		22
X		1379
NUMFem		1
PUNCT		4
SYM		44
NUM		56
ADVMasc		5
ADJ		40
INTJ		16
SYMMasc		3
XMasc		5
SYMFem		1
ADPMasc		2


In [26]:
# Build the working vocabulary (`my_vocab`) from the UD sample plus hand-picked closed classes.

# Store a copy, not the sampled set itself: otherwise any in-place update of
# my_vocab['VERB'] would also shrink sampled_vocab['VERB'], and re-running this cell
# would start from an already-pruned verb list.
my_vocab['VERB'] = set(sampled_vocab['VERB'])

# choosing them manually so I have the same five of each type
my_vocab['DETFem'] = {'cette', 'la', 'une', 'ma', 'ta'}
my_vocab['DETMasc'] = {'ce', 'le', 'un', 'mon', 'ton'}
my_vocab['DETEpic'] = {'quelque', 'votre', 'notre', 'chaque', 'leur'}

my_vocab['PREP'] = {'hors', 'contre', 'avant', 'pas', 'en', 'selon', 'après', 'alias', 'plus', 'à', 'jusque', 'concernant', 
                    'depuis', 'par', 'avec', 'entre', 'envers', 'durant', 'sans', 'dès', 'vers', 'courant', 'derrière', 'sous', 
                    'devant', 'de', 'pour', 'suivant', 'chez', 'dans', 'comme', 'via', 'outre', 'moyennant', 'sur'}

# Nouns seen with both genders, or with no gender annotation at all, are considered epicene.
noun_epicenes = sampled_vocab['NOUN'] | (sampled_vocab['NOUNFem'] & sampled_vocab['NOUNMasc'])
my_vocab['NOUNEpic'] = noun_epicenes
my_vocab['NOUNFem'] = sampled_vocab['NOUNFem'] - noun_epicenes    # remove epicenes
my_vocab['NOUNMasc'] = sampled_vocab['NOUNMasc'] - noun_epicenes  # remove epicenes

# Same rule for adjectives.
adj_epicenes = sampled_vocab['ADJ'] | (sampled_vocab['ADJFem'] & sampled_vocab['ADJMasc'])
my_vocab['ADJFem'] = sampled_vocab['ADJFem'] - adj_epicenes    # remove epicenes
my_vocab['ADJMasc'] = sampled_vocab['ADJMasc'] - adj_epicenes  # remove epicenes
my_vocab['ADJEpic'] = adj_epicenes

# Remove nouns and adjectives starting with a vowel (or 'h') to avoid handling elision.
# The list also covers 'ô', 'û', 'ù' and the ligatures 'œ'/'æ', so that words such as
# "œuvre" or "œil" are filtered out as well.
vowels = ('a', 'e', 'i', 'o', 'u', 'y', 'à', 'é', 'è', 'ê', 'â', 'î', 'ï', 'ü', 'ë', 'h',
          'ô', 'û', 'ù', 'œ', 'æ')
for key in ('NOUNEpic', 'NOUNFem', 'NOUNMasc', 'ADJEpic', 'ADJFem', 'ADJMasc'):
    # Rebinding an existing key keeps the dict's insertion order unchanged.
    my_vocab[key] = {w for w in my_vocab[key] if not w.startswith(vowels)}

for k in my_vocab:
    print(k)
    print(str(len(my_vocab[k])) + "\t" + str(my_vocab[k]))
    print()

VERB
855	{'ramène', 'édite', 'expire', 'affiche', 'retravaille', 'effleure', 'protège', 'autorise', 'étudie', 'adapte', 'fédère', 'réglemente', 'échange', 'impute', 'adhère', 'stipule', 'apparaît', 'imagine', 'inflige', 'relate', 'mange', 'vise', 'enlève', 'retient', 'desserre', 'surnomme', 'gradue', 'incarne', 'hérite', 'honore', 'parle', 'ensuit', 'comprend', 'contribue', 'entend', 'devient', 'nécessite', 'embellit', 'voici', 'augmente', 'dépasse', 'dit', 'prétend', 'félicite', 'entraîne', 'navigue', 'réjouit', 'répare', 'essuie', 'appartient', 'disparait', 'râle', 'clôt', 'écrit', 'justifie', 'sait', 'accède', 'sent', 'commente', 'signale', 'démérite', 'déclare', 'peaufine', 'retourne', 'coud', 'envoie', 'renforce', 'allie', 'pète', 'cumule', 'aperçoit', 'convient', 'plonge', 'jette', 'oppose', 'égalise', 'purifie', 'ajoute', 'détériore', 'exporte', 'approche', 'invite', 'escorte', 'milite', 'sécurise', 'assiège', 'essaime', 'plaît', 'restructure', 'monopolise', 'redonne', 'projette

## Add more epicene nouns and adjectives with wiktextract

In [8]:
with tempfile.TemporaryDirectory() as tmp_directory:
    !curl https://kaikki.org/dictionary/raw-wiktextract-data.json -o {tmp_directory}/raw-wiktextract-data.json
    with open(f"{tmp_directory}/raw-wiktextract-data.json", "r", encoding="utf-8") as f:
        for line in f: 
            data = json.loads(line)
            if data.get('lang_code') == 'fr' and data.get('word')[0].islower() and not ' ' in data.get('word'): # français, minuscule, sans espaces
                if data.get('pos') == 'adj':
                    if 'plural' in data.get('head_templates', [{}])[0].get('expansion', '') and 'feminine' not in data.get('head_templates', [{}])[0].get('expansion', ''):                        
                        my_vocab['ADJEpic'].add(data.get('word').lower())
                if data.get('pos') == 'noun':
                    for sense in data.get('senses'):
                        if all(tag in sense.get('tags', []) for tag in ['feminine', 'masculine', 'by-personal-gender']):
                            my_vocab['NOUNEpic'].add(data.get('word').lower())

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 12.9G  100 12.9G    0     0  55.5M      0  0:03:58  0:03:58 --:--:-- 45.7MM    0     0  47.1M      0  0:04:41  0:00:14  0:04:27 53.2M


In [29]:
# Remove duplicates across categories, then cap every category at a fixed size.
keys = ['ADJEpic', 'NOUNMasc', 'NOUNFem', 'ADJMasc', 'NOUNEpic', 'VERB', 'ADJFem', 'PREP', 'DETEpic', 'DETMasc', 'DETFem']

# Maximum number of words kept per category.
nwords = {'VERB':800, 'ADJMasc':1000, 'NOUNMasc':1700, 'ADJFem':1000, 'NOUNFem':1700, 'ADJEpic':1000, 
          'NOUNEpic':1700, 'PREP':35, 'DETEpic':5, 'DETMasc':5, 'DETFem':5}

# Seed for the down-sampling below, so the saved vocabulary is identical on every run.
SAMPLE_SEED = 0

# Work on fresh sets: the cell then also runs when my_vocab already holds lists
# (i.e. when it is executed a second time) and never mutates the input sets in place.
pools = {k: set(my_vocab[k]) for k in keys}

# A word present in several categories is kept only in the one listed last in `keys`.
for i, key in enumerate(keys):
    for later_key in keys[i + 1:]:
        pools[key].difference_update(pools[later_key])

# Convert to lists. Slicing list(set) would pick a subset that changes between
# interpreter sessions (string hashing is randomised), so sort first and, when a
# category is over its limit, draw a seeded random sample instead.
for k in my_vocab:
    words = sorted(pools[k])
    limit = nwords[k]
    if len(words) > limit:
        words = sorted(random.Random(SAMPLE_SEED).sample(words, limit))
    my_vocab[k] = words
    print(k, len(my_vocab[k]))

VERB 800
DETFem 5
DETMasc 5
DETEpic 5
PREP 35
NOUNEpic 1700
NOUNFem 1700
NOUNMasc 1700
ADJFem 1000
ADJMasc 1000
ADJEpic 1000


## Save vocabulary to json

In [30]:
# Write the finished vocabulary dictionary to disk as a single JSON document.
with open("full_vocabulary.json", "w") as outfile:
    serialized_vocab = json.dumps(my_vocab)
    outfile.write(serialized_vocab)