In [1]:
import pymongo
from pathlib import Path
import os
from dotenv import load_dotenv

In [2]:
from fastprogress import progress_bar

In [12]:
pth = Path('.').joinpath('./mongo.env')
load_dotenv(pth)

True

## UTILS

In [3]:
all_keys = ['abbreviations',
 'antonyms',
 'categories',
 'compounds',
 'coordinate_terms',
 'derived',
 'forms',
 'heads',
 'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
 'inflection',
 'inflection_of',
 'lang',
 'lang_code',
 'meronyms',
 'pos',
 'proverbs',
 'related',
 'senses',
 'sounds',
 'synonyms',
 'translations',
 'troponyms',
 'wikipedia',
 'word']

def keep_desired_keys(*keys):
    return {
     k:0   for k in all_keys if k not in keys
    }

In [37]:
def query_words_by_key(key_name):
    return list(dict_col.find({
    key_name:{
        '$exists':True
    }
},{ **keep_desired_keys(key_name,'word') }))


def len_distribution(data, key_name):
    distr = {}
    samples = {}
    for h in data:
        len_h = len(h[key_name])
        if not distr.get(len_h):
            distr[len_h] = 1
            samples[len_h] = h
        else:
            distr[len_h] +=1
    d2 = {}       
    for k in sorted(distr.keys()):
        d2[k] = distr[k]
    return d2, samples

def get_uniq_tags(data, key_name):
    res = set()
    for item in data:
        res.update(item[key_name])
    return list(res)

def get_uniq_keys(arr):
    keys = set()
    samples = {}
    distr = {}
    for o in arr:
        _keys = list(o.keys())
        keys.update(_keys)
        for key in _keys:
            val = samples.get(key)
            if val is None:
                samples[key] = [o[key]]
            elif len(val) < 3:
                samples[key].append(o[key])
                
            if distr.get(key) is None:
                distr[key] = 1
            else:
                distr[key] += 1
    
    return list(keys), samples,distr

In [13]:
mongo_username, mongo_password = os.getenv('MONGO_INITDB_ROOT_USERNAME'), os.getenv('MONGO_INITDB_ROOT_PASSWORD')
conn = pymongo.MongoClient(
    host='localhost',
    port=27017,
    username=mongo_username,
    password=mongo_password
)

In [14]:
db_name = 'chartreuse'
dict_db = conn[db_name]
col_name = 'wiktionary'
dict_col = dict_db[col_name]

In [5]:
dict_pth = Path('.')/'wiktionary'/'dictionary-English.json'
dict_sense_pth = Path('.')/'wiktionary'/'English-all-senses.json'
dict_pth,dict_sense_pth

(PosixPath('wiktionary/dictionary-English.json'),
 PosixPath('wiktionary/English-all-senses.json'))

In [63]:
import json

In [None]:
def all_keys(pth):
    keys = set()
    with open(pth) as f:
        for line in f:
            json1 = json.loads(line)
            keys.update(json1.keys())
    return keys

keys = all_keys(dict_sense_pth)
keys

In [44]:
def count_key_lens(pth):
    max_dic = {}
    min_dic = {}
    key_lens_dic = {}
    with open(pth) as f:
        for line in f:
            json1 = json.loads(line)
            key_len = len(json1.keys())
#             if max_dic is None:
#                 max_dic = json1
#             if min_dic is None:
#                 min_dic = json1

            max_len = len(max_dic.keys())
            min_len = len(min_dic.keys())

            if key_len > max_len:
                max_dic = json1
            if min_len == 0 or key_len < min_len:
                min_dic = json1

            if key_lens_dic.get(key_len, None) is None:
                key_lens_dic[key_len] = 1
            else:
                key_lens_dic[key_len] +=1
    
    return max_dic, min_dic, key_lens_dic

In [45]:
max_dic, min_dic, key_lens = count_key_lens(dict_pth)

In [46]:
s_max_dic, s_min_dic, s_key_lens = count_key_lens(dict_sense_pth)

In [50]:
len(max_dic.keys()),max_dic.keys()

(18,
 dict_keys(['pos', 'heads', 'forms', 'antonyms', 'wikipedia', 'sounds', 'hyphenation', 'categories', 'word', 'lang', 'lang_code', 'translations', 'synonyms', 'hyponyms', 'holonyms', 'derived', 'related', 'senses']))

In [49]:
max_dic

{'pos': 'noun',
 'heads': [{'1': '~', '2': 'zeroes', '3': 's', 'template_name': 'en-noun'}],
 'forms': [{'form': 'zeroes', 'tags': ['plural']},
  {'form': 'zeros', 'tags': ['plural']}],
 'antonyms': [{'word': 'pole',
   'sense': "value of a function's variables at zero"}],
 'wikipedia': ['zero'],
 'sounds': [{'ipa': '/ˈzɪəɹəʊ/',
   'tags': ['UK', 'General New Zealand', 'General Australian']},
  {'ipa': '/ˈzɪɹ(ˌ)oʊ/', 'tags': ['US']},
  {'ipa': '/ˈzi(ˌ)ɹoʊ/', 'tags': ['US']},
  {'rhymes': '-ɪəɹəʊ'},
  {'rhymes': '-iːɹəʊ'},
  {'audio': 'en-us-zero.ogg', 'tags': ['US'], 'text': 'Audio (US)'},
  {'audio': 'en-au-zero.ogg', 'tags': ['AU'], 'text': 'Audio (AU)'},
  {'enpr': "zîrʹō(')"}],
 'hyphenation': ['zero'],
 'categories': ['English basic words',
  'English numerals',
  'Romanian numerals',
  'Zero'],
 'word': 'zero',
 'lang': 'English',
 'lang_code': 'en',
 'translations': [{'lang': 'American Sign Language',
   'code': None,
   'sense': 'numeric symbol of zero',
   'word': 'see last ro

In [52]:
len(min_dic.keys()), min_dic.keys(), min_dic

(5,
 dict_keys(['pos', 'word', 'lang', 'lang_code', 'senses']),
 {'pos': 'noun',
  'word': 'pop culture',
  'lang': 'English',
  'lang_code': 'en',
  'senses': [{'glosses': ['popular culture'],
    'wikipedia': ['pop culture'],
    'id': 'pop_culture-noun'}]})

In [54]:
len(s_max_dic.keys()), s_max_dic.keys()

(18,
 dict_keys(['pos', 'heads', 'forms', 'antonyms', 'wikipedia', 'sounds', 'hyphenation', 'categories', 'word', 'lang', 'lang_code', 'translations', 'synonyms', 'hyponyms', 'holonyms', 'derived', 'related', 'senses']))

In [55]:
s_max_dic

{'pos': 'noun',
 'heads': [{'1': '~', '2': 'zeroes', '3': 's', 'template_name': 'en-noun'}],
 'forms': [{'form': 'zeroes', 'tags': ['plural']},
  {'form': 'zeros', 'tags': ['plural']}],
 'antonyms': [{'word': 'pole',
   'sense': "value of a function's variables at zero"}],
 'wikipedia': ['zero'],
 'sounds': [{'ipa': '/ˈzɪəɹəʊ/',
   'tags': ['UK', 'General New Zealand', 'General Australian']},
  {'ipa': '/ˈzɪɹ(ˌ)oʊ/', 'tags': ['US']},
  {'ipa': '/ˈzi(ˌ)ɹoʊ/', 'tags': ['US']},
  {'rhymes': '-ɪəɹəʊ'},
  {'rhymes': '-iːɹəʊ'},
  {'audio': 'en-us-zero.ogg', 'tags': ['US'], 'text': 'Audio (US)'},
  {'audio': 'en-au-zero.ogg', 'tags': ['AU'], 'text': 'Audio (AU)'},
  {'enpr': "zîrʹō(')"}],
 'hyphenation': ['zero'],
 'categories': ['English basic words',
  'English numerals',
  'Romanian numerals',
  'Zero'],
 'word': 'zero',
 'lang': 'English',
 'lang_code': 'en',
 'translations': [{'lang': 'American Sign Language',
   'code': None,
   'sense': 'numeric symbol of zero',
   'word': 'see last ro

In [56]:
len(s_min_dic.keys()), s_min_dic.keys(), s_min_dic

(5,
 dict_keys(['pos', 'word', 'lang', 'lang_code', 'senses']),
 {'pos': 'noun',
  'word': 'pop culture',
  'lang': 'English',
  'lang_code': 'en',
  'senses': [{'glosses': ['popular culture'],
    'wikipedia': ['pop culture'],
    'id': 'pop_culture-noun'}]})

In [58]:
key_lens, s_key_lens

({6: 564769,
  10: 17487,
  7: 315187,
  8: 69862,
  13: 3379,
  9: 31586,
  11: 10084,
  12: 6272,
  14: 1400,
  15: 483,
  16: 123,
  18: 6,
  17: 36,
  5: 450},
 {6: 564769,
  10: 17487,
  7: 315187,
  8: 69862,
  13: 3379,
  9: 31586,
  11: 10084,
  12: 6272,
  14: 1400,
  15: 483,
  16: 123,
  18: 6,
  17: 36,
  5: 450})

In [59]:
i = 0
for v in key_lens.values():
    i+= v
i

1021124

In [16]:
from tqdm import tqdm
import json
from datetime import datetime
with open(dict_sense_pth) as f:
    for line in tqdm(f):
        loaded = json.loads(line)
        dict_col.insert_one({
            **loaded,
        })

1021124it [16:12, 1050.28it/s]


 ## HEADS
 
 remove

In [26]:
## words with heads
lis = list(dict_col.find({
    'heads':{
        '$exists':True
    }
},{ **keep_desired_keys('heads','word') }))
len(lis)

1020235

In [27]:
lis[:10]

[{'_id': ObjectId('60544a9311fe31d283dedc17'),
  'heads': [{'template_name': 'en-proper noun'}],
  'word': 'GNU FDL'},
 {'_id': ObjectId('60544a9311fe31d283dedc18'),
  'heads': [{'template_name': 'en-proper noun'}],
  'word': 'Pope Julius'},
 {'_id': ObjectId('60544a9311fe31d283dedc19'),
  'heads': [{'1': 's', '2': 'encyclopaediae', 'template_name': 'en-noun'}],
  'word': 'encyclopaedia'},
 {'_id': ObjectId('60544a9311fe31d283dedc1a'),
  'heads': [{'1': 'en', '2': 'noun form', 'template_name': 'head'}],
  'word': 'pies'},
 {'_id': ObjectId('60544a9311fe31d283dedc1b'),
  'heads': [{'1': 'en', '2': 'verb form', 'template_name': 'head'}],
  'word': 'pies'},
 {'_id': ObjectId('60544a9311fe31d283dedc1c'),
  'heads': [{'sg': 'current event', 'template_name': 'en-plural noun'},
   {'1': 'en',
    '2': 'noun',
    'cat2': 'pluralia tantum',
    'head': '',
    'sort': '',
    'g': 'p',
    '3': '',
    '4': '',
    '5': 'normally plural',
    '6': '',
    '7': 'singular',
    '8': 'current eve

In [29]:
heads_greater_than_1 = [
    h
    for h in lis
    if len(h.get('heads',[])) > 1
]

In [34]:
heads_len_distribution = {}

for h in lis:
    len_h = len(h['heads'])
    if not heads_len_distribution.get(len_h):
        heads_len_distribution[len_h] = 1
    else:
        heads_len_distribution[len_h] +=1

heads_len_distribution

{1: 1010287, 2: 9948}

In [30]:
len(heads_greater_than_1)

9948

In [35]:
heads_greater_than_1[5:10]

[{'_id': ObjectId('60544a9311fe31d283dedcef'),
  'heads': [{'template_name': 'en-prep'},
   {'1': 'en', '2': 'prepositions', 'head': '', 'template_name': 'head'}],
  'word': 'abaft'},
 {'_id': ObjectId('60544a9311fe31d283dedd04'),
  'heads': [{'template_name': 'en-plural noun'},
   {'1': 'en',
    '2': 'noun',
    'cat2': 'pluralia tantum',
    'head': '',
    'sort': '',
    'g': 'p',
    '3': 'plural only',
    '4': '',
    '5': '',
    '6': '',
    '7': '',
    '8': '{{{sg}}}',
    'template_name': 'head'}],
  'word': 'abdominalia'},
 {'_id': ObjectId('60544a9311fe31d283dedd18'),
  'heads': [{'template_name': 'en-prep'},
   {'1': 'en', '2': 'prepositions', 'head': '', 'template_name': 'head'}],
  'word': 'abeam'},
 {'_id': ObjectId('60544a9311fe31d283dedd25'),
  'heads': [{'template_name': 'en-prefix'},
   {'1': 'en',
    '2': 'prefix',
    'head': '',
    'sort': '',
    'template_name': 'head'}],
  'word': 'a-'},
 {'_id': ObjectId('60544a9311fe31d283dedd26'),
  'heads': [{'templat

## keys missing in words

In [68]:
keys = ['abbreviations',
 'antonyms',
 'categories',
 'compounds',
 'coordinate_terms',
 'derived',
 'forms',
 'heads',
 'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
 'inflection',
 'inflection_of',
 'lang',
 'lang_code',
 'meronyms',
 'pos',
 'proverbs',
 'related',
 'senses',
 'sounds',
 'synonyms',
 'translations',
 'troponyms',
 'wikipedia',
 'word']

In [None]:
total_docs = 1021124
res = {}
for key in keys:
    cnt = dict_col.count_documents({
        key:{
            '$exists':True
        }
    })
    res[key] = {
         cnt: cnt,
        'missing': total_docs - cnt
    }
    if cnt < total_docs:
        print(key, cnt)

In [70]:
res

{'abbreviations': {107: 107, 'missing': 1021017},
 'antonyms': {9956: 9956, 'missing': 1011168},
 'categories': {81619: 81619, 'missing': 939505},
 'compounds': {2: 2, 'missing': 1021122},
 'coordinate_terms': {1646: 1646, 'missing': 1019478},
 'derived': {21125: 21125, 'missing': 999999},
 'forms': {387771: 387771, 'missing': 633353},
 'heads': {1020235: 1020235, 'missing': 889},
 'holonyms': {74: 74, 'missing': 1021050},
 'hypernyms': {935: 935, 'missing': 1020189},
 'hyphenation': {11462: 11462, 'missing': 1009662},
 'hyponyms': {1600: 1600, 'missing': 1019524},
 'inflection': {172: 172, 'missing': 1020952},
 'inflection_of': {9: 9, 'missing': 1021115},
 'lang': {1021124: 1021124, 'missing': 0},
 'lang_code': {1021124: 1021124, 'missing': 0},
 'meronyms': {92: 92, 'missing': 1021032},
 'pos': {1021124: 1021124, 'missing': 0},
 'proverbs': {1: 1, 'missing': 1021123},
 'related': {24297: 24297, 'missing': 996827},
 'senses': {1021124: 1021124, 'missing': 0},
 'sounds': {114966: 114966

In [50]:
basic_keys = ['pos', 'word', 'lang', 'lang_code', 'senses']
alternative_keys = [
 'forms',
 'heads',
 'antonyms',
 'wikipedia'
 'abbreviations',
 'categories',
 'compounds',
 'coordinate_terms',
 'derived',
 'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
 'inflection',
 'inflection_of',
 'meronyms',
 'proverbs',
 'related',
 'sounds',
 'synonyms',
 'translations',
 'troponyms',]

## forms

keep

forms  string[]

form_data  object[]

In [10]:
lis = query_words_by_key('forms')
len(lis)

387771

In [54]:
lis[:5]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'forms': [{'form': 'encyclopaedias', 'tags': ['plural']},
   {'form': 'encyclopaediae', 'tags': ['plural']}],
  'word': 'encyclopaedia'},
 {'_id': ObjectId('60544a9311fe31d283dedc1c'),
  'forms': [{'form': 'current event', 'tags': ['singular']}],
  'word': 'current events'},
 {'_id': ObjectId('60544a9311fe31d283dedc1e'),
  'forms': [{'form': 'livres', 'tags': ['plural']}],
  'word': 'livre'},
 {'_id': ObjectId('60544a9311fe31d283dedc1f'),
  'forms': [{'form': 'multiculturalisms', 'tags': ['plural']}],
  'word': 'multiculturalism'},
 {'_id': ObjectId('60544a9311fe31d283dedc20'),
  'forms': [{'form': 'books', 'tags': ['plural']}],
  'word': 'book'}]

In [59]:
distribution,samples = len_distribution('forms')
distribution

{2: 70340,
 1: 273405,
 4: 37692,
 3: 4468,
 6: 605,
 5: 721,
 7: 429,
 9: 17,
 11: 4,
 8: 65,
 10: 20,
 12: 3,
 24: 1,
 14: 1}

In [61]:
samples

{2: {'_id': ObjectId('60544a9311fe31d283dedc19'),
  'forms': [{'form': 'encyclopaedias', 'tags': ['plural']},
   {'form': 'encyclopaediae', 'tags': ['plural']}],
  'word': 'encyclopaedia'},
 1: {'_id': ObjectId('60544a9311fe31d283dedc1c'),
  'forms': [{'form': 'current event', 'tags': ['singular']}],
  'word': 'current events'},
 4: {'_id': ObjectId('60544a9311fe31d283dedc21'),
  'forms': [{'form': 'books',
    'tags': ['present', 'simple', 'singular', 'third-person']},
   {'form': 'booking', 'tags': ['participle', 'present']},
   {'form': 'booked', 'tags': ['past', 'simple']},
   {'form': 'booked', 'tags': ['participle', 'past']}],
  'word': 'book'},
 3: {'_id': ObjectId('60544a9311fe31d283dedc2b'),
  'forms': [{'form': 'rains cats and dogs',
    'tags': ['present', 'simple', 'singular', 'third-person']},
   {'form': 'raining cats and dogs', 'tags': ['participle', 'present']},
   {'form': 'and past participle rained cats and dogs',
    'tags': ['past', 'simple']}],
  'word': 'rain cat

In [13]:
form_tags = set()

for item in lis:
    for f in item['forms']:
        form_tags.update(f['tags'])
form_tags

{'US',
 'abbreviation',
 'accusative',
 'adjective',
 'adverb',
 'also',
 'alternative',
 'canonical',
 'capitalized',
 'chiefly',
 'comparative',
 'countable',
 'error-unknown-tag',
 'feminine',
 'first-person',
 'initialism',
 'lower-case',
 'masculine',
 'neuter',
 'nominative',
 'noun',
 'objective',
 'oblique',
 'often',
 'ordinal',
 'participle',
 'past',
 'plural',
 'positive',
 'possessive',
 'possessive-determiner',
 'present',
 'pronoun',
 'rare',
 'reflexive',
 'second-person',
 'simple',
 'singular',
 'sometimes',
 'specifically',
 'subjective',
 'superlative',
 'third-person',
 'transitive',
 'uncountable',
 'upper-case',
 'usually'}

In [11]:
def get_tag_examples(data,tags):
    res = {}
    for item in data:
        forms = item['forms']
        for f in forms:
            _tags = f['tags']
            t = _tags[0]
            if res.get(t) is None:
                res[t] = [item]
            elif len(res[t]) < 3:
                res[t].append(item)
    return res

In [14]:
tag_examples = get_tag_examples(lis, form_tags)

In [15]:
tag_examples

{'plural': [{'_id': ObjectId('60544a9311fe31d283dedc19'),
   'forms': [{'form': 'encyclopaedias', 'tags': ['plural']},
    {'form': 'encyclopaediae', 'tags': ['plural']}],
   'word': 'encyclopaedia'},
  {'_id': ObjectId('60544a9311fe31d283dedc19'),
   'forms': [{'form': 'encyclopaedias', 'tags': ['plural']},
    {'form': 'encyclopaediae', 'tags': ['plural']}],
   'word': 'encyclopaedia'},
  {'_id': ObjectId('60544a9311fe31d283dedc1e'),
   'forms': [{'form': 'livres', 'tags': ['plural']}],
   'word': 'livre'}],
 'singular': [{'_id': ObjectId('60544a9311fe31d283dedc1c'),
   'forms': [{'form': 'current event', 'tags': ['singular']}],
   'word': 'current events'},
  {'_id': ObjectId('60544a9411fe31d283dee074'),
   'forms': [{'form': 'simple present semes',
     'tags': ['singular', 'third-person']},
    {'form': 'seming', 'tags': ['participle', 'present']}],
   'word': 'seme'},
  {'_id': ObjectId('60544a9511fe31d283dee2b2'),
   'forms': [{'form': 'bollock', 'tags': ['singular']}],
   'word

## antonyms

keep

antonyms string[]

antonyms_missing string[]

In [18]:
from fastprogress import progress_bar

In [39]:
data = query_words_by_key('antonyms')
len(data)

9956

In [42]:
all_ans = sum([o['antonyms'] for o in data], [])
an_keys, samples, distr = get_uniq_keys(all_ans)
an_keys, samples, distr

(['topics', 'translation', 'sense', 'word', 'tags'],
 {'word': ['denotation', 'adaxial', 'aggrandizement'],
  'sense': ['to lower so as to cause pain or hurt feelings',
   'to lower so as to cause pain or hurt feelings',
   'to lower so as to cause pain or hurt feelings'],
  'tags': [['B.C.E.'], ['multiply by four'], ['life drive']],
  'translation': ['life drive',
   'language in use',
   'at a time in advance of the usual'],
  'topics': [['language', 'linguistics'],
   ['time', 'property'],
   ['geometry', 'mathematics']]},
 {'word': 15521, 'sense': 4338, 'tags': 530, 'translation': 139, 'topics': 78})

In [47]:
def check_if_antonyms_exists(data, extract_words):
    re = []
    
    existing_words = list(dict_col.find({}, keep_desired_keys('word')))
    print(len(existing_words))
    existing_words = set(
        [w['word'] for w in existing_words]
    )
    for item in progress_bar(data):
        words = extract_words(item)
        for word in words:
            if word not in existing_words:
                print('{},{}'.format(
                    item['word'],
                    word
                ))
                re.append(
                    (item['word'], word)
                )
    return re

In [24]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc34'),
  'antonyms': [{'word': 'denotation'}],
  'word': 'connotation'},
 {'_id': ObjectId('60544a9311fe31d283dedd0c'),
  'antonyms': [{'word': 'adaxial'}],
  'word': 'abaxial'},
 {'_id': ObjectId('60544a9311fe31d283dedd0f'),
  'antonyms': [{'word': 'aggrandizement'},
   {'word': 'dignity'},
   {'word': 'elevation'},
   {'word': 'exaltation'},
   {'word': 'honor'},
   {'word': 'promotion'},
   {'word': 'reputation'},
   {'word': 'repute'},
   {'word': 'standing'},
   {'word': 'supremacy'}],
  'word': 'abasement'}]

In [25]:
def get_antonyms(item):
    return [i['word'] for i in item['antonyms']]

In [48]:
res = check_if_antonyms_exists(data, get_antonyms)

1021124


head,sharp end
abduction,adductionreplacement
aberrant,see also Thesaurus:normal
aberrant,see also Thesaurus:virtuous
academically,non-academically
abstemious,See also Thesaurus:excessive
on,afterward/afterwards
accidentally,by choice
accidentally,by design
count noun,non-count noun
count noun,uncountable noun
AD,A.C. (ante Christum)
AD,b.c.
AD,B.C. (before Christ)
proper noun,common noun (common name)
proper noun,appellative noun (nomen appellativum)
BC,a.d.
BC,A.D. (anno Domini, Anno Domini, in the year of our Lord)
BC,CE (Christian Era, Common Era, Current Era)
cod,See Thesaurus:bad
everybody,Logically negates to not everybody.
singular,see also Thesaurus:generic
deficit,superavit
slow,see also Thesaurus:speedy
slow,see also Thesaurus:sudden
slow,see also Thesaurus:intelligent
slow,see also Thesaurus:prompt
slow,see also Thesaurus:active
increase,plummet (rapidly)
increase,plunge (rapidly)
increase,See also Thesaurus:decrease
increase,See also Thesaurus:diminish
increase,See also Th

producible,improducible
mind-bending,mind-expanding
illuminable,See also Thesaurus:incomprehensible
value bet,to bluff
value raise,to bluff
indivisibility,divisibility（state capable of being divided）
indivisibility,uncombinability（state incapable of being combined）
indivisibility,unmergeability（state incapable of being merged）
insobriety,see also Thesaurus:drunkenness#Antonyms
ready-made,See also: Thesaurus:custom-made
exergy,bound energy
soft key,hard key
soft key,high key
autotelic,non-autotelic
autotelic,nonautotelic
autotelic,non-autotelic
autotelic,nonautotelic
in one's right mind,See also Thesaurus:insane.
economies of scale,diseconomies of scale
finiteness,see also Thesaurus:infinity
-to-be,then-
gender-neutral,gender-specific
apodictic,assertorical
adverse effect,desired effect
in general,see also Thesaurus:specifically
poikilotherm,homoiotherm / homeotherm / homeothermic
slow oven,quick oven
basic research,applied research
digital signal,analogue signal
orientable,non-orientab

In [50]:
len(res) # 1112 antonyms missing

1112

## wikipedia

remove

In [51]:
data = query_words_by_key('wikipedia')
len(data)

46453

In [54]:
data[:4]

[{'_id': ObjectId('60544a9311fe31d283dedc1e'),
  'word': 'livre',
  'wikipedia': ['livre']},
 {'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'wikipedia': ['book']},
 {'_id': ObjectId('60544a9311fe31d283dedc21'),
  'word': 'book',
  'wikipedia': ['book']},
 {'_id': ObjectId('60544a9311fe31d283dedc22'),
  'word': 'book',
  'wikipedia': ['book']}]

In [55]:
all_tags = set()
for item in data:
    all_tags.update(item['wikipedia'])
    
len(all_tags)

36513

In [57]:
all_tags = list(all_tags)
all_tags[:10]

['ameloblastoma',
 'Mexican',
 'waxing',
 'sedan',
 'saffron',
 'alcohol',
 'bluebird',
 'mercenary',
 'playlist',
 'fleet']

In [60]:
distribution,samples = len_distribution(data,'wikipedia')

In [61]:
distribution

{1: 42272,
 3: 599,
 2: 3243,
 4: 222,
 9: 2,
 7: 13,
 6: 23,
 17: 2,
 14: 2,
 5: 71,
 30: 1,
 8: 2,
 13: 1}

In [62]:
samples

{1: {'_id': ObjectId('60544a9311fe31d283dedc1e'),
  'word': 'livre',
  'wikipedia': ['livre']},
 3: {'_id': ObjectId('60544a9311fe31d283dedc24'),
  'wikipedia': ['Jabberwocky', 'Lewis Carroll', 'portmanteau'],
  'word': 'portmanteau'},
 2: {'_id': ObjectId('60544a9311fe31d283dedc4d'),
  'wikipedia': ['Word (computer architecture)', 'word'],
  'word': 'word'},
 4: {'_id': ObjectId('60544a9311fe31d283dedc74'),
  'wikipedia': ['Eternal September',
   'September',
   'September Group',
   'September Massacres'],
  'word': 'September'},
 9: {'_id': ObjectId('60544a9411fe31d283dedfda'),
  'wikipedia': ['Avatar (Hinduism)',
   'Chip Morningstar',
   'LucasArts',
   'Neal Stephenson',
   'Randy Farmer',
   'Shadowrun',
   'Snow Crash',
   'Ultima IV: Quest of the Avatar',
   'pen and paper'],
  'word': 'avatar'},
 7: {'_id': ObjectId('60544a9411fe31d283dee022'),
  'wikipedia': ['A Dictionary of the English Language',
   'Charles du Fresne, sieur du Cange#Works',
   'Color',
   'Noah Webster',


## abbreviations

keep

abbreviations   string[]

In [33]:
data = query_words_by_key('abbreviations')
len(data)

107

In [76]:
data[:3]

[{'_id': ObjectId('60544aa111fe31d283defc6f'),
  'abbreviations': [{'word': 'Some translation dictionaries have used the abbreviation s.o. or so for someone.'}],
  'word': 'someone'},
 {'_id': ObjectId('60544aa911fe31d283df0ee4'),
  'abbreviations': [{'word': '24th'}, {'word': '24^th'}],
  'word': 'twenty-fourth'},
 {'_id': ObjectId('60544aa911fe31d283df0eea'),
  'abbreviations': [{'word': '22nd'}, {'word': '22^nd'}],
  'word': 'twenty-second'}]

In [77]:
distribution, samples = len_distribution(data, 'abbreviations')

In [78]:
distribution

{1: 23, 2: 77, 3: 6, 4: 1}

In [79]:
samples

{1: {'_id': ObjectId('60544aa111fe31d283defc6f'),
  'abbreviations': [{'word': 'Some translation dictionaries have used the abbreviation s.o. or so for someone.'}],
  'word': 'someone'},
 2: {'_id': ObjectId('60544aa911fe31d283df0ee4'),
  'abbreviations': [{'word': '24th'}, {'word': '24^th'}],
  'word': 'twenty-fourth'},
 3: {'_id': ObjectId('60544aaa11fe31d283df0f0f'),
  'abbreviations': [{'word': '16th'},
   {'word': '16^th'},
   {'word': '(in names of monarchs and popes) XVI'}],
  'word': 'sixteenth'},
 4: {'_id': ObjectId('60544aaa11fe31d283df0f2c'),
  'abbreviations': [{'word': '18th'},
   {'word': '18^th'},
   {'word': 'XVIIIth'},
   {'word': '(in royal, papal and other names; for centuries) XVIII'}],
  'word': 'eighteenth'}}

In [38]:
## keys in abbr
abbrs = sum([o['abbreviations'] for o in data],[])
abbr_keys,abbr_samples,distribution = get_uniq_keys(abbrs)
abbr_keys,abbr_samples, distribution

(['word', 'sense'],
 {'word': ['Some translation dictionaries have used the abbreviation s.o. or so for someone.',
   '24th',
   '24^th'],
  'sense': ['computer software', 'interval', 'in music']},
 {'word': 199, 'sense': 3})

 ## categories
 
keep

wiktionary_categories string[]

In [52]:
data = query_words_by_key('categories')
len(data)

81619

In [81]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'categories': ['English nouns with irregular plurals'],
  'word': 'encyclopaedia'},
 {'_id': ObjectId('60544a9311fe31d283dedc1e'),
  'categories': ['Currencies', 'France', 'Units of measure'],
  'word': 'livre'},
 {'_id': ObjectId('60544a9311fe31d283dedc22'),
  'categories': ['English basic words',
   'Heraldic charges',
   'Textual division'],
  'word': 'book'}]

In [85]:
uniqued_categories = get_uniq_tags(data,'categories')
len(uniqued_categories)

4079

In [89]:
distr, samples =len_distribution(data,'categories')

In [90]:
distr

{1: 49617,
 3: 7587,
 4: 2871,
 2: 19667,
 9: 28,
 8: 91,
 5: 1116,
 6: 408,
 14: 1,
 7: 183,
 11: 16,
 23: 1,
 10: 20,
 12: 7,
 18: 1,
 13: 4,
 16: 1}

In [87]:
uniqued_categories[:10]

['Ichthyology',
 'Latvian pluralia tantum',
 'Tribes',
 'Honeyeaters',
 'Happiness',
 'Hungarian adjectives suffixed with -os',
 'German phrasebook',
 'Tokyo',
 'Esperanto compound words',
 'Measure theory']

## compounds

remove

In [91]:
data = query_words_by_key('compounds')
len(data)

2

In [92]:
data

[{'_id': ObjectId('60544aa111fe31d283defcb1'),
  'compounds': [{'word': 'live actors'},
   {'word': 'live action'},
   {'word': 'live album'},
   {'word': 'live broadcast'},
   {'word': 'live recording'}],
  'word': 'live'},
 {'_id': ObjectId('60544ac611fe31d283df5cd1'),
  'compounds': [{'word': 'balloon release'},
   {'word': 'canary release'},
   {'word': 'debt release'},
   {'word': 'firmware release'},
   {'word': 'hardware release'},
   {'word': 'release notes'},
   {'word': 'release process'},
   {'word': 'software release'},
   {'word': 'track-release'}],
  'word': 'release'}]

## coordinate_terms

remove

In [93]:
data = query_words_by_key('coordinate_terms')
len(data)

1646

In [95]:
data[3:5]

[{'_id': ObjectId('60544a9411fe31d283dedfbb'),
  'word': 'foreign debt',
  'coordinate_terms': [{'word': 'domestic debt'}]},
 {'_id': ObjectId('60544a9411fe31d283dee006'),
  'word': 'wolf',
  'coordinate_terms': [{'word': 'dingo', 'sense': 'large wild canid'},
   {'word': 'dog (members of Canis lupus not called wolf)',
    'sense': 'large wild canid'},
   {'word': 'coyote', 'sense': 'large wild canid'},
   {'word': 'jackal', 'sense': 'large wild canid'},
   {'word': 'fox (other canids)', 'sense': 'large wild canid'}]}]

## derived

keep

derived string[]

In [56]:
data = query_words_by_key('derived')
len(data)

21125

In [98]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'derived': [{'word': 'address book'},
   {'word': 'audiobook'},
   {'word': 'book account'},
   {'word': 'book agent'},
   {'word': 'book-answerer'},
   {'word': 'book award'},
   {'word': 'book-bearer'},
   {'word': 'bookbinder'},
   {'word': 'book-board'},
   {'word': 'book-bosomed'},
   {'word': 'book-bound'},
   {'word': 'book-boy'},
   {'word': 'book-burning'},
   {'word': 'bookcase'},
   {'word': 'book-cloth'},
   {'word': 'book club'},
   {'word': 'book canvasser'},
   {'word': 'book concern'},
   {'word': 'book-crab'},
   {'word': 'book-credit'},
   {'word': 'book-debt'},
   {'word': 'book-edge gilder'},
   {'word': 'book-edge marbler'},
   {'word': 'book end'},
   {'word': 'bookend'},
   {'word': 'bookery'},
   {'word': 'booketeria'},
   {'word': 'book-farmer'},
   {'word': 'book-folder'},
   {'word': 'book-form'},
   {'word': 'bookful'},
   {'word': 'book-ghoul'},
   {'word': 'book-gill'},
   {'word': 'book ha

In [57]:
all_derived = sum([o['derived'] for o in data],[])
a,b,c = get_uniq_keys(all_derived)
a,b,c

(['topics', 'translation', 'sense', 'word', 'tags'],
 {'word': ['address book', 'audiobook', 'book account'],
  'tags': [['Corvus coronoides'], ['Corvus ruficollis'], ['Corvus moriorum']],
  'sense': ['assent and engagement by person on whom bill of exchange is drawn',
   'assent and engagement by person on whom bill of exchange is drawn',
   'typography'],
  'translation': ['back slang',
   'to talk at length in a foolish or boring way',
   'that which skews something'],
  'topics': [['grammar', 'linguistics'],
   ['the',
    'country',
    'countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences'],
   ['region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences']]},
 {'word': 144289,
  'tags': 7024,
  'sense': 870,
  'translation': 133,
  'topics': 109})

## holonyms

keep

holonyms string[]

In [99]:
data = query_words_by_key('holonyms')
len(data)

74

In [101]:
data[10:13]

[{'_id': ObjectId('60544a9a11fe31d283deec58'),
  'word': 'room',
  'holonyms': [{'word': 'house'},
   {'word': 'building'},
   {'word': 'structure'},
   {'word': 'apartment'},
   {'word': 'home'},
   {'word': 'flat'},
   {'word': 'hotel'},
   {'word': 'hospital'}]},
 {'_id': ObjectId('60544a9b11fe31d283deee59'),
  'word': 'rice',
  'holonyms': [{'word': 'bhelpuri'},
   {'word': 'California roll'},
   {'word': 'dosa'},
   {'word': 'gumbo'},
   {'word': 'idli'},
   {'word': 'jambalaya'},
   {'word': 'khir'},
   {'word': 'mirin'},
   {'word': 'mochi'},
   {'word': 'nasi goreng'},
   {'word': 'onigiri'},
   {'word': 'pad thai'},
   {'word': 'paella'},
   {'word': 'pilaf'},
   {'word': 'pilau'},
   {'word': 'rangoli'},
   {'word': 'rijsttafel'},
   {'word': 'risotto'},
   {'word': 'sake'},
   {'word': 'samshu'},
   {'word': 'sushi'}]},
 {'_id': ObjectId('60544a9e11fe31d283def684'),
  'word': 'theory',
  'holonyms': [{'word': 'formal system', 'sense': 'in logic'}]}]

## hypernyms

keep

hypernyms string[]

In [102]:
data = query_words_by_key('hypernyms')
len(data)

935

In [103]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc29'),
  'word': 'day',
  'hypernyms': [{'word': 'month'},
   {'word': 'time'},
   {'word': 'week'},
   {'word': 'year'}]},
 {'_id': ObjectId('60544a9311fe31d283dedcd1'),
  'word': 'Sunday',
  'hypernyms': [{'word': 'day'}]},
 {'_id': ObjectId('60544a9411fe31d283dede70'),
  'word': 'abscissa',
  'hypernyms': [{'word': 'coordinate', 'sense': 'first of two coordinates'},
   {'word': 'axis', 'sense': 'horizontal line'}]}]

## hyphenation

keep

processing: unique

hyphenation

In [104]:
data = query_words_by_key('hyphenation')
len(data)

11462

In [105]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'word': 'encyclopaedia',
  'hyphenation': ['en‧cy‧clo‧pe‧dia']},
 {'_id': ObjectId('60544a9311fe31d283dedc1d'),
  'hyphenation': ['flocci‧nauci‧ni‧hili‧pili‧fi‧ca‧tion'],
  'word': 'floccinaucinihilipilification'},
 {'_id': ObjectId('60544a9311fe31d283dedc2d'),
  'word': 'denotation',
  'hyphenation': ['de‧no‧ta‧tion']}]

In [106]:
distribution, samples = len_distribution(data,'hyphenation')

In [108]:
distribution,samples

({1: 11294, 2: 124, 4: 15, 3: 29},
 {1: {'_id': ObjectId('60544a9311fe31d283dedc19'),
   'word': 'encyclopaedia',
   'hyphenation': ['en‧cy‧clo‧pe‧dia']},
  2: {'_id': ObjectId('60544a9311fe31d283deddb4'),
   'hyphenation': ['aba‧cus', 'aba‧ci'],
   'word': 'abacus'},
  4: {'_id': ObjectId('60544a9511fe31d283dee166'),
   'word': 'second',
   'hyphenation': ['sec‧ond', 'sec‧ond', 'sec‧ond;', 'sec‧ond']},
  3: {'_id': ObjectId('60544aa411fe31d283df01d2'),
   'word': 'postulate',
   'hyphenation': ['pos‧tu‧late;', 'pos‧tu‧late;', 'pos‧tu‧late']}})

## hyponyms

keep

strings in form to processs: 'See Thesaurus:book'

hyponyms

In [109]:
data = query_words_by_key('hyponyms')
len(data)

1600

In [111]:
data[10:13]

[{'_id': ObjectId('60544a9311fe31d283dedcd1'),
  'word': 'Sunday',
  'hyponyms': [{'word': "Hall' Sunday"},
   {'word': 'Albless Sunday'},
   {'word': 'Alb Sunday'},
   {'word': 'Antipascha Sunday'},
   {'word': 'Ascension Sunday'},
   {'word': 'Black Sunday'},
   {'word': 'Bloody Sunday'},
   {'word': 'Branch Sunday'},
   {'word': 'cannonball Sunday'},
   {'word': 'Cantate Sunday'},
   {'word': 'Care Sunday'},
   {'word': 'Carling Sunday'},
   {'word': 'Chestnut Sunday'},
   {'word': 'Christmas Sunday'},
   {'word': 'Cold Sunday'},
   {'word': 'Communion Sunday'},
   {'word': 'Divine Mercy Sunday'},
   {'word': 'Easter Sunday'},
   {'word': 'Expectation Sunday'},
   {'word': 'Fast Sunday'},
   {'word': 'Fig Sunday'},
   {'word': 'Garland Sunday'},
   {'word': 'Gaudete Sunday'},
   {'word': "God's Sunday"},
   {'word': 'Good Shepherd Sunday'},
   {'word': 'Greasy Sunday'},
   {'word': 'Hospital Sunday'},
   {'word': 'Jubilate Sunday'},
   {'word': 'Judica Sunday'},
   {'word': 'Justice

## inflection

discard

In [112]:
data = query_words_by_key('inflection')
len(data)

172

In [113]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedcd0'),
  'inflection': [{'template_name': 'en-conj-simple'}],
  'word': 'verb'},
 {'_id': ObjectId('60544a9311fe31d283deddc1'),
  'inflection': [{'template_name': 'en-conj-simple'}],
  'word': 'abhor'},
 {'_id': ObjectId('60544a9411fe31d283dede48'),
  'inflection': [{'1': 'es', 'template_name': 'en-conj-simple'}],
  'word': 'abolish'}]

## inflection_of

discard

In [114]:
data = query_words_by_key('inflection_of')
len(data)

9

In [115]:
data[:3]

[{'_id': ObjectId('60544a9b11fe31d283deefea'),
  'inflection_of': ['they'],
  'word': 'them'},
 {'_id': ObjectId('60544a9e11fe31d283def65b'),
  'inflection_of': ['the indefinite personal pronoun one'],
  'word': 'oneself'},
 {'_id': ObjectId('60544a9e11fe31d283def66c'),
  'inflection_of': ['he'],
  'word': 'himself'}]

## meronyms

discard

In [116]:
data = query_words_by_key('meronyms')
len(data)

92

In [117]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc7c'),
  'word': 'dialect',
  'meronyms': [{'word': 'idiolect'}]},
 {'_id': ObjectId('60544a9411fe31d283dee0d4'),
  'word': 'century',
  'meronyms': [{'word': 'cohort', 'sense': 'major unit of the Roman army'},
   {'word': 'maniple', 'sense': 'major unit of the Roman army'},
   {'word': 'legion', 'sense': 'major unit of the Roman army'}]},
 {'_id': ObjectId('60544a9511fe31d283dee1f4'),
  'word': 'flute',
  'meronyms': [{'word': 'fipple', 'sense': 'music'},
   {'word': 'labium', 'sense': 'music'}]}]

## proverbs
discard

In [118]:
data = query_words_by_key('proverbs')
data

[{'_id': ObjectId('60544a9911fe31d283dee98b'),
  'proverbs': [{'word': "money doesn't grow on trees"},
   {'word': 'see the forest for the trees'}],
  'word': 'tree'}]

## related
multi words

keep

action or environment related with word

related string[]

In [119]:
data = query_words_by_key('related')
len(data)

24297

In [120]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'related': [{'word': 'incunable'},
   {'word': 'scroll'},
   {'word': 'tome'},
   {'word': 'volume'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc25'),
  'word': 'portmanteau',
  'related': [{'word': 'List of portmanteau words defined in Wiktionary'},
   {'word': 'Wikipedia article on portmanteaus',
    'tags': ['cases and words']}]},
 {'_id': ObjectId('60544a9311fe31d283dedc29'),
  'word': 'day',
  'related': [{'word': 'day of the week',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': 'Sunday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': '\u200e Monday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': '\u200e Tuesday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': '\u200e Wednesday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'p

## sounds

keep

need to clean data

In [121]:
data = query_words_by_key('sounds')
len(data)

114966

In [122]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'word': 'encyclopaedia',
  'sounds': [{'ipa': '/ənˌsəɪ.kləˈpi.di.ə/', 'tags': ['Canada']},
   {'ipa': '/ɪnˌsaɪ.kləˈpi(ː).di.ə/', 'tags': ['UK', 'US']},
   {'rhymes': '-iːdiə'},
   {'audio': 'en-us-encyclopedia.ogg', 'tags': ['US'], 'text': 'Audio (US)'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc1a'),
  'word': 'pies',
  'sounds': [{'ipa': '/paɪz/'},
   {'rhymes': '-aɪz'},
   {'audio': 'LL-Q1860 (eng)-Vealhurl-pies.wav',
    'tags': ['UK'],
    'text': 'Audio (UK)'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc1b'),
  'word': 'pies',
  'sounds': [{'ipa': '/paɪz/'},
   {'rhymes': '-aɪz'},
   {'audio': 'LL-Q1860 (eng)-Vealhurl-pies.wav',
    'tags': ['UK'],
    'text': 'Audio (UK)'}]}]

In [123]:
distr,samples = len_distribution(data,'sounds')

In [125]:
distr, samples

({4: 10303,
  3: 16438,
  2: 23686,
  9: 721,
  7: 1750,
  1: 50623,
  6: 3245,
  10: 487,
  5: 5488,
  8: 1097,
  11: 326,
  14: 112,
  12: 211,
  19: 18,
  13: 154,
  17: 49,
  16: 50,
  27: 3,
  22: 21,
  20: 16,
  15: 58,
  23: 4,
  21: 28,
  18: 43,
  26: 3,
  36: 2,
  48: 2,
  24: 7,
  33: 4,
  40: 2,
  43: 13,
  44: 2},
 {4: {'_id': ObjectId('60544a9311fe31d283dedc19'),
   'word': 'encyclopaedia',
   'sounds': [{'ipa': '/ənˌsəɪ.kləˈpi.di.ə/', 'tags': ['Canada']},
    {'ipa': '/ɪnˌsaɪ.kləˈpi(ː).di.ə/', 'tags': ['UK', 'US']},
    {'rhymes': '-iːdiə'},
    {'audio': 'en-us-encyclopedia.ogg',
     'tags': ['US'],
     'text': 'Audio (US)'}]},
  3: {'_id': ObjectId('60544a9311fe31d283dedc1a'),
   'word': 'pies',
   'sounds': [{'ipa': '/paɪz/'},
    {'rhymes': '-aɪz'},
    {'audio': 'LL-Q1860 (eng)-Vealhurl-pies.wav',
     'tags': ['UK'],
     'text': 'Audio (UK)'}]},
  2: {'_id': ObjectId('60544a9311fe31d283dedc1e'),
   'word': 'livre',
   'sounds': [{'ipa': '/ˈliːvɹə/'}, {'ipa': '/ˈ

## synonyms
keep

data cleaning needed

In [126]:
data = query_words_by_key('synonyms')
len(data)

14563

In [127]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'synonyms': [{'word': 'See Thesaurus:book'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc2f'),
  'word': 'thesaurus',
  'synonyms': [{'word': 'synonymicon'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc34'),
  'word': 'connotation',
  'synonyms': [{'word': 'intension'}]}]

In [131]:
dist,samples = len_distribution(data,'synonyms')
dist, samples

({1: 6179,
  12: 91,
  2: 2775,
  4: 1129,
  3: 1732,
  14: 52,
  6: 516,
  5: 693,
  18: 27,
  24: 8,
  10: 144,
  33: 1,
  8: 296,
  15: 34,
  25: 8,
  22: 10,
  9: 184,
  23: 10,
  11: 114,
  31: 1,
  16: 36,
  13: 59,
  36: 1,
  34: 3,
  7: 385,
  19: 13,
  32: 2,
  20: 8,
  21: 10,
  35: 1,
  26: 9,
  52: 3,
  46: 2,
  17: 18,
  30: 3,
  29: 2,
  41: 1,
  27: 2,
  38: 1},
 {1: {'_id': ObjectId('60544a9311fe31d283dedc20'),
   'word': 'book',
   'synonyms': [{'word': 'See Thesaurus:book'}]},
  12: {'_id': ObjectId('60544a9311fe31d283dedc41'),
   'word': 'cat',
   'synonyms': [{'word': 'feliform (carnivoran)',
     'sense': 'any member of the suborder (sometimes superfamily) Feliformia or Feloidea'},
    {'word': 'feloid (compare Caniformia, Canoidea)',
     'sense': 'any member of the suborder (sometimes superfamily) Feliformia or Feloidea'},
    {'word': 'feline cat',
     'sense': 'any member of the subfamily Felinae, genera Puma, Acinonyx, Lynx, Leopardus, and Felis)'},
    {'wor

## translations
keep

In [132]:
data = query_words_by_key('translations')
len(data)

32146

In [133]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc23'),
  'word': 'portmanteau',
  'translations': [{'lang': 'Albanian',
    'code': 'sq',
    'sense': 'case',
    'word': 'baule',
    'tags': ['feminine']},
   {'lang': 'Arabic', 'code': 'ar', 'sense': 'case', 'word': 'حقيبَة سفر'},
   {'lang': 'Breton',
    'code': 'br',
    'sense': 'case',
    'word': 'mal',
    'tags': ['feminine']},
   {'lang': 'Breton',
    'code': 'br',
    'sense': 'case',
    'word': 'malizenn',
    'tags': ['feminine']},
   {'lang': 'Bulgarian',
    'code': 'bg',
    'sense': 'case',
    'word': 'голям кожен куфар',
    'roman': 'goljam kožen kufar'},
   {'lang': 'Czech', 'code': 'cs', 'sense': 'case', 'word': 'kontaminace'},
   {'lang': 'Dutch',
    'code': 'nl',
    'sense': 'case',
    'word': 'koffer',
    'tags': ['feminine']},
   {'lang': 'Dutch',
    'code': 'nl',
    'sense': 'case',
    'word': 'valies',
    'tags': ['feminine']},
   {'lang': 'Finnish', 'code': 'fi', 'sense': 'case', 'word': 'matkalaukku'},
 

## troponyms

discard

In [134]:
data = query_words_by_key('troponyms')
len(data)

10

In [135]:
data[:3]

[{'_id': ObjectId('60544a9c11fe31d283def15b'),
  'word': 'sale',
  'troponyms': [{'word': 'cut-rate sale',
    'sense': 'selling of goods at reduced prices'},
   {'word': 'sales event', 'sense': 'selling of goods at reduced prices'},
   {'word': 'auction',
    'sense': 'act of putting up for auction to the highest bidder'},
   {'word': 'public sale',
    'sense': 'act of putting up for auction to the highest bidder'}]},
 {'_id': ObjectId('60544a9d11fe31d283def4a3'),
  'word': 'cut',
  'troponyms': [{'word': 'chop'},
   {'word': 'hack'},
   {'word': 'slice'},
   {'word': 'trim'}]},
 {'_id': ObjectId('60544a9e11fe31d283def7a7'),
  'word': 'sleep',
  'troponyms': [{'word': 'nap',
    'sense': 'rest in a state of reduced consciousness'},
   {'word': 'doze', 'sense': 'rest in a state of reduced consciousness'},
   {'word': 'snooze', 'sense': 'rest in a state of reduced consciousness'}]}]

## senses

In [7]:
data = query_words_by_key('senses')

In [8]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc17'),
  'word': 'GNU FDL',
  'senses': [{'glosses': ['Initialism of GNU Free Documentation License.'],
    'tags': ['abbreviation', 'alt-of', 'initialism'],
    'alt_of': ['GNU Free Documentation License'],
    'id': 'GNU_FDL-name'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc18'),
  'word': 'Pope Julius',
  'senses': [{'categories': ['Card games'],
    'tags': ['obsolete'],
    'glosses': ['A sixteenth-century gambling card game about which little is known.'],
    'id': 'Pope_Julius-name'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc19'),
  'word': 'encyclopaedia',
  'senses': [{'categories': ['Australian English', 'British English'],
    'tags': ['Australia', 'Britain', 'alt-of', 'alternative', 'chiefly'],
    'glosses': ['Alternative spelling of encyclopedia'],
    'alt_of': ['encyclopedia'],
    'id': 'encyclopaedia-noun'}]}]

In [154]:
dist,samples = len_distribution(data, 'senses')

In [155]:
dist

{1: 926841,
 2: 59183,
 3: 16958,
 4: 7115,
 5: 3726,
 6: 2145,
 7: 1424,
 8: 904,
 9: 639,
 10: 424,
 11: 333,
 12: 262,
 13: 202,
 14: 149,
 15: 134,
 16: 87,
 17: 100,
 18: 74,
 19: 58,
 20: 48,
 21: 46,
 22: 32,
 23: 34,
 24: 29,
 25: 18,
 26: 15,
 27: 20,
 28: 7,
 29: 12,
 30: 7,
 31: 9,
 32: 10,
 33: 11,
 34: 8,
 35: 6,
 36: 7,
 37: 5,
 38: 4,
 39: 4,
 40: 3,
 41: 1,
 42: 1,
 43: 3,
 44: 3,
 45: 1,
 46: 2,
 48: 1,
 49: 1,
 52: 4,
 54: 1,
 57: 1,
 58: 3,
 61: 1,
 63: 1,
 66: 1,
 75: 1,
 78: 1,
 80: 1,
 87: 1,
 101: 1,
 119: 1}

In [140]:
samples

{1: {'_id': ObjectId('60544a9311fe31d283dedc17'),
  'word': 'GNU FDL',
  'senses': [{'glosses': ['Initialism of GNU Free Documentation License.'],
    'tags': ['abbreviation', 'alt-of', 'initialism'],
    'alt_of': ['GNU Free Documentation License'],
    'id': 'GNU_FDL-name'}]},
 2: {'_id': ObjectId('60544a9311fe31d283dedc1b'),
  'word': 'pies',
  'senses': [{'glosses': ['Third-person singular simple present indicative form of pie'],
    'tags': ['form-of',
     'indicative',
     'present',
     'simple',
     'singular',
     'third-person'],
    'form_of': ['pie'],
    'id': 'pies-verb-2jEXbWw3'},
   {'glosses': ['Third-person singular simple present indicative form of pi'],
    'tags': ['form-of',
     'indicative',
     'present',
     'simple',
     'singular',
     'third-person'],
    'form_of': ['pi'],
    'id': 'pies-verb-eNU3Pi6K'}]},
 14: {'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'senses': [{'glosses': ['A collection of sheets of paper bound together

In [156]:
samples[9]

{'_id': ObjectId('60544a9311fe31d283dedc8a'),
 'word': 'brown',
 'senses': [{'tags': ['countable', 'uncountable'],
   'glosses': ['A colour like that of chocolate or coffee.'],
   'translations': [{'lang': 'Afrikaans',
     'code': 'af',
     'sense': 'colour',
     'word': 'bruin'},
    {'lang': 'Akan', 'code': 'ak', 'sense': 'colour', 'word': 'ahaban dada'},
    {'lang': 'Akan',
     'code': 'ak',
     'sense': 'colour',
     'word': 'ntokowa ntokowa'},
    {'lang': 'Albanian', 'code': 'sq', 'sense': 'colour', 'word': 'bojëkafe'},
    {'lang': 'American Sign Language',
     'code': 'ase',
     'sense': 'colour',
     'word': 'B@Cheek-PalmForward B@Jaw-PalmForward'},
    {'lang': 'Arabic',
     'code': 'ar',
     'sense': 'colour',
     'word': 'بُنِّيّ',
     'roman': 'bunniyy'},
    {'lang': 'Arabic',
     'code': 'ar',
     'sense': 'colour',
     'word': 'أَسْمَر',
     'tags': ['masculine'],
     'roman': 'hair'},
    {'lang': 'Armenian',
     'code': 'hy',
     'sense': 'colour'

In [145]:
dist.keys()

dict_keys([1, 2, 14, 8, 3, 6, 4, 5, 19, 7, 9, 15, 27, 11, 23, 12, 29, 10, 13, 37, 20, 21, 16, 58, 22, 17, 52, 63, 24, 18, 80, 25, 26, 32, 38, 35, 28, 33, 41, 34, 44, 39, 31, 61, 43, 54, 40, 30, 36, 57, 46, 42, 78, 45, 49, 87, 75, 48, 119, 101, 66])

In [9]:
all_senses=[]

for item in data:
    all_senses.extend(item['senses'])

len(all_senses)

1212639

In [24]:
senses_keys = []
senses_key_examples = {}

for sense in all_senses:
    keys = list(sense.keys())
    for key in keys:     
        if key not in senses_keys:
            senses_keys.append(key)
    
        if senses_key_examples.get(key) is None:
            senses_key_examples[key] = [sense[key]]
        elif len(senses_key_examples[key]) <= 3:
            senses_key_examples[key].append(sense[key])

In [25]:
len(senses_keys), senses_keys

(21,
 ['glosses',
  'tags',
  'alt_of',
  'id',
  'categories',
  'form_of',
  'translations',
  'related',
  'wikipedia',
  'synonyms',
  'topics',
  'derived',
  'senseid',
  'hyponyms',
  'hypernyms',
  'coordinate_terms',
  'wikidata',
  'holonyms',
  'meronyms',
  'troponyms',
  'compound_of'])

In [None]:
cns = [{'lang': 'Chinese',
    'code': 'yue',
    'tags': ['Cantonese'],
    'sense': 'news items',
    'word': '時事'},
   {'lang': 'Chinese',
    'code': 'yue',
    'tags': ['Cantonese'],
    'sense': 'news items',
    'word': '时事',
    'roman': 'si^4 si^6'},
   {'lang': 'Chinese',
    'code': 'cmn',
    'tags': ['Mandarin'],
    'sense': 'news items',
    'word': '時事'},
   {'lang': 'Chinese',
    'code': 'cmn',
    'tags': ['Mandarin'],
    'sense': 'news items',
    'word': '时事',
    'roman': 'shíshì'}]

In [26]:
senses_key_examples

{'glosses': [['Initialism of GNU Free Documentation License.'],
  ['A sixteenth-century gambling card game about which little is known.'],
  ['Alternative spelling of encyclopedia'],
  ['plural of pie']],
 'tags': [['abbreviation', 'alt-of', 'initialism'],
  ['obsolete'],
  ['Australia', 'Britain', 'alt-of', 'alternative', 'chiefly'],
  ['form-of', 'plural']],
 'alt_of': [['GNU Free Documentation License'],
  ['encyclopedia'],
  ['gross domestic product'],
  ['guanosine diphosphate, a nucleotide']],
 'id': ['GNU_FDL-name', 'Pope_Julius-name', 'encyclopaedia-noun', 'pies-noun'],
 'categories': [['Card games'],
  ['Australian English', 'British English'],
  ['English pluralia tantum'],
  ['Long English words']],
 'form_of': [['pie'], ['pie'], ['pi'], ['bake']],
 'translations': [[{'lang': 'Chinese',
    'code': 'yue',
    'tags': ['Cantonese'],
    'sense': 'news items',
    'word': '時事'},
   {'lang': 'Chinese',
    'code': 'yue',
    'tags': ['Cantonese'],
    'sense': 'news items',
   

- 'glosses': `string[]`
- 'tags': `string[]`
- 'alt_of': `string[]`
- 'id': `string`
- 'categories': `string[]`
- 'form_of': `string[]`
- 'translations': `{lang:string, code: string, tags:string[], sense:string, word: string, roman: string}[]`
- 'related': `{word: string, tags?:string[]}[]`
- 'wikipedia': `string[]`
- 'synonyms': `{word: string}[]`
- 'topics': `string[]`
- 'derived': `{word: string}[]`
- 'senseid': `string[]`
- 'hyponyms': `{word: string, sense: string}[]`
- 'hypernyms': `{word: string}[]`
- 'coordinate_terms': `{word: string, sense?:string}[]`
- 'wikidata': `string[]`
- 'holonyms': `{word: string}[]`
- 'meronyms': `{word: string, topics: string[], sense: string, tags: string[]}[]`
- 'troponyms': `{word: string, sense: string}[]`
- 'compound_of': `string[]`

## check repeated words

In [15]:
data = query_words_by_key('word')
len(data)

1021124

In [16]:
all_words = [i['word'] for i in data]

In [31]:
all_words[:10]

['GNU FDL',
 'Pope Julius',
 'encyclopaedia',
 'pies',
 'pies',
 'current events',
 'floccinaucinihilipilification',
 'livre',
 'multiculturalism',
 'book']

In [43]:
len(set(all_words))

952502

In [30]:
1021124-952502

68622

In [30]:
data[0]

{'_id': ObjectId('60544a9311fe31d283dedc17'), 'word': 'GNU FDL'}

In [17]:
def get_repeated_words(data, words):
    repeated = []
    idxes_to_skip = set()
    for idx, item in enumerate(data):
        temp_repeated = []
        if idx in idxes_to_skip:
            continue
        for idx2, word in enumerate(words[idx+1:]):
            if item['word'] == word:
                idxes_to_skip.add(idx2)
                temp_repeated.append(item['_id'])
        if len(temp_repeated) > 0 :
            repeated.append(
                (item, temp_repeated)
            )
    return repeated

In [18]:
from multiprocessing import Pool, cpu_count
import math

In [19]:
def chunk_data(data, bs=10):
    sz =math.floor(len(data)/bs)
    res = []
    for i in range(bs):
        block = data[sz*i : sz*(i+1)] if i != bs-1 else data[sz*i :] 
        res.append(block)
    
    return res

In [26]:
res = []
num = 40
with Pool(num) as p:
    for i in range(num):
        chunked_words = chunk_data(all_words, num)
        chunked_data = chunk_data(data, num)
        r = p.apply_async(get_repeated_words, args=(chunked_data[i], chunked_words[i],))
        res.append(r.get())

In [29]:
res = sum(res,[])
len(res)

68617

## handler

In [54]:
class NoneValueException(Exception):
    pass

def _check_none_value(item,key_name):
    data = item.get(key_name)
    if data is None:
        raise NoneValueException
    
    return data

def _uniq_list(expre):
    return list(set(expre))

def abbreviations_handler(item):
    try:
        abbr = _check_none_value(item, 'abbreviations')
        return {
        'abbrs': _uniq_list(o['word'] for o in abbr),
    }
    except NoneValueException:
        return {}


def antonyms_handler(item):
    data = _check_none_value(item, 'antonyms')
    return {
        'antonyms': _uniq_list(o['word'] for o in data),
        'antonyms_data': data
    }  
    
def categories_handler(item):
    data = _check_none_value(item, 'categories')
    return {
        'categories': _uniq_list(data)
    }

def compounds_handler(item):
    return {}

def coordinate_terms_handler(item):
    return {}

def derived_handler(item):
    data = _check_none_value(item, 'derived')
    return {
        'derived': _uniq_list(o['word'] for o in data),
        'derived_data': data
    }


handler = {
 'abbreviations',
 'antonyms',
 'categories',
 'compounds',
 'coordinate_terms',
 'derived',
 'forms',
 'heads',
 'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
 'inflection',
 'inflection_of',
 'lang',
 'lang_code',
 'meronyms',
 'pos',
 'proverbs',
 'related',
 'senses',
 'sounds',
 'synonyms',
 'translations',
 'troponyms',
 'wikipedia',
 'word'
}