In [1]:
import pymongo
from pathlib import Path
import os
from dotenv import load_dotenv

In [45]:
from fastprogress import progress_bar
import json

In [3]:
from utils.langconv import Converter

In [4]:
Converter('zh-hans').convert('我是誰')

'我是谁'

In [5]:
pth = Path('.').joinpath('./mongo.env')
load_dotenv(pth)

True

## UTILS

In [6]:
all_keys = ['abbreviations',
 'antonyms',
 'categories',
 'compounds',
 'coordinate_terms',
 'derived',
 'forms',
 'heads',
 'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
 'inflection',
 'inflection_of',
 'lang',
 'lang_code',
 'meronyms',
 'pos',
 'proverbs',
 'related',
 'senses',
 'sounds',
 'synonyms',
 'translations',
 'troponyms',
 'wikipedia',
 'word']

def keep_desired_keys(*keys):
    return {
     k:0   for k in all_keys if k not in keys
    }

In [7]:
def query_words_by_key(key_name):
    return list(dict_col.find({
    key_name:{
        '$exists':True
    }
},{ **keep_desired_keys(key_name,'word') }))


def len_distribution(data, key_name):
    distr = {}
    samples = {}
    for h in data:
        len_h = len(h[key_name])
        if not distr.get(len_h):
            distr[len_h] = 1
            samples[len_h] = h
        else:
            distr[len_h] +=1
    d2 = {}       
    for k in sorted(distr.keys()):
        d2[k] = distr[k]
    return d2, samples

def get_uniq_tags(data, key_name):
    res = set()
    for item in data:
        res.update(item[key_name])
    return list(res)

def get_uniq_keys(arr):
    keys = set()
    samples = {}
    distr = {}
    for o in progress_bar(arr):
        _keys = list(o.keys())
        keys.update(_keys)
        for key in _keys:
            val = samples.get(key)
            if val is None:
                samples[key] = [o[key]]
            elif len(val) < 3:
                samples[key].append(o[key])
                
            if distr.get(key) is None:
                distr[key] = 1
            else:
                distr[key] += 1
    
    return list(keys), samples,distr

In [8]:
mongo_username, mongo_password = os.getenv('MONGO_INITDB_ROOT_USERNAME'), os.getenv('MONGO_INITDB_ROOT_PASSWORD')
conn = pymongo.MongoClient(
    host='localhost',
    port=27017,
    username=mongo_username,
    password=mongo_password
)

In [9]:
db_name = 'chartreuse'
dict_db = conn[db_name]
col_name = 'wiktionary'
dict_col = dict_db[col_name]

m_dict_col = dict_db['dictionary_en']
m_sense_col = dict_db['sense_en']

In [None]:
dict_pth = Path('.')/'wiktionary'/'dictionary-English.json'
dict_sense_pth = Path('.')/'wiktionary'/'English-all-senses.json'
dict_pth,dict_sense_pth

In [None]:
import json

In [None]:
def all_keys(pth):
    keys = set()
    with open(pth) as f:
        for line in f:
            json1 = json.loads(line)
            keys.update(json1.keys())
    return keys

keys = all_keys(dict_sense_pth)
keys

In [None]:
def count_key_lens(pth):
    max_dic = {}
    min_dic = {}
    key_lens_dic = {}
    with open(pth) as f:
        for line in f:
            json1 = json.loads(line)
            key_len = len(json1.keys())
#             if max_dic is None:
#                 max_dic = json1
#             if min_dic is None:
#                 min_dic = json1

            max_len = len(max_dic.keys())
            min_len = len(min_dic.keys())

            if key_len > max_len:
                max_dic = json1
            if min_len == 0 or key_len < min_len:
                min_dic = json1

            if key_lens_dic.get(key_len, None) is None:
                key_lens_dic[key_len] = 1
            else:
                key_lens_dic[key_len] +=1
    
    return max_dic, min_dic, key_lens_dic

In [None]:
max_dic, min_dic, key_lens = count_key_lens(dict_pth)

In [None]:
s_max_dic, s_min_dic, s_key_lens = count_key_lens(dict_sense_pth)

In [None]:
len(max_dic.keys()),max_dic.keys()

In [None]:
max_dic

In [None]:
len(min_dic.keys()), min_dic.keys(), min_dic

In [None]:
len(s_max_dic.keys()), s_max_dic.keys()

In [None]:
s_max_dic

In [None]:
len(s_min_dic.keys()), s_min_dic.keys(), s_min_dic

In [None]:
key_lens, s_key_lens

In [None]:
i = 0
for v in key_lens.values():
    i+= v
i

In [None]:
from tqdm import tqdm
import json
from datetime import datetime
with open(dict_sense_pth) as f:
    for line in tqdm(f):
        loaded = json.loads(line)
        dict_col.insert_one({
            **loaded,
        })

 ## HEADS
 
 remove

In [None]:
## words with heads
lis = list(dict_col.find({
    'heads':{
        '$exists':True
    }
},{ **keep_desired_keys('heads','word') }))
len(lis)

In [None]:
lis[:10]

In [None]:
heads_greater_than_1 = [
    h
    for h in lis
    if len(h.get('heads',[])) > 1
]

In [None]:
heads_len_distribution = {}

for h in lis:
    len_h = len(h['heads'])
    if not heads_len_distribution.get(len_h):
        heads_len_distribution[len_h] = 1
    else:
        heads_len_distribution[len_h] +=1

heads_len_distribution

In [None]:
len(heads_greater_than_1)

In [None]:
heads_greater_than_1[5:10]

## keys missing in words

In [None]:
keys = ['abbreviations',
 'antonyms',
 'categories',
 'compounds',
 'coordinate_terms',
 'derived',
 'forms',
 'heads',
 'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
 'inflection',
 'inflection_of',
 'lang',
 'lang_code',
 'meronyms',
 'pos',
 'proverbs',
 'related',
 'senses',
 'sounds',
 'synonyms',
 'translations',
 'troponyms',
 'wikipedia',
 'word']

In [None]:
total_docs = 1021124
res = {}
for key in keys:
    cnt = dict_col.count_documents({
        key:{
            '$exists':True
        }
    })
    res[key] = {
         cnt: cnt,
        'missing': total_docs - cnt
    }
    if cnt < total_docs:
        print(key, cnt)

In [None]:
res

In [None]:
basic_keys = ['pos', 'word', 'lang', 'lang_code', 'senses']
alternative_keys = [
 'forms',
 'heads',
 'antonyms',
 'wikipedia'
 'abbreviations',
 'categories',
 'compounds',
 'coordinate_terms',
 'derived',
 'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
 'inflection',
 'inflection_of',
 'meronyms',
 'proverbs',
 'related',
 'sounds',
 'synonyms',
 'translations',
 'troponyms',]

## forms

keep

forms  string[]

form_data  object[]

In [68]:
data = query_words_by_key('forms')
len(data)

387771

In [69]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'forms': [{'form': 'encyclopaedias', 'tags': ['plural']},
   {'form': 'encyclopaediae', 'tags': ['plural']}],
  'word': 'encyclopaedia'},
 {'_id': ObjectId('60544a9311fe31d283dedc1c'),
  'forms': [{'form': 'current event', 'tags': ['singular']}],
  'word': 'current events'},
 {'_id': ObjectId('60544a9311fe31d283dedc1e'),
  'forms': [{'form': 'livres', 'tags': ['plural']}],
  'word': 'livre'}]

In [70]:
all_forms = []
for item in data:
    all_forms.extend(item['forms'])
len(all_forms)

589486

In [71]:
a,b,c = get_uniq_keys(all_forms)

In [72]:
a,b,c

(['form', 'tags'],
 {'form': ['encyclopaedias', 'encyclopaediae', 'current event'],
  'tags': [['plural'], ['plural'], ['singular']]},
 {'form': 589486, 'tags': 589486})

In [None]:
data[:5]

In [None]:
distribution,samples = len_distribution('forms')
distribution

In [None]:
samples

In [None]:
form_tags = set()

for item in lis:
    for f in item['forms']:
        form_tags.update(f['tags'])
form_tags

In [None]:
def get_tag_examples(data,tags):
    res = {}
    for item in data:
        forms = item['forms']
        for f in forms:
            _tags = f['tags']
            t = _tags[0]
            if res.get(t) is None:
                res[t] = [item]
            elif len(res[t]) < 3:
                res[t].append(item)
    return res

In [None]:
tag_examples = get_tag_examples(lis, form_tags)

In [None]:
tag_examples

## antonyms

keep

antonyms string[]

antonyms_missing string[]

In [None]:
from fastprogress import progress_bar

In [None]:
data = query_words_by_key('antonyms')
len(data)

In [None]:
all_ans = sum([o['antonyms'] for o in data], [])
an_keys, samples, distr = get_uniq_keys(all_ans)
an_keys, samples, distr

In [None]:
def check_if_antonyms_exists(data, extract_words):
    re = []
    
    existing_words = list(dict_col.find({}, keep_desired_keys('word')))
    print(len(existing_words))
    existing_words = set(
        [w['word'] for w in existing_words]
    )
    for item in progress_bar(data):
        words = extract_words(item)
        for word in words:
            if word not in existing_words:
                print('{},{}'.format(
                    item['word'],
                    word
                ))
                re.append(
                    (item['word'], word)
                )
    return re

In [None]:
data[:3]

In [None]:
def get_antonyms(item):
    return [i['word'] for i in item['antonyms']]

In [None]:
res = check_if_antonyms_exists(data, get_antonyms)

In [None]:
len(res) # 1112 antonyms missing

## wikipedia

remove

In [None]:
data = query_words_by_key('wikipedia')
len(data)

In [None]:
data[:4]

In [None]:
all_tags = set()
for item in data:
    all_tags.update(item['wikipedia'])
    
len(all_tags)

In [None]:
all_tags = list(all_tags)
all_tags[:10]

In [None]:
distribution,samples = len_distribution(data,'wikipedia')

In [None]:
distribution

In [None]:
samples

## abbreviations

keep

abbreviations   string[]

In [None]:
data = query_words_by_key('abbreviations')
len(data)

In [None]:
data[:3]

In [None]:
distribution, samples = len_distribution(data, 'abbreviations')

In [None]:
distribution

In [None]:
samples

In [None]:
## keys in abbr
abbrs = sum([o['abbreviations'] for o in data],[])
abbr_keys,abbr_samples,distribution = get_uniq_keys(abbrs)
abbr_keys,abbr_samples, distribution

 ## categories
 
keep

wiktionary_categories string[]

In [None]:
data = query_words_by_key('categories')
len(data)

In [None]:
data[:3]

In [None]:
uniqued_categories = get_uniq_tags(data,'categories')
len(uniqued_categories)

In [None]:
distr, samples =len_distribution(data,'categories')

In [None]:
distr

In [None]:
uniqued_categories[:10]

## compounds

remove

In [None]:
data = query_words_by_key('compounds')
len(data)

In [None]:
data

## coordinate_terms

remove

In [None]:
data = query_words_by_key('coordinate_terms')
len(data)

In [None]:
data[3:5]

## derived

keep

derived string[]

In [42]:
data = query_words_by_key('derived')
len(data)

21125

In [43]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'derived': [{'word': 'address book'},
   {'word': 'audiobook'},
   {'word': 'book account'},
   {'word': 'book agent'},
   {'word': 'book-answerer'},
   {'word': 'book award'},
   {'word': 'book-bearer'},
   {'word': 'bookbinder'},
   {'word': 'book-board'},
   {'word': 'book-bosomed'},
   {'word': 'book-bound'},
   {'word': 'book-boy'},
   {'word': 'book-burning'},
   {'word': 'bookcase'},
   {'word': 'book-cloth'},
   {'word': 'book club'},
   {'word': 'book canvasser'},
   {'word': 'book concern'},
   {'word': 'book-crab'},
   {'word': 'book-credit'},
   {'word': 'book-debt'},
   {'word': 'book-edge gilder'},
   {'word': 'book-edge marbler'},
   {'word': 'book end'},
   {'word': 'bookend'},
   {'word': 'bookery'},
   {'word': 'booketeria'},
   {'word': 'book-farmer'},
   {'word': 'book-folder'},
   {'word': 'book-form'},
   {'word': 'bookful'},
   {'word': 'book-ghoul'},
   {'word': 'book-gill'},
   {'word': 'book ha

In [44]:
all_derived = sum([o['derived'] for o in data],[])
a,b,c = get_uniq_keys(all_derived)
a,b,c

(['tags', 'topics', 'sense', 'word', 'translation'],
 {'word': ['address book', 'audiobook', 'book account'],
  'tags': [['Corvus coronoides'], ['Corvus ruficollis'], ['Corvus moriorum']],
  'sense': ['assent and engagement by person on whom bill of exchange is drawn',
   'assent and engagement by person on whom bill of exchange is drawn',
   'typography'],
  'translation': ['back slang',
   'to talk at length in a foolish or boring way',
   'that which skews something'],
  'topics': [['grammar', 'linguistics'],
   ['the',
    'country',
    'countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences'],
   ['region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences']]},
 {'word': 144289,
  'tags': 7024,
  'sense': 870,
  'translation': 133,
  'topics': 109})

## holonyms

discard

holonyms string[]

In [35]:
data = query_words_by_key('holonyms')
len(data)

74

In [36]:
[o['word'] for o in data]

['zero',
 'South Africa',
 'New Zealand',
 'Botswana',
 'Mozambique',
 'New Caledonia',
 'Micronesia',
 'ontology',
 'antecedent',
 'chocolate',
 'room',
 'rice',
 'theory',
 'bell',
 'act',
 'acorn',
 'goat',
 'badger',
 'mountain',
 'Hawaii',
 'link',
 'bishop',
 'polynomial',
 'Ethiopian',
 'New Guinea',
 'lion',
 'pons',
 'trimester',
 'Holy Spirit',
 'axiom',
 'part',
 'range',
 'guy',
 'client',
 'atrium',
 'poem',
 'chameleon',
 'geometry',
 'amino acid',
 'sucker',
 'upright',
 'cranium',
 'buntline',
 'stile',
 'theorem',
 'Genesis',
 'conjunct',
 'topology',
 'authorization',
 'consequent',
 'Calgary',
 'Algenib',
 'Markab',
 'sumti',
 'legionary',
 'cordage',
 'purchasing',
 'Minneapolis',
 'Westphalian',
 'clergywoman',
 'self-awareness',
 'Sichuan pepper',
 'Manchuria',
 'disjunct',
 'trigram',
 'open set',
 'coalface',
 'ultrafilter',
 'onsen',
 'Deep Web',
 'Upper German',
 'Central German',
 'Dark Web',
 'Almach']

In [15]:
all_holonyms = sum([o['holonyms'] for o in data],[])

In [16]:
a,b,c = get_uniq_keys(all_holonyms)
a,b,c

(['word', 'topics', 'sense', 'tags'],
 {'word': ['kernel', 'Subsahara', 'Polynesia'],
  'sense': ["value of a function's variables at zero",
   'in logic',
   'structure housing bells'],
  'topics': [['Roman', 'Catholicism', 'Catholicism', 'Christianity'],
   ['Roman', 'Catholicism', 'Catholicism', 'Christianity'],
   ['Roman', 'Catholicism', 'Catholicism', 'Christianity']],
  'tags': [['Methodism', 'worldwide'],
   ['Eastern Orthodoxy', 'worldwide'],
   ['Eastern Orthodoxy', 'worldwide']]},
 {'word': 148, 'sense': 47, 'topics': 3, 'tags': 16})

In [14]:
data[10:13]

[{'_id': ObjectId('60544a9a11fe31d283deec58'),
  'word': 'room',
  'holonyms': [{'word': 'house'},
   {'word': 'building'},
   {'word': 'structure'},
   {'word': 'apartment'},
   {'word': 'home'},
   {'word': 'flat'},
   {'word': 'hotel'},
   {'word': 'hospital'}]},
 {'_id': ObjectId('60544a9b11fe31d283deee59'),
  'word': 'rice',
  'holonyms': [{'word': 'bhelpuri'},
   {'word': 'California roll'},
   {'word': 'dosa'},
   {'word': 'gumbo'},
   {'word': 'idli'},
   {'word': 'jambalaya'},
   {'word': 'khir'},
   {'word': 'mirin'},
   {'word': 'mochi'},
   {'word': 'nasi goreng'},
   {'word': 'onigiri'},
   {'word': 'pad thai'},
   {'word': 'paella'},
   {'word': 'pilaf'},
   {'word': 'pilau'},
   {'word': 'rangoli'},
   {'word': 'rijsttafel'},
   {'word': 'risotto'},
   {'word': 'sake'},
   {'word': 'samshu'},
   {'word': 'sushi'}]},
 {'_id': ObjectId('60544a9e11fe31d283def684'),
  'word': 'theory',
  'holonyms': [{'word': 'formal system', 'sense': 'in logic'}]}]

## hypernyms

keep

hypernyms string[]

In [17]:
data = query_words_by_key('hypernyms')
len(data)

935

In [18]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc29'),
  'word': 'day',
  'hypernyms': [{'word': 'month'},
   {'word': 'time'},
   {'word': 'week'},
   {'word': 'year'}]},
 {'_id': ObjectId('60544a9311fe31d283dedcd1'),
  'word': 'Sunday',
  'hypernyms': [{'word': 'day'}]},
 {'_id': ObjectId('60544a9411fe31d283dede70'),
  'word': 'abscissa',
  'hypernyms': [{'word': 'coordinate', 'sense': 'first of two coordinates'},
   {'word': 'axis', 'sense': 'horizontal line'}]}]

In [19]:
all_hypernyms = sum([o['hypernyms'] for o in data],[])
a,b,c = get_uniq_keys(all_hypernyms)
a,b,c

(['word', 'topics', 'sense', 'translation', 'tags'],
 {'word': ['month', 'time', 'week'],
  'sense': ['first of two coordinates', 'horizontal line', 'astronomy'],
  'tags': [['includes list of coordinate terms'],
   ['sometimes'],
   ['may or may not be married']],
  'translation': ['includes list of coordinate terms',
   "sibling of someone's parent",
   "sibling of someone's parent"],
  'topics': [['order', 'theory', 'mathematics'],
   ['finance', 'economics'],
   ['design', 'arts', 'lifestyle']]},
 {'word': 1521, 'sense': 483, 'tags': 91, 'translation': 39, 'topics': 5})

## hyphenation

keep

processing: unique

hyphenation

In [20]:
data = query_words_by_key('hyphenation')
len(data)

11462

In [21]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'word': 'encyclopaedia',
  'hyphenation': ['en‧cy‧clo‧pe‧dia']},
 {'_id': ObjectId('60544a9311fe31d283dedc1d'),
  'hyphenation': ['flocci‧nauci‧ni‧hili‧pili‧fi‧ca‧tion'],
  'word': 'floccinaucinihilipilification'},
 {'_id': ObjectId('60544a9311fe31d283dedc2d'),
  'word': 'denotation',
  'hyphenation': ['de‧no‧ta‧tion']}]

In [22]:
distribution, samples = len_distribution(data,'hyphenation')

In [23]:
distribution,samples

({1: 11294, 2: 124, 3: 29, 4: 15},
 {1: {'_id': ObjectId('60544a9311fe31d283dedc19'),
   'word': 'encyclopaedia',
   'hyphenation': ['en‧cy‧clo‧pe‧dia']},
  2: {'_id': ObjectId('60544a9311fe31d283deddb4'),
   'hyphenation': ['aba‧cus', 'aba‧ci'],
   'word': 'abacus'},
  4: {'_id': ObjectId('60544a9511fe31d283dee166'),
   'word': 'second',
   'hyphenation': ['sec‧ond', 'sec‧ond', 'sec‧ond;', 'sec‧ond']},
  3: {'_id': ObjectId('60544aa411fe31d283df01d2'),
   'word': 'postulate',
   'hyphenation': ['pos‧tu‧late;', 'pos‧tu‧late;', 'pos‧tu‧late']}})

## hyponyms

keep

strings in form to processs: 'See Thesaurus:book'

hyponyms

In [25]:
data = query_words_by_key('hyponyms')
len(data)

1600

In [None]:
data[10:13]

In [27]:
all_items = sum([o['hyponyms'] for o in data],[])
a,b,c = get_uniq_keys(all_items)
a,b,c

(['word', 'topics', 'sense', 'translation', 'tags'],
 {'word': ['See Thesaurus:book', 'bad hair day', 'Bastille Day'],
  'sense': ['domestic species', 'domestic species', 'domestic species'],
  'tags': [['Japanese'], ['Chinese'], ['structurally']],
  'translation': ['open a bit', 'young rooster', 'young'],
  'topics': [['countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences',
    'countries',
    'of',
    'Europe',
    'countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences'],
   ['countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences',
    'countries',
    'of',
    'Europe',
    'countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences'],
   ['countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences'

## inflection

discard

In [28]:
data = query_words_by_key('inflection')
len(data)

172

In [29]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedcd0'),
  'inflection': [{'template_name': 'en-conj-simple'}],
  'word': 'verb'},
 {'_id': ObjectId('60544a9311fe31d283deddc1'),
  'inflection': [{'template_name': 'en-conj-simple'}],
  'word': 'abhor'},
 {'_id': ObjectId('60544a9411fe31d283dede48'),
  'inflection': [{'1': 'es', 'template_name': 'en-conj-simple'}],
  'word': 'abolish'}]

## inflection_of

discard

In [30]:
data = query_words_by_key('inflection_of')
len(data)

9

In [31]:
data[:3]

[{'_id': ObjectId('60544a9b11fe31d283deefea'),
  'inflection_of': ['they'],
  'word': 'them'},
 {'_id': ObjectId('60544a9e11fe31d283def65b'),
  'inflection_of': ['the indefinite personal pronoun one'],
  'word': 'oneself'},
 {'_id': ObjectId('60544a9e11fe31d283def66c'),
  'inflection_of': ['he'],
  'word': 'himself'}]

## meronyms

discard

In [32]:
data = query_words_by_key('meronyms')
len(data)

92

In [33]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc7c'),
  'word': 'dialect',
  'meronyms': [{'word': 'idiolect'}]},
 {'_id': ObjectId('60544a9411fe31d283dee0d4'),
  'word': 'century',
  'meronyms': [{'word': 'cohort', 'sense': 'major unit of the Roman army'},
   {'word': 'maniple', 'sense': 'major unit of the Roman army'},
   {'word': 'legion', 'sense': 'major unit of the Roman army'}]},
 {'_id': ObjectId('60544a9511fe31d283dee1f4'),
  'word': 'flute',
  'meronyms': [{'word': 'fipple', 'sense': 'music'},
   {'word': 'labium', 'sense': 'music'}]}]

## proverbs
discard

In [None]:
data = query_words_by_key('proverbs')
data

## related
multi words

keep

action or environment related with word

related string[]

In [38]:
data = query_words_by_key('related')
len(data)

24297

In [40]:
all_items = []
for o in data:
    all_items.extend(o['related'])
    
a,b,c = get_uniq_keys(all_items)
a,b,c

(['word', 'topics', 'sense', 'translation', 'tags'],
 {'word': ['incunable', 'scroll', 'tome'],
  'tags': [['cases and words'],
   ['see more related terms'],
   ['Hebrew calendar months']],
  'topics': [['days', 'of', 'the', 'week', 'weekdays', 'time', 'property'],
   ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property'],
   ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']],
  'sense': ['13th-c. counterfeit coin',
   '13th-c. counterfeit coin',
   '13th-c. counterfeit coin'],
  'translation': ['thou', 'mathematics', 'mathematics']},
 {'word': 85171,
  'tags': 10433,
  'topics': 4399,
  'sense': 2649,
  'translation': 362})

In [39]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'related': [{'word': 'incunable'},
   {'word': 'scroll'},
   {'word': 'tome'},
   {'word': 'volume'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc25'),
  'word': 'portmanteau',
  'related': [{'word': 'List of portmanteau words defined in Wiktionary'},
   {'word': 'Wikipedia article on portmanteaus',
    'tags': ['cases and words']}]},
 {'_id': ObjectId('60544a9311fe31d283dedc29'),
  'word': 'day',
  'related': [{'word': 'day of the week',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': 'Sunday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': '\u200e Monday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': '\u200e Tuesday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'property']},
   {'word': '\u200e Wednesday',
    'topics': ['days', 'of', 'the', 'week', 'weekdays', 'time', 'p

## sounds

keep

need to clean data

remove keys:  text, homophone, enpr, other, audio

In [41]:
data = query_words_by_key('sounds')
len(data)

114966

In [51]:
def sounds_data_with_key(key_name):
    res = []
    for item in data:
        for sound in item['sounds'] :
            if key_name in sound.keys():
                res.append(item)
    return res

In [57]:
sounds_data_with_key('text')[:2]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'word': 'encyclopaedia',
  'sounds': [{'ipa': '/ənˌsəɪ.kləˈpi.di.ə/', 'tags': ['Canada']},
   {'ipa': '/ɪnˌsaɪ.kləˈpi(ː).di.ə/', 'tags': ['UK', 'US']},
   {'rhymes': '-iːdiə'},
   {'audio': 'en-us-encyclopedia.ogg', 'tags': ['US'], 'text': 'Audio (US)'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc1a'),
  'word': 'pies',
  'sounds': [{'ipa': '/paɪz/'},
   {'rhymes': '-aɪz'},
   {'audio': 'LL-Q1860 (eng)-Vealhurl-pies.wav',
    'tags': ['UK'],
    'text': 'Audio (UK)'}]}]

In [60]:
all_tags = set()
for item in all_items:
    if 'tags' in item.keys():
        all_tags.update(
            item['tags']
        )
len(all_tags), list(all_tags)

(880,
 ['postpositive',
  'Sense 2',
  'some speakers',
  'or a quintuplet',
  'Gaia hypothesis',
  'Australian English',
  'rare',
  'female given name',
  'California English',
  'past tense and past participle',
  'unstressed or',
  'Ontario',
  'especially in Dublin',
  'military',
  'found in the US',
  'Estuary English',
  'Slang',
  'New Jersey',
  'sense 1',
  'London',
  'dialects with the father-bother merger',
  '“to sleep”',
  'Estuary',
  'Midwestern US',
  'Rare',
  'CA synth',
  'dialectal',
  'US military command',
  'variant',
  'modern',
  'English Midlands',
  'Inland North',
  'US with /u/',
  'merged vowel',
  'Washington',
  "'a rabbit'",
  'sense 2',
  'US Inland North',
  'trap-bath split',
  'Western Canada',
  'in the medical sense',
  'Southern',
  'what he said',
  'yod coalescence',
  'laxly',
  'city in Washington',
  'possibly dated outside the local dialect',
  'now nonstandard in reference to the state',
  'no weak vowel merger',
  'US and Central Canad

In [None]:
tags_distr = {}
for item in all_items:
    if item.get('tags',None) is None:
        continue
    for tag in item.get('tags'):
        if tags_distr.get(tag) is None:
            tags_distr[tag] = 1
        else:
            tags_distr[tag] +=1
tags_distr

In [71]:
kvs = [(k,v) for k,v in tags_distr.items()]
kvs = sorted(kvs, key = lambda x: x[1], reverse=True)
tags_distr = {k:v for k,v in kvs}
tags_distr

{'US': 49316,
 'UK': 36016,
 'AU': 22623,
 'Received Pronunciation': 21615,
 'General American': 19667,
 'Canada': 2027,
 'GA': 1444,
 'General Australian': 1304,
 'RP': 904,
 'General New Zealand': 845,
 'cot-caught merger': 735,
 'without the horse-hoarse merger': 645,
 'verb': 488,
 'noun': 446,
 'Scotland': 392,
 'non-rhotic': 385,
 'rhotic': 373,
 'CA': 342,
 'adjective': 306,
 'Ireland': 284,
 'Northern England': 251,
 'phoneme': 244,
 'NYC': 201,
 'obsolete': 186,
 'unstressed': 171,
 'Northern California': 163,
 'AUS': 154,
 'weak vowel merger': 130,
 'pin-pen merger': 119,
 'stressed': 118,
 'Southern American English': 114,
 'Singapore': 106,
 'æ-tensing': 100,
 'dated': 92,
 'letter name': 87,
 'AAVE': 84,
 'also': 82,
 'Mary-marry-merry merger': 79,
 'New England': 77,
 'rare': 74,
 'nonstandard': 67,
 'California': 65,
 'General South African': 64,
 'without the wine-whine merger': 64,
 'Wales': 61,
 'dialectal': 59,
 'Australia': 55,
 'main allophones': 55,
 'India': 53,


In [99]:
general_tags = ['US','UK','AU','Received Pronunciation','General American','Canada']

In [93]:
# 36541 ['US','UK','AU','Received Pronunciation','General American','Canada']
# 53822 ['US','UK','Received Pronunciation','General American','Canada']  - AU
# 55407 ['US','UK','Received Pronunciation']
# 38031 ['US','UK','Received Pronunciation','AU']
# 37874 ['US','UK','Received Pronunciation','AU','Canada']
# 36682 ['US','UK','Received Pronunciation','AU','General American']
# 53973 ['US','UK','Received Pronunciation','General American']
# 64273 ['US','UK']

In [100]:
def test_sounds_with_general_tags():
    n_word_missing = 0
    examples = []
    for word in data:
        sounds = word['sounds']
        flag = False
        for sound in sounds:
            if sound.get('tags') is None:
                continue
            else:
                intersected = set(sound['tags']).intersection(general_tags)
                if len(intersected) > 0:
                    flag = True
        if not flag:
            n_word_missing +=1
            examples.append(word)
    return n_word_missing, examples

word_missing, examples = test_sounds_with_general_tags()

word_missing

64273

In [94]:
def sounds_without_tags():
    n_word_missing = 0
    examples = []
    for word in data:
        sounds = word['sounds']
        all_keys = set(
            sum(
                [list(s.keys()) for s in sounds],
                []
            )
        )
        if 'tags' not in all_keys:
            n_word_missing+=1
            examples.append(word)
    return n_word_missing, examples

In [96]:
def sounds_without_ipa():
    n_word_missing = 0
    examples = []
    for word in data:
        sounds = word['sounds']
        all_keys = set(
            sum(
                [list(s.keys()) for s in sounds],
                []
            )
        )
        if 'ipa' not in all_keys:
            n_word_missing+=1
            examples.append(word)
    return n_word_missing, examples

In [98]:
n2,ex2 = sounds_without_ipa()
n2, ex2[:10]

(25248,
 [{'_id': ObjectId('60544a9311fe31d283dedc2b'),
   'word': 'rain cats and dogs',
   'sounds': [{'audio': 'En-au-rain cats and dogs.ogg',
     'tags': ['AU'],
     'text': 'Audio (AU)'}]},
  {'_id': ObjectId('60544a9311fe31d283dedc6c'),
   'word': 'fabaceous',
   'sounds': [{'rhymes': '-eɪʃəs'}]},
  {'_id': ObjectId('60544a9311fe31d283dedc89'),
   'word': 'abacination',
   'sounds': [{'rhymes': '-eɪʃən'}]},
  {'_id': ObjectId('60544a9311fe31d283deddad'),
   'word': 'abluvion',
   'sounds': [{'rhymes': '-uːviən'}]},
  {'_id': ObjectId('60544a9411fe31d283dede0d'),
   'word': 'abray',
   'sounds': [{'rhymes': '-eɪ'}]},
  {'_id': ObjectId('60544a9411fe31d283dede86'),
   'word': 'crude oil',
   'sounds': [{'audio': 'En-uk-crude oil.ogg',
     'tags': ['UK'],
     'text': 'Audio (UK)'}]},
  {'_id': ObjectId('60544a9411fe31d283dedf2a'),
   'word': 'acceptilation',
   'sounds': [{'rhymes': '-eɪʃən'}]},
  {'_id': ObjectId('60544a9411fe31d283dedf31'),
   'word': 'accessive',
   'sounds': 

In [95]:
n,ex = sounds_without_tags()
n

35273

In [45]:
all_items = []
for o in data:
    all_items.extend(o['sounds'])
    
a,b,c = get_uniq_keys(all_items)

In [46]:
a,b,c

(['other', 'audio', 'homophone', 'ipa', 'tags', 'rhymes', 'enpr', 'text'],
 {'ipa': ['/ənˌsəɪ.kləˈpi.di.ə/', '/ɪnˌsaɪ.kləˈpi(ː).di.ə/', '/paɪz/'],
  'tags': [['Canada'], ['UK', 'US'], ['US']],
  'rhymes': ['-iːdiə', '-aɪz', '-aɪz'],
  'audio': ['en-us-encyclopedia.ogg',
   'LL-Q1860 (eng)-Vealhurl-pies.wav',
   'LL-Q1860 (eng)-Vealhurl-pies.wav'],
  'text': ['Audio (US)', 'Audio (UK)', 'Audio (UK)'],
  'homophone': ['book', 'book', 'book'],
  'enpr': ['bo͝ok', 'bo͞ok', 'bo͝ok'],
  'other': ['/j/', '/ˈfɛb.ɹi/', '/ɹə.ɹi/']},
 {'ipa': 147804,
  'tags': 157236,
  'rhymes': 44114,
  'audio': 65111,
  'text': 64840,
  'homophone': 11019,
  'enpr': 14691,
  'other': 1200})

In [42]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc19'),
  'word': 'encyclopaedia',
  'sounds': [{'ipa': '/ənˌsəɪ.kləˈpi.di.ə/', 'tags': ['Canada']},
   {'ipa': '/ɪnˌsaɪ.kləˈpi(ː).di.ə/', 'tags': ['UK', 'US']},
   {'rhymes': '-iːdiə'},
   {'audio': 'en-us-encyclopedia.ogg', 'tags': ['US'], 'text': 'Audio (US)'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc1a'),
  'word': 'pies',
  'sounds': [{'ipa': '/paɪz/'},
   {'rhymes': '-aɪz'},
   {'audio': 'LL-Q1860 (eng)-Vealhurl-pies.wav',
    'tags': ['UK'],
    'text': 'Audio (UK)'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc1b'),
  'word': 'pies',
  'sounds': [{'ipa': '/paɪz/'},
   {'rhymes': '-aɪz'},
   {'audio': 'LL-Q1860 (eng)-Vealhurl-pies.wav',
    'tags': ['UK'],
    'text': 'Audio (UK)'}]}]

In [43]:
distr,samples = len_distribution(data,'sounds')

In [44]:
distr, samples

({1: 50623,
  2: 23686,
  3: 16438,
  4: 10303,
  5: 5488,
  6: 3245,
  7: 1750,
  8: 1097,
  9: 721,
  10: 487,
  11: 326,
  12: 211,
  13: 154,
  14: 112,
  15: 58,
  16: 50,
  17: 49,
  18: 43,
  19: 18,
  20: 16,
  21: 28,
  22: 21,
  23: 4,
  24: 7,
  26: 3,
  27: 3,
  33: 4,
  36: 2,
  40: 2,
  43: 13,
  44: 2,
  48: 2},
 {4: {'_id': ObjectId('60544a9311fe31d283dedc19'),
   'word': 'encyclopaedia',
   'sounds': [{'ipa': '/ənˌsəɪ.kləˈpi.di.ə/', 'tags': ['Canada']},
    {'ipa': '/ɪnˌsaɪ.kləˈpi(ː).di.ə/', 'tags': ['UK', 'US']},
    {'rhymes': '-iːdiə'},
    {'audio': 'en-us-encyclopedia.ogg',
     'tags': ['US'],
     'text': 'Audio (US)'}]},
  3: {'_id': ObjectId('60544a9311fe31d283dedc1a'),
   'word': 'pies',
   'sounds': [{'ipa': '/paɪz/'},
    {'rhymes': '-aɪz'},
    {'audio': 'LL-Q1860 (eng)-Vealhurl-pies.wav',
     'tags': ['UK'],
     'text': 'Audio (UK)'}]},
  2: {'_id': ObjectId('60544a9311fe31d283dedc1e'),
   'word': 'livre',
   'sounds': [{'ipa': '/ˈliːvɹə/'}, {'ipa': '/ˈ

## synonyms
keep

data cleaning needed

In [102]:
data = query_words_by_key('synonyms')
len(data)

14563

In [103]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc20'),
  'word': 'book',
  'synonyms': [{'word': 'See Thesaurus:book'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc2f'),
  'word': 'thesaurus',
  'synonyms': [{'word': 'synonymicon'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc34'),
  'word': 'connotation',
  'synonyms': [{'word': 'intension'}]}]

In [105]:
all_items = sum([o['synonyms'] for o in data],[])

In [106]:
a,b,c = get_uniq_keys(all_items)
a

['word', 'topics', 'sense', 'translation', 'tags']

In [107]:
b,c

({'word': ['See Thesaurus:book', 'synonymicon', 'intension'],
  'sense': ['any member of the suborder (sometimes superfamily) Feliformia or Feloidea',
   'any member of the suborder (sometimes superfamily) Feliformia or Feloidea',
   'any member of the subfamily Felinae, genera Puma, Acinonyx, Lynx, Leopardus, and Felis)'],
  'tags': [['sensu stricto'], ['sensu stricto'], ['sensu stricto']],
  'translation': ['not informal', 'dialectal, obsolete', 'very rare'],
  'topics': [['country',
    'countries',
    'region',
    'geography',
    'location',
    'property',
    'natural-sciences',
    'sciences'],
   ['grammar', 'linguistics'],
   ['pool', 'games']]},
 {'word': 44403,
  'sense': 27851,
  'tags': 4730,
  'translation': 1134,
  'topics': 320})

In [104]:
dist,samples = len_distribution(data,'synonyms')
dist, samples

({1: 6179,
  2: 2775,
  3: 1732,
  4: 1129,
  5: 693,
  6: 516,
  7: 385,
  8: 296,
  9: 184,
  10: 144,
  11: 114,
  12: 91,
  13: 59,
  14: 52,
  15: 34,
  16: 36,
  17: 18,
  18: 27,
  19: 13,
  20: 8,
  21: 10,
  22: 10,
  23: 10,
  24: 8,
  25: 8,
  26: 9,
  27: 2,
  29: 2,
  30: 3,
  31: 1,
  32: 2,
  33: 1,
  34: 3,
  35: 1,
  36: 1,
  38: 1,
  41: 1,
  46: 2,
  52: 3},
 {1: {'_id': ObjectId('60544a9311fe31d283dedc20'),
   'word': 'book',
   'synonyms': [{'word': 'See Thesaurus:book'}]},
  12: {'_id': ObjectId('60544a9311fe31d283dedc41'),
   'word': 'cat',
   'synonyms': [{'word': 'feliform (carnivoran)',
     'sense': 'any member of the suborder (sometimes superfamily) Feliformia or Feloidea'},
    {'word': 'feloid (compare Caniformia, Canoidea)',
     'sense': 'any member of the suborder (sometimes superfamily) Feliformia or Feloidea'},
    {'word': 'feline cat',
     'sense': 'any member of the subfamily Felinae, genera Puma, Acinonyx, Lynx, Leopardus, and Felis)'},
    {'wor

## translations
keep

In [None]:
cns = [{'lang': 'Chinese',
    'code': 'yue',
    'tags': ['Cantonese'],
    'sense': 'news items',
    'word': '時事'},
   {'lang': 'Chinese',
    'code': 'yue',
    'tags': ['Cantonese'],
    'sense': 'news items',
    'word': '时事',
    'roman': 'si^4 si^6'},
   {'lang': 'Chinese',
    'code': 'cmn',
    'tags': ['Mandarin'],
    'sense': 'news items',
    'word': '時事'},
   {'lang': 'Chinese',
    'code': 'cmn',
    'tags': ['Mandarin'],
    'sense': 'news items',
    'word': '时事',
    'roman': 'shíshì'}]

In [11]:
data = query_words_by_key('translations')
len(data)

32146

In [122]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc23'),
  'word': 'portmanteau',
  'translations': [{'lang': 'Albanian',
    'code': 'sq',
    'sense': 'case',
    'word': 'baule',
    'tags': ['feminine']},
   {'lang': 'Arabic', 'code': 'ar', 'sense': 'case', 'word': 'حقيبَة سفر'},
   {'lang': 'Breton',
    'code': 'br',
    'sense': 'case',
    'word': 'mal',
    'tags': ['feminine']},
   {'lang': 'Breton',
    'code': 'br',
    'sense': 'case',
    'word': 'malizenn',
    'tags': ['feminine']},
   {'lang': 'Bulgarian',
    'code': 'bg',
    'sense': 'case',
    'word': 'голям кожен куфар',
    'roman': 'goljam kožen kufar'},
   {'lang': 'Czech', 'code': 'cs', 'sense': 'case', 'word': 'kontaminace'},
   {'lang': 'Dutch',
    'code': 'nl',
    'sense': 'case',
    'word': 'koffer',
    'tags': ['feminine']},
   {'lang': 'Dutch',
    'code': 'nl',
    'sense': 'case',
    'word': 'valies',
    'tags': ['feminine']},
   {'lang': 'Finnish', 'code': 'fi', 'sense': 'case', 'word': 'matkalaukku'},
 

In [125]:
all_items = []

for item in data:
    all_items.extend(item['translations'])

len(all_items)

1210485

In [128]:
def word_contains_chinese(k='lang',v='Chinese'):
    n_cns = 0
    for item in data:
        translations = item['translations']
        for trans in translations:
            if trans['lang'] == 'Chinese':
                n_cns += 1
                continue
    return n_cns
word_contains_chinese()

52484

In [129]:
word_contains_chinese(k='code',v='cmn'), word_contains_chinese(k='code',v='yue'), 

(52484, 52484)

In [13]:
def count_word_(handler):
    n_cns = 0
    for item in data:
        translations = item['translations']
        for trans in translations:
            if handler(trans):
                n_cns += 1
                continue
    return n_cns        

In [None]:
# mandarin 41391, cantonese 5296

In [19]:
# count mandarin
def _is_mandarin(item):
    return 'Mandarin' in item.get('tags',[])
print(
    count_word_(_is_mandarin)
)

def _is_cantonese(item):
    return 'Cantonese' in item.get('tags',[])
print(
    count_word_(_is_cantonese)
)

def _is_simplified_cns(item):
    return item['code'] == 'cmn' and Converter('zh-hans').convert(item['word']) == item['word']
print(
    count_word_(_is_simplified_cns)
)

def _is_traditional_cns(item):
    return item['code'] == 'cmn' and Converter('zh-hans').convert(item['word']) != item['word']

print(
    count_word_(_is_traditional_cns)
)

41391
5296
26552
15147


In [None]:
# simplified cns with pinyin 24928  code == cmn
# simplified cns with optional pinyin 26552    code == cmn
# simplified cns                     33669  lang==Chinese

In [None]:
def _cns_without_word(item):
    converted = Converter('zh-hans').convert(item['word'])
    res = item['lang'] == 'Chinese' and converted == item['word']
    if item['code'] == 'cmn' and converted != item['word']:
        print(item['word'])
#     if item['code'] == 'cmn' and converted == item['word'] and item.get('roman') is None:
#         print(item)
    return res
count_word_(_cns_without_word)

## troponyms

discard

In [119]:
data = query_words_by_key('troponyms')
len(data)

10

In [120]:
data[:3]

[{'_id': ObjectId('60544a9c11fe31d283def15b'),
  'word': 'sale',
  'troponyms': [{'word': 'cut-rate sale',
    'sense': 'selling of goods at reduced prices'},
   {'word': 'sales event', 'sense': 'selling of goods at reduced prices'},
   {'word': 'auction',
    'sense': 'act of putting up for auction to the highest bidder'},
   {'word': 'public sale',
    'sense': 'act of putting up for auction to the highest bidder'}]},
 {'_id': ObjectId('60544a9d11fe31d283def4a3'),
  'word': 'cut',
  'troponyms': [{'word': 'chop'},
   {'word': 'hack'},
   {'word': 'slice'},
   {'word': 'trim'}]},
 {'_id': ObjectId('60544a9e11fe31d283def7a7'),
  'word': 'sleep',
  'troponyms': [{'word': 'nap',
    'sense': 'rest in a state of reduced consciousness'},
   {'word': 'doze', 'sense': 'rest in a state of reduced consciousness'},
   {'word': 'snooze', 'sense': 'rest in a state of reduced consciousness'}]}]

## senses

In [272]:
data = query_words_by_key('senses')

In [39]:
data[:3]

[{'_id': ObjectId('60544a9311fe31d283dedc17'),
  'word': 'GNU FDL',
  'senses': [{'glosses': ['Initialism of GNU Free Documentation License.'],
    'tags': ['abbreviation', 'alt-of', 'initialism'],
    'alt_of': ['GNU Free Documentation License'],
    'id': 'GNU_FDL-name'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc18'),
  'word': 'Pope Julius',
  'senses': [{'categories': ['Card games'],
    'tags': ['obsolete'],
    'glosses': ['A sixteenth-century gambling card game about which little is known.'],
    'id': 'Pope_Julius-name'}]},
 {'_id': ObjectId('60544a9311fe31d283dedc19'),
  'word': 'encyclopaedia',
  'senses': [{'categories': ['Australian English', 'British English'],
    'tags': ['Australia', 'Britain', 'alt-of', 'alternative', 'chiefly'],
    'glosses': ['Alternative spelling of encyclopedia'],
    'alt_of': ['encyclopedia'],
    'id': 'encyclopaedia-noun'}]}]

In [273]:
all_items = []

for o in data:
    all_items.extend(o['senses'])

In [274]:
len(all_items)

1212639

In [44]:
a,b,c = get_uniq_keys(all_items)

In [50]:
b,c

({'glosses': [['Initialism of GNU Free Documentation License.'],
   ['A sixteenth-century gambling card game about which little is known.'],
   ['Alternative spelling of encyclopedia']],
  'tags': [['abbreviation', 'alt-of', 'initialism'],
   ['obsolete'],
   ['Australia', 'Britain', 'alt-of', 'alternative', 'chiefly']],
  'alt_of': [['GNU Free Documentation License'],
   ['encyclopedia'],
   ['gross domestic product']],
  'id': ['GNU_FDL-name', 'Pope_Julius-name', 'encyclopaedia-noun'],
  'categories': [['Card games'],
   ['Australian English', 'British English'],
   ['English pluralia tantum']],
  'form_of': [['pie'], ['pie'], ['pi']],
  'translations': [[{'lang': 'Chinese',
     'code': 'yue',
     'tags': ['Cantonese'],
     'sense': 'news items',
     'word': '時事'},
    {'lang': 'Chinese',
     'code': 'yue',
     'tags': ['Cantonese'],
     'sense': 'news items',
     'word': '时事',
     'roman': 'si^4 si^6'},
    {'lang': 'Chinese',
     'code': 'cmn',
     'tags': ['Mandarin'],


In [None]:
sense_keys = { 'derived',
  'translations',
  'wikidata',
  'hypernyms',
  'wikipedia',
  'id',
  'related',
  'form_of',
  'holonyms',
  'categories',
  'synonyms',
  'alt_of',
  'hyponyms',
  'coordinate_terms',
  'meronyms',
  'tags',
  'troponyms',
  'glosses',
  'senseid',
  'compound_of',
  'topics'}
sense_keys.difference(set(all_keys))

In [None]:
differeniated = {'alt_of',
 'compound_of',
 'form_of',
 'glosses',
 'id',
 'senseid',
 'tags',
 'topics',
 'wikidata'}

In [None]:
intersected = {'categories',
 'coordinate_terms',
 'derived',
 'holonyms',
 'hypernyms',
 'hyponyms',
 'meronyms',
 'related',
 'synonyms',
 'translations',
 'troponyms',
 'wikipedia'}

In [None]:
senses_key_examples

- 'glosses': `string[]`
- 'tags': `string[]`
- 'alt_of': `string[]`
- 'id': `string`
- 'categories': `string[]`
- 'form_of': `string[]`
- 'translations': `{lang:string, code: string, tags:string[], sense:string, word: string, roman: string}[]`
- 'related': `{word: string, tags?:string[]}[]`
- 'wikipedia': `string[]`
- 'synonyms': `{word: string}[]`
- 'topics': `string[]`
- 'derived': `{word: string}[]`
- 'senseid': `string[]`
- 'hyponyms': `{word: string, sense: string}[]`
- 'hypernyms': `{word: string}[]`
- 'coordinate_terms': `{word: string, sense?:string}[]`
- 'wikidata': `string[]`
- 'holonyms': `{word: string}[]`
- 'meronyms': `{word: string, topics: string[], sense: string, tags: string[]}[]`
- 'troponyms': `{word: string, sense: string}[]`
- 'compound_of': `string[]`

## check repeated words

In [None]:
data = query_words_by_key('word')
len(data)

In [None]:
all_words = [i['word'] for i in data]

In [None]:
all_words[:10]

In [None]:
len(set(all_words))

In [None]:
1021124-952502

In [None]:
data[0]

In [None]:
def get_repeated_words(data, words):
    repeated = []
    idxes_to_skip = set()
    for idx, item in enumerate(data):
        temp_repeated = []
        if idx in idxes_to_skip:
            continue
        for idx2, word in enumerate(words[idx+1:]):
            if item['word'] == word:
                idxes_to_skip.add(idx2)
                temp_repeated.append(item['_id'])
        if len(temp_repeated) > 0 :
            repeated.append(
                (item, temp_repeated)
            )
    return repeated

In [None]:
from multiprocessing import Pool, cpu_count
import math

In [None]:
def chunk_data(data, bs=10):
    sz =math.floor(len(data)/bs)
    res = []
    for i in range(bs):
        block = data[sz*i : sz*(i+1)] if i != bs-1 else data[sz*i :] 
        res.append(block)
    
    return res

In [None]:
res = []
num = 40
with Pool(num) as p:
    for i in range(num):
        chunked_words = chunk_data(all_words, num)
        chunked_data = chunk_data(data, num)
        r = p.apply_async(get_repeated_words, args=(chunked_data[i], chunked_words[i],))
        res.append(r.get())

In [None]:
res = sum(res,[])
len(res)

## handler

In [217]:
class NoneValueException(Exception):
    pass

def _check_none_value(item,key_name):
    data = item.get(key_name)
    if data is None:
        raise NoneValueException
    
    return data

def _uniq_list(expre):
    return list(set(expre))

def _extract_value_from_arr(arr, key_name, uniq=True):
    res = [o[key_name] for o in arr]
    if uniq:
        return _uniq_list(res)
    else:
        return res
    

    
def _map_handler(val):
    if isinstance(val, str):
        return val
    
    if isinstance(val, list):
        return _arr_handler(val)
    
    if isinstance(val, object):
        return _obj_handler(val)
    
def _arr_handler(item):
    if len(item) == 0: return item 
    
    if isinstance(item[0], str):
        return _uniq_list(item)
    if isinstance(item[0], list):
        return [_arr_handler(o) for o in item]
        
    if isinstance(item[0], object):
        return [_obj_handler(o) for o in item]
    
    return item
    
def _obj_handler(obj):
    return {
        k: _map_handler(v)
        for k,v in obj.items()
    }
    
    
def _uniq_arr_with_obj(arr,key_names):
    res = []
    for item in arr:
        item2 = {}
        for k,v in item.items():
            if k in key_names:
                item2[k] = _arr_handler(v)
            else:
                item2[k] = v
        res.append(item2)
    return res

# basic attribution
def lang_handler(item):
    return {
        'lang': item['lang']
    }

def lang_code_handler(item):
    return {
        'lang_code': item['lang_code']
    }

def pos_handler(item):
    return {
        'all_pos': [item['pos']] if isinstance(item['pos'], str) else item['pos']
    }

def word_handler(item):
    return {
        'word':item['word']
    }


def senses_handler(item):
    pass

# end basic attribution
    
def abbreviations_handler(item):
    '''
     alternative
    '''
    try:
        abbr = _check_none_value(item, 'abbreviations')
        return {
        'abbrs': _uniq_list(o['word'] for o in abbr),
    }
    except NoneValueException:
        return {}


def antonyms_handler(item):
    '''
    relation
    '''
    data = _check_none_value(item, 'antonyms')
    return {
        'antonyms': _uniq_list(o['word'] for o in data),
        'antonyms_data': _map_handler(data)
    }  
    
def categories_handler(item):
    '''
    description
    '''
    data = _check_none_value(item, 'categories')
    return {
        'categories': _uniq_list(data)
    }

def derived_handler(item):
    '''
    relation
    '''
    data = _check_none_value(item, 'derived')
    return {
        'derived': _uniq_list(o['word'] for o in data),
        'derived_data': _map_handler(data)
    }

def forms_handler(item):
    '''
    alternative
    '''
    data = _check_none_value(item,'forms')
    data = [
        o
        for o in data
        if 'error-unknown-tag' not in o.get('tags',[])
    ]
    return {
        'forms': _uniq_list(o['form'] for o in data),
        'form_data': _map_handler(data)
    }

def hypernyms_handler(item):
    '''
    relation
    '''
    data = _check_none_value(item, 'hypernyms')
    return {
        'hypernyms': _uniq_list(o['word'] for o in data),
        'hypernyms_data': _map_handler(data)
    }

def hyphenation_handler(item):
    '''
    
    '''
    data =_check_none_value(item, 'hyphenation')
    return {
        'hyphenation': _uniq_list(data)
    }
    
def hyponyms_handler(item):
    '''
    relation
    '''
    data = _check_none_value(item, 'hyponyms')
    return {
        'hyponyms': _uniq_list(o['word'] for o in data),
        'hyponyms_data': _map_handler(data)
    }


def related_handler(item):
    '''
    relation
    '''
    data =_check_none_value(item,'related')
    
    return {
        'related': _uniq_list(o['word'] for o in data),
        'related_data': _map_handler(data)
    }

    
def _sound_item_processor(sound_item):
    sounds_tags_to_keep ={'US','UK','AU','Received Pronunciation','General American','Canada'}
    sounds_keys_to_remove = [ 'text', 'homophone', 'enpr', 'other', 'audio']
    res =  {
        k:v   
        for k,v in sound_item.items()
        if k not in sounds_keys_to_remove
    }
    
    if bool(res) is False:
        return None
    if res.get('tags') is None:
        return res
    else:
        res['tags'] = list(
            set(res['tags']).intersection(sounds_tags_to_keep)
        )
        if len(res['tags']) == 0:
            res.pop('tags')
        return res
    
def sounds_handler(item):
    '''
        all keys: 'other', 'audio', 'homophone', 'ipa', 'tags', 'rhymes', 'enpr', 'text'
    '''
    data = _check_none_value(item,'sounds')
    data = [o for o in data if o.get('ipa') is not None]
    if len(data) == 0: return {}
    sounds = [
        _sound_item_processor(sound_item)
        for sound_item in data
        if _sound_item_processor(sound_item) is not None
    ]
    if len(sounds) == 0:
        return {}
    return {
        'sounds': _map_handler(sounds)
    }
    
def synonyms_handler(item):
    '''
        related
    '''
    data = _check_none_value(item, 'synonyms')
    return {
        'synonyms': _extract_value_from_arr(data,'word'),
        'synonyms_data': _map_handler(data)
    }


def translations_hanlder(item):
    data = _check_none_value(item,'translations')
    cns = [o 
           for o in data 
           if o['code'] == 'cmn'
          ]
    if len(cns) ==0:
        return {}
    
    cns = sorted(cns, key=lambda x: len(x.keys()), reverse=True)
    res = cns[0]
    res['word'] = Converter('zh-hans').convert(res['word'])
    res = {
        k:v
        for k,v in res.items()
        if k in ['word','roman','lang','code','tags']
    }
    return { 
        'zh_cn': res
    }

# discard 

def compounds_handler(item):
    return {}

def coordinate_terms_handler(item):
    return {}


def troponyms_handler(item):
    '''
    discard
    '''
    return {}

def wikipedia_handler(item):
    '''
    discard
    '''
    return {}

def inflection_handler(item):
    return {}

def inflection_of_handler(item):
    return {}


def meronyms_handler(item):
    return {}


def proverbs_handler(item):
    return {}


def heads_handler(item):
    return {}

def holonyms_handler(item):
    '''
    discard
    '''
    return {}

    data = _check_none_value(item,'holonyms')
    return {
        'holonyms': _uniq_list(o['word'] for o in data),
        'holonyms_data': _map_handler(data)
    }

# end discard



word_handler = {
 'abbreviations': abbreviations_handler,
 'antonyms': antonyms_handler,
 'categories': categories_handler,
 'compounds': compounds_handler,
 'coordinate_terms': coordinate_terms_handler,
 'derived': derived_handler,
 'forms':forms_handler,
 'heads': heads_handler,
 'holonyms':holonyms_handler,
 'hypernyms': hypernyms_handler,
 'hyphenation':hyphenation_handler,
 'hyponyms': hyponyms_handler,
 'inflection': inflection_handler,
 'inflection_of': inflection_of_handler,
 'lang': lang_handler,
 'lang_code': lang_code_handler,
 'meronyms': meronyms_handler,
 'pos': pos_handler,
 'proverbs': proverbs_handler,
 'related': related_handler,
 'senses': senses_handler,
 'sounds': sounds_handler,
 'synonyms': synonyms_handler,
 'translations': translations_hanlder,
 'troponyms': troponyms_handler,
 'wikipedia': wikipedia_handler,
 'word': word_handler
}

## sense handler

In [218]:
def alt_of_handler(item):
    data = _check_none_value(item,'alt_of')
    return {
        'alt_of': _uniq_list(data)
    }

def compound_of_handler(item):
    return {}


def form_of_handler(item):
    data = _check_none_value(item, 'form_of')
    return {
        'form_of': _uniq_list(data)
    }
    
def glosses_handler(item):
    data = _check_none_value(item, 'glosses')
    return {
        'glosses': _uniq_list(data)
    }
 
def id_handler(item):    
    data = _check_none_value(item, 'id')
    return {
        'sense_id': data
    }


def senseid_handler(item):
    return {}

def tags_handler(item):
    data = _check_none_value(item, 'tags')
    return {
        'tags': _uniq_list(data)
    } 

def topics_handler(item):
    data = _check_none_value(item, 'topics')
    return {
        'topics': _uniq_list(data)
    } 
def wikidata_handler(item):
    return {}

sense_handler = {
     'categories': categories_handler,
     'coordinate_terms':coordinate_terms_handler ,
     'derived': derived_handler,
     'holonyms': holonyms_handler,
     'hypernyms':hypernyms_handler ,
     'hyponyms': hyponyms_handler,
     'meronyms': meronyms_handler,
     'related': related_handler,
     'synonyms': synonyms_handler,
     'translations': translations_hanlder,
     'troponyms': troponyms_handler,
     'wikipedia': wikipedia_handler,
    
     'alt_of': alt_of_handler,
     'compound_of': compound_of_handler,
     'form_of': form_of_handler,
     'glosses': glosses_handler,
     'id': id_handler,
     'senseid': senseid_handler,
     'tags':tags_handler,
     'topics': topics_handler,
     'wikidata': wikidata_handler
}

In [219]:
def process_single_sense(sense, addons={}):
    res = {**addons}
    for k,v in sense.items():
        try:
            obj = sense_handler[k](sense)
            res.update(obj)
        except NoneValueException:
            pass
    return res

def process_single_word(word):
    senses = word['senses']
    _id = ObjectId()
    word_res = {
        '_id':_id
    }
    # word
    for k,v in word.items():
        try:
            if k in ['senses','_id']: continue
            word_res.update(
                word_handler[k](word)
            )
        except NoneValueException:
            pass
    
    # sense
    sense_addons = {
        'pos': word['pos'],
        'priority': 1,
        'word_id': _id
    }
    senses_res = [
        process_single_sense(s, addons=sense_addons)
        for s in senses
    ]
    return word_res, senses_res

## test

In [58]:
def sample_words(n=5):
    return list(dict_col.aggregate([
        {
            '$match':{
                
            }
        },
        {
            '$sample':{'size':n}
        }
    ]))

In [118]:
sw = sample_words(50)

In [119]:
for i,o in enumerate(sw):
    try:
        process_single_word(o)
    except Exception as e:
        print(i)
        raise e

## get duplicated words

In [8]:
all_words = query_words_by_key('pos')
len(all_words)

1021124

In [10]:
dic = {}
for item in progress_bar(all_words):
    word = item['word']
    if dic.get(word) is None:
        dic[word] = [(item['_id'], item['pos'])]
    else:
        dic[word].append(
            (item['_id'], item['pos'])
        )
        
dic = {
    k:v
    for k,v in dic.items()
    if len(v) > 1
}

In [11]:
len(dic.keys())

57303

In [25]:
temp_arr = sorted([
    (k, v)
    for k,v in dic.items()
],
key = lambda x: len(x[1]),
reverse = True
)
dic = {
    k:v
    for k,v in temp_arr
}

In [26]:
for k,v in list(dic.items())[:10]:
    print(k,len(v))

rout 13
a 13
stale 12
-er 12
dink 12
cat 11
pink 11
X 11
rake 11
rack 11


In [25]:
res = list(dict_col.find({
    'word':'rout'
}))

In [26]:
[len(o['senses']) for o in res]

[7, 1, 2, 2, 3, 1, 3, 2, 6, 2, 1, 1, 1]

In [27]:
res_nouns = sorted([o for o in res if o['pos'] =='noun'], key= lambda x: len(x['senses']), reverse=True)
res_verbs = sorted([o for o in res if o['pos'] =='verb'], key= lambda x: len(x['senses']), reverse=True)
len(res_nouns), len(res_verbs)

(6, 7)

In [28]:
[len(o['senses']) for o in res_nouns], [len(o['senses']) for o in res_verbs]

([7, 2, 2, 1, 1, 1], [6, 3, 3, 2, 2, 1, 1])

In [29]:
def _count_len_of_v(item):
    res = {}
    for k,v in item.items():
        if isinstance(v, list):
            res[k] = len(v)
        else:
            res[k] = v
    return res
res_nouns_cnt = [_count_len_of_v(o) for o in res_nouns]
res_verbs_cnt = [_count_len_of_v(o) for o in res_verbs]

In [30]:
res_nouns_cnt

[{'_id': ObjectId('60544ac611fe31d283df5cf2'),
  'pos': 'noun',
  'heads': 1,
  'forms': 1,
  'word': 'rout',
  'lang': 'English',
  'lang_code': 'en',
  'sounds': 43,
  'categories': 1,
  'translations': 13,
  'derived': 2,
  'senses': 7},
 {'_id': ObjectId('60544ac611fe31d283df5cf4'),
  'pos': 'noun',
  'heads': 1,
  'forms': 1,
  'categories': 2,
  'word': 'rout',
  'lang': 'English',
  'lang_code': 'en',
  'sounds': 43,
  'translations': 12,
  'senses': 2},
 {'_id': ObjectId('60544ac611fe31d283df5cf9'),
  'pos': 'noun',
  'heads': 1,
  'forms': 1,
  'categories': 4,
  'word': 'rout',
  'lang': 'English',
  'lang_code': 'en',
  'sounds': 43,
  'senses': 2},
 {'_id': ObjectId('60544ac611fe31d283df5cf7'),
  'pos': 'noun',
  'heads': 1,
  'forms': 1,
  'word': 'rout',
  'lang': 'English',
  'lang_code': 'en',
  'sounds': 43,
  'categories': 1,
  'senses': 1},
 {'_id': ObjectId('60544ac611fe31d283df5cfd'),
  'pos': 'noun',
  'heads': 1,
  'forms': 1,
  'word': 'rout',
  'lang': 'English

In [41]:
res_nouns[1]['categories']

['Colors', 'Military']

## handle word with multiple words

1. order by length of `senses`
2. get `pos` from parent
3. merge `forms` with `form_of`
3. merge `derived`

In [50]:
words_keys = [
 'abbreviations',
 'antonyms',
 'categories',
#  'compounds',
#  'coordinate_terms',
 'derived',
 'forms',
#  'heads',
#  'holonyms',
 'hypernyms',
 'hyphenation',
 'hyponyms',
#  'inflection',
#  'inflection_of',
 'lang',
 'lang_code',
#  'meronyms',
 'pos',
#  'proverbs',
 'related',
 'senses',
 'sounds',
 'synonyms',
 'translations',
#  'troponyms',
#  'wikipedia',
 'word']

In [220]:
def _merge_arr(*arrs):
    res = []
    for arr in arrs:
        res.extend(arr)
    res = set(json.dumps(o) for o in res)
    return [
        json.loads(o)
        for o in res
    ]

keys_to_merge = {
    'abbreviations',
    'antonyms',
    'categories',
    'derived',
    'forms',
    'hypernyms',
    'hyphenation',
    'hyponyms',
    'related',
    'sounds',
    'synonyms',
    'translations'
}
def merge_words(words):
    '''
        words should be ordered by length of senses
    '''
    main_word = {**words[0]}
    pos = list(set(o['pos'] for o in words))
    main_word['pos'] = pos
    
    for key in keys_to_merge:
        values = [
            o.get(key)
            for o in words[1:]
            if o.get(key) is not None
        ]
        res = _merge_arr(
            main_word.get(key,[]),
            *values
        )
        if len(res) >0:
            main_word[key] = res
    return main_word
        
            
            

keys_to_inherit = [
     'categories',
     'coordinate_terms',
     'derived',
     'hypernyms',
     'hyponyms',
     'related',
     'synonyms',
     'translations',
]

def _inherit(word,sense):
    res = {**sense}
    for k in keys_to_inherit:
        values = _merge_arr(res.get(k,[]), word.get(k,[]))
        if len(values) > 0:
            res[k] = values
            
    return res
    

def process_senses_with_multi_words(word, addons={}):
    '''
    
    '''
    senses = word['senses']
    word_forms = [
        o['form'] 
        for o in word.get('forms',[])
        if 'error-unknown-tag' not in o.get('tags',[])
    ]
    senses = [
        {
            **o,
            'form_of': o.get('form_if',[]) + word_forms
        }
        for o in senses
    ]
    attr_to_inherit = {
        k:v
        for k,v in word.items()
        if k in keys_to_inherit
    }
    
    senses = [
        _inherit(word, s)
        for s in senses
    ]
    
    return [
        process_single_sense(s,addons = addons)
        for s in senses
    ]
        
    
def _find_word_with_pos(words,pos):
    return [
        o
        for o in words
        if o['pos'] == pos
    ]

def _group_words_by_pos(sorted_words):
    poses = set(
        o['pos']
        for o in sorted_words
    )
    return [
        _find_word_with_pos(sorted_words,pos)
        for pos in poses
    ]
    
def process_multi_words(words):
    sorted_words = sorted(
        words, 
        key= lambda x: len(x['senses']), 
        reverse=True)
    main_word = merge_words(sorted_words)
    _id = ObjectId()
    word_res = {
        '_id':_id
    }
    # word
    for k,v in main_word.items():
        try:
            if k in ['senses','_id']: continue
            word_res.update(
                word_handler[k](main_word)
            )
        except NoneValueException:
            pass
    # word
    groups = _group_words_by_pos(sorted_words)
    
    # sense
    senses = []
    for grouped_words in groups:
        for idx,word in enumerate(grouped_words):
            sense_addons = {
                'pos': word['pos'],
                'priority': idx+1,
                'word_id':_id
            }
            if word.get('forms') is not None:
                sense_addons['form_data'] = forms_handler(word)['form_data']
                
            senses += process_senses_with_multi_words(word, addons=sense_addons)
    # sense
    return word_res, senses

## test

In [121]:
duplicated_words = list(dict_col.find({
    'word':'pitch'
}))

In [131]:
word,senses = process_multi_words(duplicated_words)
senses

[{'pos': 'verb',
  'priority': 1,
  'tags': ['transitive'],
  'glosses': ['To throw.'],
  'sense_id': 'pitch-verb-en:to_throw',
  'form_of': ['pitching', 'pitched', 'pitches'],
  'derived': ['pitchfork',
   'pitcher',
   'pitch upon',
   'pitch up',
   'pitch-a-fit',
   'pitch and putt',
   'pitch a tent',
   'pitch in'],
  'derived_data': [{'word': 'pitch and putt'},
   {'word': 'pitch upon'},
   {'word': 'pitch-a-fit'},
   {'word': 'pitch up'},
   {'word': 'pitcher'},
   {'word': 'pitchfork'},
   {'word': 'pitch in'},
   {'word': 'pitch a tent'}]},
 {'pos': 'verb',
  'priority': 1,
  'categories': ['Baseball'],
  'tags': ['transitive', 'intransitive'],
  'topics': ['baseball', 'sports'],
  'glosses': ['To throw (the ball) toward a batter at home plate.'],
  'sense_id': 'pitch-verb-en:baseball:_to_throw_the_ball_toward_home_plate',
  'form_of': ['pitching', 'pitched', 'pitches'],
  'derived': ['pitchfork',
   'pitcher',
   'pitch upon',
   'pitch up',
   'pitch-a-fit',
   'pitch and p

In [None]:
word

In [None]:
exclude_keys = ['pos', 'word', 'lang', 'lang_code', 'derived','translations','_id','heads','sounds']
[
    {k:v for k,v in item.items() if k not in exclude_keys}
    for item in duplicated_words
]

In [None]:
[o['forms'] for o in duplicated_words]

## create dict collection

In [213]:
all_words = query_words_by_key('pos')
len(all_words)

1021124

In [214]:
repeated_dict = {}
for item in progress_bar(all_words):
    word = item['word']
    if repeated_dict.get(word) is None:
        repeated_dict[word] = [(item['_id'], item['pos'])]
    else:
        repeated_dict[word].append(
            (item['_id'], item['pos'])
        )
        
repeated_dict = {
    k:v
    for k,v in repeated_dict.items()
    if len(v) > 1
}
repeated_words = list(repeated_dict.keys())
len(repeated_words)

57303

In [215]:
repeated_words = set(repeated_words)

In [216]:
len(repeated_words)

57303

In [221]:
from datetime import datetime
from bson import ObjectId
def query_by_word(*words):
    return list(
    dict_col.find({
        'word':{
            '$in':words
        }
    },{
    k:1
    for k in words_keys
})
)

def get_date():
    return {
        'created_at':datetime.now(),
        'updated_at':datetime.now()
    }

In [200]:
# for item in progress_bar(all_words):
#     word = item['word'] 
#     is_multi = word in repeated_words
#     data = query_by_word(word)
#     for d in data:
#         d.pop('_id')
#     func = process_multi_words if is_multi else process_single_word
#     n_word,senses = func(
#         data if is_multi else data[0]
#     )
#     inserted_id = m_dict_col.insert_one({
#         **n_word,
#         **get_date()
#     }).inserted_id
    
#     m_sense_col.insert_many([
#         {**s, 'word_id':inserted_id, **get_date()}
#         for s in senses
#     ])

In [265]:
all_res_senses = []
def insert_data(words,senses):
#     global all_res_senses
#     all_res_senses += senses
    m_dict_col.insert_many([{
        **word,
        **get_date()
    }
        for word in words
    ])
    
    m_sense_col.insert_many([
        {**s, **get_date()}
        for s in senses
    ])

def handle_multi_single(word_datas):
    """
    [{},{}]
    """
    all_senses = []
    all_words = []
    for word in word_datas:
        wd,senses = process_single_word(word)
        all_words.append(wd)
        all_senses += senses
    return all_words, all_senses
    
def handle_multi_multi(word_datas):
    dic = {}
    for data in word_datas:
        if dic.get(data['word']) is None:
            dic[data['word']] = [data]
        else:
            dic[data['word']].append(data)
    all_senses = []
    all_words = []
    for words in dic.values():
        word, senses = process_multi_words(words)
        all_words.append(word)
        all_senses += senses
    return all_words, all_senses
    
def create_db(words):
    w_set = set(words)
    single_words = w_set.difference(repeated_words)
    multi_words = w_set.intersection(repeated_words)
    data = query_by_word(*w_set)
    for d in data:
        d.pop('_id')
    s_word_data = [
        d
        for d in data
        if d['word'] in single_words
    ]
    m_word_data = [
        d
        for d in data
        if d['word'] in multi_words
    ]
#     print(len(s_word_data),len(m_word_data),s_word_data, m_word_data)
    w1s,s1s = handle_multi_single(s_word_data)
    w2s, s2s = handle_multi_multi(m_word_data)
    insert_data(
        [*w1s,*w2s],
        [*s1s,*s2s]
    )

In [270]:
import math
def chunk_data(data, bs=10):
    sz =math.floor(len(data)/bs)
    res = []
    for i in range(bs):
        block = data[sz*i : sz*(i+1)] if i != bs-1 else data[sz*i :] 
        res.append(block)
    
    return res

In [269]:
m_dict_col.delete_many({})
m_sense_col.delete_many({})

<pymongo.results.DeleteResult at 0x7ff58d8981e0>

In [268]:
pure_words = list(set(o['word'] for o in all_words))
len(pure_words)

952502

In [271]:
bs=500
chunked = chunk_data(pure_words,bs=bs)
for words in progress_bar(chunked):
    create_db(words)