In [1]:
## dictionary parser
import pandas as pd
import json
import pickle
import os
import unidecode

pd.set_option('display.max_columns', 100)
current_directory = r'C:\Users\LG\Desktop\rueng'

## lemmatizer test
import spacy
from spacy.lang.ru.examples import sentences 

## database save
import sqlite3

### 0. Dump File Json Parsing 

In [2]:
## dictionary file parsing

## read as txt file
dump_path = r'kaikki.org-dictionary-Russian.json'
target_path = os.path.join(current_directory, dump_path)

json_dic = []

with open(target_path, newline='\n', encoding='UTF-8') as file:
    dictionary_raw = file.readlines()

## json parsing
dictionary_raw_json = [json.loads(r) for r in dictionary_raw]

In [3]:
set([dic['pos'] for dic in dictionary_raw_json])

{'adj',
 'adv',
 'ambiposition',
 'character',
 'combining_form',
 'conj',
 'det',
 'infix',
 'interfix',
 'intj',
 'name',
 'noun',
 'num',
 'particle',
 'phrase',
 'postp',
 'prefix',
 'prep',
 'prep_phrase',
 'pron',
 'proverb',
 'punct',
 'suffix',
 'symbol',
 'verb'}

In [4]:
def dictionary_parser(pos_input):
    ## dictionary

    dictionary_dump = [n for n in dictionary_raw_json if n['pos'] == pos_input]

    canonical_list= [c['word'] for c in dictionary_dump]

    declension_list = []
    canonical_list = []
    meaning_list = []
    
    for dump in dictionary_dump:


        try:
            meanings = dump['senses'][0]['links']
        except KeyError:
            try:
                meanings = dump['senses'][0]['glosses']
            except KeyError:
                continue

        try:
            declensions = [f for f in dump['forms'] if len(f) == 4]
        except KeyError:
            continue

        canonical_words = dump['word']
        

        canonical_list.extend([{'canonical_form':canonical_words}]*len(declensions))
        declension_list.extend(declensions)
        meaning_list.extend([{'meaning':meanings}]*len(declensions))



    ## dictionary table
    declension_table = pd.DataFrame(declension_list)[['form', 'tags']]
    canonical_table = pd.DataFrame(canonical_list)
    meaning_table = pd.DataFrame(meaning_list)
    
    dictionary_table = pd.concat([declension_table, canonical_table, meaning_table], axis=1)

    ## dictionary table preprocessing
    dictionary_table['tags'] = ['_'.join(t) for t in dictionary_table.tags]

    for idx in range(dictionary_table.shape[0]):
        dictionary_table.iloc[idx, 3] = '; '.join([list(set(m))[0] for m in dictionary_table.iloc[idx, 3]])

    dictionary_table = dictionary_table[['tags', 'form', 'meaning', 'canonical_form']]
    
    return dictionary_table

In [5]:
## clean_form column
def strip_stress_marks(text):
   b = text.encode('utf-8')
   # correct error where latin accented ó is used
   b = b.replace(b'\xc3\xb3', b'\xd0\xbe')
   # correct error where latin accented á is used
   b = b.replace(b'\xc3\xa1', b'\xd0\xb0')
   # correct error where latin accented é is used
   b = b.replace(b'\xc3\xa0', b'\xd0\xb5')
   # correct error where latin accented ý is used
   b = b.replace(b'\xc3\xbd', b'\xd1\x83')
   # remove combining diacritical mark
   b = b.replace(b'\xcc\x81',b'').decode()
   return b

### 1. Nouns, Adjectives, Verbs

In [6]:
noun_table = dictionary_parser('noun')
noun_table['pos'] = 'noun'

adj_table = dictionary_parser('adj')
adj_table['pos'] = 'adj'

verb_table = dictionary_parser('verb')
verb_table['pos'] = 'verb'

russian_dictionary = pd.concat([noun_table, adj_table, verb_table], axis = 0)

In [7]:
russian_dictionary.head()

Unnamed: 0,tags,form,meaning,canonical_form,pos
0,nominative_singular,соба́ка,dog,собака,noun
1,nominative_plural,соба́ки,dog,собака,noun
2,genitive_singular,соба́ки,dog,собака,noun
3,genitive_plural,соба́к,dog,собака,noun
4,dative_singular,соба́ке,dog,собака,noun


### 2. final dictionary table

In [8]:
## word_canonical table

canonical_words = list(set(russian_dictionary.canonical_form))
word_id = list(range(len(canonical_words)))

word_canonical = pd.DataFrame({'canonical_id': word_id,'canonical_form':canonical_words})
word_canonical = word_canonical.merge(russian_dictionary[['canonical_form','pos', 'meaning']].drop_duplicates(['canonical_form']))
## word_declension table

word_declension = russian_dictionary.merge(word_canonical, on = ['canonical_form'])
word_declension['declension_id'] = list(range(word_declension.shape[0]))


word_declension['clean_form'] = [strip_stress_marks(f) for f in word_declension.form]
word_declension = word_declension[['declension_id', 'form', 'clean_form', 'tags', 'canonical_id']]

In [9]:
word_canonical.head()

Unnamed: 0,canonical_id,canonical_form,pos,meaning
0,0,гнома,noun,n
1,1,Танин,adj,Tanya
2,2,тренировавший,verb,тренировать#Russian
3,3,анкета,noun,questionnaire; form
4,4,животновод,noun,stockbreeder; livestock; breeder


In [10]:
word_declension.head()

Unnamed: 0,declension_id,form,clean_form,tags,canonical_id
0,0,соба́ка,собака,nominative_singular,27925
1,1,соба́ки,собаки,nominative_plural,27925
2,2,соба́ки,собаки,genitive_singular,27925
3,3,соба́к,собак,genitive_plural,27925
4,4,соба́ке,собаке,dative_singular,27925


In [11]:
## add verb_conjugate column to word_canonical table
verb_collection = pd.read_csv('RussianVerbsClassification.csv',delimiter=';')
verb_collection = verb_collection[['Инфинитив', 'Совершенный вид','Пара аспектов']]
verb_collection.columns = ['canonical_form', 'verb_aspect', 'imperfective_perfective']

## merge to word_conanical
word_canonical = word_canonical.merge(verb_collection, how='left')

word_canonical.head()

Unnamed: 0,canonical_id,canonical_form,pos,meaning,verb_aspect,imperfective_perfective
0,0,гнома,noun,n,,
1,1,Танин,adj,Tanya,,
2,2,тренировавший,verb,тренировать#Russian,,
3,3,анкета,noun,questionnaire; form,,
4,4,животновод,noun,stockbreeder; livestock; breeder,,


In [12]:
## same canonical_id (i.e. same word) but with different imperfective/perfective info => concatenate imperfective/perfective strings and
## make 2 rows into 1 row

## find duplicate rows with only imperfective/perfective row different
word_canonical_duplicates = word_canonical.groupby(['canonical_id'], group_keys=False).apply(lambda word_canonical:word_canonical if word_canonical.shape[0] > 1 else None)

duplicates_preprocessed = word_canonical_duplicates.groupby(['canonical_id']).apply(lambda x: '; '.join(x['imperfective_perfective'])).reset_index()
duplicates_preprocessed.columns = ['canonical_id', 'imperfective_perfective']

## update imperfective/perfective row info and drop duplicates 
for id_, imperfective_perfective in zip(duplicates_preprocessed.canonical_id, duplicates_preprocessed.imperfective_perfective):
    word_canonical.loc[word_canonical.canonical_id == id_]['imperfective_perfective'] = imperfective_perfective

word_canonical.drop_duplicates(['canonical_id'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  word_canonical.loc[word_canonical.canonical_id == id_]['imperfective_perfective'] = imperfective_perfective


### 4. ru_dictionary to sqlite3

In [26]:
## save to DB - word_canonical

conn = sqlite3.connect('rueng.sqlite3')
cursor = conn.cursor()

## word_canonical
canonical_tuples = [(id_, canonical_form, pos, meaning, aspect_type, aspect_word) for id_, canonical_form, pos, meaning, aspect_type, aspect_word  in zip(word_canonical.canonical_id, word_canonical.canonical_form, word_canonical.pos, word_canonical.meaning, word_canonical.verb_aspect, word_canonical.imperfective_perfective)]
cursor.executemany("INSERT INTO word_canonical VALUES (?, ?, ?, ?, ?, ?)", canonical_tuples)

## word_declension
declesion_tuples = [(id_, form, clean_form, tag, canonical_id) for id_, form, clean_form, tag, canonical_id in zip(word_declension.declension_id, word_declension.form, word_declension.clean_form ,word_declension.tags, word_declension.canonical_id)]
cursor.executemany("INSERT INTO word_declension VALUES (?, ?, ?, ?, ?)", declesion_tuples)

## commit
conn.commit()
conn.close()