In [1]:
from __future__ import print_function
import numpy as np
from nltk.stem import SnowballStemmer
from gensim.models import Word2Vec
from gensim import corpora
from collections import defaultdict
import sys
import pickle
import os

UNDEFINED_TOKEN = "undefined_token"
MAX_WORD_LENGTH = 20
UNDEFINED = "_"
stemmer = SnowballStemmer("russian")

class LowerSentencesWithoutStops(object):
    def __init__(self, fnames, token2token, stops):
        self.fnames = fnames        
        self.token2token = token2token         
        
    def __iter__(self):
        for fname in self.fnames:
            for line in open(fname, 'r', encoding="utf8"):            
                yield [self.token2token.get(token, UNDEFINED_TOKEN)
                       for token in line.lower().split()]
                
                

    
def split_word(word, stemmer):
    flex = word[len(stemmer.stem(word)):]
    if len(flex):
        return word[:-len(flex)], flex
    return word, "empty"


def build_vocab(sentences, min_freq=0, max_size=10000, undefined_id=0):
    """ 
    Строит словарь из слов встертившихся более min_freq раз,
    но размеров  не более max_size, в случае бОльшего количества токенов
    отбрасываются менее частотные токены, undefined_id - id первого токена в словаре,
    который будет называться "undefined_token"
    """
    offset = undefined_id
    token2id = {UNDEFINED_TOKEN: offset}
    id2token = {offset: UNDEFINED_TOKEN}    
    
    counter = defaultdict(int)    
    for sentence in sentences:
        for token in sentence:
            counter[token] += 1
    sorted_tokens = [t_f[0]  for t_f in 
                     sorted([t_f for t_f in counter.items() if t_f[1] >= min_freq],
                           key=lambda tf: -tf[1])]                     
    
    for token in sorted_tokens[:max_size - len(token2id)]:
        offset += 1
        token2id[token] = offset
        id2token[offset] = token
    return token2id, id2token 


def build_ch_vocab(text, min_freq=0, max_size=100, undefined_id=0):
    """ 
    Строит словарь из слов встертившихся более min_freq раз,
    но размеров  не более max_size, в случае бОльшего количества токенов
    отбрасываются менее частотные токены, undefined_id - id первого токена в словаре,
    который будет называться "undefined_token"
    """
    offset = undefined_id
    token2id = {UNDEFINED_TOKEN: offset}
    id2token = {offset: UNDEFINED_TOKEN}    
    
    counter = defaultdict(int)    
    for token in text:
        counter[token] += 1
        
    sorted_tokens = [t_f[0]  for t_f in 
                     sorted([t_f for t_f in counter.items() if t_f[1] >= min_freq],
                           key=lambda tf: -tf[1])]                     
    
    for token in sorted_tokens[:max_size - len(token2id)]:
        offset += 1
        token2id[token] = offset
        id2token[offset] = token
    return token2id, id2token  




def read_gikrya(path):
    """
    Reading format:
    row_index<TAB>form<TAB>lemma<TAB>POS<TAB>tag
    """
    
    morpho_map = {"POS":{UNDEFINED: 0, 
                         0: UNDEFINED}}
    sentences = []
    vocab = {}    
    with open(path, 'r') as f:
        
        sentence = []
        for line in f:
            splits = line.strip().split('\t')      
            if len(splits) == 4:
                splits.insert(0, 1)
            if len(splits) == 5:
                form, lemma, POS, tags = splits[1:]
                if POS not in  morpho_map["POS"]:
                    morpho_map["POS"][POS] = len(morpho_map["POS"]) // 2 
                    morpho_map["POS"][morpho_map["POS"][POS]] =  POS
                tags_list = [("POS", POS)]
                if tags != "_":
                    for tag_val in tags.split("|"):
                        tag, val = tag_val.split("=")
                        tags_list.append((tag, val))
                        if tag not in morpho_map:
                            morpho_map[tag] = {UNDEFINED: 0,
                                               0: UNDEFINED}
                        if val not in morpho_map[tag]:
                            morpho_map[tag][val] = len(morpho_map[tag]) // 2 
                            morpho_map[tag][morpho_map[tag][val]] = val
                if form not in vocab:
                    vocab[form] = form
                sentence.append((vocab[form], lemma, tags_list) )
            elif len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
    return sentences, morpho_map 


def read_corpus(path):
    sentences = []
    with open(path, 'r') as f:
        for line in f:
            sentences.append(line.strip().lower().split())
    return sentences
        
    
def write_vecs(path, vecs_path, id2token, w2v_model):
    # косяк с тем чтон undefined token не 0
    vecs = np.zeros(shape=(len(token2id), w2v_model.vector_size))
    with open(path, 'w') as f:
        for tid in range(len(id2token)):
            vecs[tid, :] = w2v_model[id2token[tid]]
            f.write(id2token[tid])
            f.write("\n")
    np.save(vecs_path, vecs)
    

def preproc_dataset(full_tag_sentences, stemmer):    
    sentences = []
    flexes = []
    token_tags = []
    
    for sent in full_tag_sentences:
        temp_sent = []
        temp_flexes = []
        for token_info in sent:
            token = token_info[0].lower()          
            splits = split_word(token, stemmer)
            temp_sent.append(splits[0])
            temp_flexes.append(splits[1])
            token_tags.append(token_info[2])  # надо бы переделать под стиль sentences или?          
        sentences.append(temp_sent)
        flexes.append(temp_flexes)    
    return sentences, flexes, token_tags


def get_tokens(sentences):
    tokens = []
    for sent in sentences:
        for token in sent:
            tokens.append(token)
    return tokens
    
    
def preproc_files(fnames):
    sentences_full = []
    for fname in fnames:
        s_full, _ = read_gikrya(fname)
        sentences_full = sentences_full + s_full
    return sentences_full
            
    

In [192]:
X.max()

NameError: name 'X' is not defined

In [2]:
path_to_tagged = "../morphoRuEval-2017/Baseline/source/gikrya_train.txt"
path_to_write_morpho = "../models/morpho.pickle"
sentences_full, morpho_map = read_gikrya(path_to_tagged)
cat_order = sorted([key for key in morpho_map.keys()])
pickle.dump((morpho_map, cat_order), open(path_to_write_morpho, 'wb'))

In [3]:
fnames = ["../JointMorphoClosed.txt", 
          "../morphoRuEval-2017/test_collection/VK.txt",
         "../morphoRuEval-2017/test_collection/JZ.txt",
          "../morphoRuEval-2017/test_collection/Lenta.txt"]

sentences_full = preproc_files(fnames)

In [15]:
# morpho_map
# !head  "../JointMorphoClosed.txt"
# len(stem_modeiil.vocab)
sentences, flexes, token_tags = preproc_dataset(sentences_full, stemmer)

In [4]:
char2id, id2char = build_ch_vocab(open("../JointMorphoClosed.txt", "r").read().lower(),
                                  max_size=80)

In [5]:
with open("char2id", "w") as f:
    for i in range(len(char2id)):
        f.write("{}\n".format(id2char[i]))


In [292]:
sorted([ch for ch in char2id.keys()])

['\t',
 '\n',
 '!',
 '"',
 '#',
 '%',
 '&',
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'undefined_token',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '«',
 '»',
 'а',
 'б',
 'в',
 'г',
 'д',
 'е',
 'ж',
 'з',
 'и',
 'й',
 'к',
 'л',
 'м',
 'н',
 'о',
 'п',
 'р',
 'с',
 'т',
 'у',
 'ф',
 'х',
 'ц',
 'ч',
 'ш',
 'щ',
 'ъ',
 'ы',
 'ь',
 'э',
 'ю',
 'я',
 'ё',
 '–',
 '—',
 '•',
 '…',
 '№']

In [174]:
!cat char2id | wc -l

101


In [7]:
len(sentences_full)

363470

In [None]:
stem_path = '../for_embedding/allTexts_stemmas.txt'
flex_path = '../for_embedding/allTexts_flexias.txt'

stemmas = read_corpus(stem_path)
flexias = read_corpus(flex_path)

In [10]:
stemmas = get_tokens(stemmas)
flexias = get_tokens(flexias)
# len(set(stemmas)), len(set(flexias))

In [16]:
len(set(stemmas)), len(set(flexias))

(612715, 768)

In [18]:
token2id, id2token = build_vocab(stemmas, 
                                 min_freq=1,
                                 max_size=80000)

flex2id, id2flex = build_vocab(flexias, 
                               min_freq=2, 
                               max_size=500)

len(token2id), len(flex2id)

(1514, 26)

In [None]:
stem2stem = {}
for stem in token2id.keys():
    stem2stem[stem] = stem
    
flex2flex = {}
for flex in flex2id.keys():
    flex2flex[flex] = flex

new_sents = [[stem2stem.get(token, UNDEFINED_TOKEN) for token in sent] for sent in sentences]
new_flexes = [[flex2flex.get(token, UNDEFINED_TOKEN) for token in sent] for sent in flexes]
len(stem2stem), len(flex2flex)

In [189]:
del stem_sentences, flex_sentences

In [167]:
# stem_sentences = list(LowerSentencesWithoutStops([stem_path], stem2stem, set([])))
# stem_model = Word2Vec(stem_sentences, size=128, sg=1, workers=5, iter=10, min_count=1)
stem_model = Word2Vec(new_sents, size=200, sg=1, workers=5, iter=10, min_count=1)

In [96]:
new_sents[0]
# stem_model.vector_size
# print([stiiem for stem, i in token2id.items() if stem not in stem_model])
# x = [flex for flex, i in flex2id.items() if flex not in flex_model]

['undefined_token',
 35,
 127,
 7,
 1199,
 3,
 31520,
 38671,
 1,
 16489,
 556,
 4598,
 50,
 9006,
 1,
 9,
 100,
 89,
 23]

In [182]:
# flex_sentences = list(LowerSentencesWithoutStops([flex_path], flex2flex, set([])))
# flex_model = Word2Vec(flex_sentences, size=128, sg=1, workers=5, iter=10, min_count=1)
flex_model = Word2Vec(new_flexes, size=128, sg=1, workers=5, iter=10, min_count=1)

In [183]:
prefix = "../models"


write_vecs(os.path.join(prefix,"stem2id"),
           os.path.join(prefix, "stem_embeddings"),
           id2token, stem_model)


write_vecs(os.path.join(prefix, "flex2id"), 
           os.path.join(prefix, "flex_embeddings"),
           id2flex, flex_model)

In [None]:
len(id2token)

In [None]:
"asd asd asd\n".split()

In [111]:
token2id[UNDEFINED_TOKEN]

1

In [127]:
!head "../models/stem2id"

undefined_token
,
.
и
в
не
-
на
эт
что


In [117]:
id2flex[2]

'empty'

{'e', 'q', 'r', 'w'}