In [3]:
import sys
import gensim, logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [29]:
import math
import numpy

In [31]:
import os
import codecs

### Выделение контекстов, нахождение векторов для них

In [4]:
import zipfile
model_file = "C:\\Users\\boss\\Documents\\Diploma\\180.zip"
with zipfile.ZipFile(model_file, 'r') as archive:
    stream = archive.open('model.bin')
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

2019-04-25 02:34:16,665 : INFO : loading projection weights from <zipfile.ZipExtFile name='model.bin' mode='r' compress_type=deflate>
2019-04-25 02:34:25,544 : INFO : loaded (189193, 300) matrix from <zipfile.ZipExtFile [closed]>


In [17]:
keys = ["dom", "glava", "luk", "organ", "vid"]
path = "C:\\Users\\boss\\Documents\\Diploma\\RNC_Subcorpus\\!raznoje\\preprocessed\\without_some_POS\\"

In [25]:
translit_raznoje = {"dom":"дом",
"glava":"глава",
"luk":"лук",
"organ":"орган",
"vid":"вид"}

In [38]:
from collections import Iterable

def flatten_list(items):
    """Yield items from any nested iterable; see Reference."""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten_list(x):
                yield sub_x
        else:
            yield x

In [40]:
# класс для контекстов
class ContextClass(object):
    
    def __init__(self, context, sem_set, target_word):
        self.context = context
        self.sem_set = sem_set
        self.target_word = target_word
        self.vector = None  # пока что 
    
    def __repr__(self):
        list_to_return = [self.sem_set, self.target_word, str(self.context)]
        return " - ".join(list_to_return)

In [44]:
# получаем список контекстов

contexts_dict = dict()
windows_size = 5
vector_dimensions = 300

for key in translit_raznoje:
    # сначала просто список контекстов
    contexts_dict[key] = []
    target_word = translit_raznoje[key] + "_NOUN"
    files = os.listdir(path + key + "\\")
    for file in files:
        sems = file[4:-4]  # without the "NEW_" and ".txt"
        with codecs.open(path + key + "\\" + file, "r", "utf-8") as input_file:
            lines = input_file.read().split("\n")
        for line in lines:
            if len(line.split()) < 3:  # совсем короткие строки нам не нужны
                continue
            else:
                contexts = get_contexts(target_word, windows_size, line)
                for context in contexts:
                    contexts_dict[key].append(ContextClass(context, sems, target_word))
                    
with codecs.open(path + "DICT_contexts.txt", "w", "utf-8") as output_dict:
    output_dict.write(str(contexts_dict))  # записали, потому что вектора всё равно там не отражаются

In [46]:
# теперь добавляем вектора
for key in contexts_dict:
    for i, context_instance in enumerate(contexts_dict[key]):
        contexts_dict[key][i].vector = get_context_vector(context_instance.context, windows_size, vector_dimensions, model)

In [42]:
def get_contexts(target_word, windows_size, line):
    # line is with POS tags, so is the target word (дом_NOUN)
    list_of_contexts = []
    
    splitted_line = line.split()
    for i, word in enumerate(splitted_line):
        if word == target_word:
            context = []
            
            # проверка слева
            for counter in range(windows_size):
                index = i - windows_size + counter
                if index < 0:
                    # добаляем слева инстансы, которые в последствии дадут пустые вектора
                    context.append("_#_#_#_")  # something that is definetely not in the model and will return an empty vector
                else:
                    context.append(splitted_line[index])
            
            context.append(splitted_line[i])
                           
            # теперь проверяем правую сторону
            for counter in range(windows_size):
                try:
                    context.append(splitted_line[i + counter + 1])
                except:
                    context.append("_#_#_#_")
            list_of_contexts.append(context) 
                
    return(list_of_contexts)      
    

In [43]:
# все контексты в таком виде, что целевое слово посередине, а если слов слева/справа не хватает, там стоит 
# последовательность "_#_#_#_", которая даст пустые вектора (тк её точно нет в модели)

def get_context_vector(context, window_size, vector_dimensions, model):
    words_vectors = []
    context_vector = numpy.zeros(vector_dimensions)
    
    if len(context) != window_size * 2 + 1:
        raise ValueError("Контекст неправильной размерности")

    for word in context:
        if (word in model):
            words_vectors.append(model[word])
        else: 
            words_vectors.append(numpy.zeros(vector_dimensions))


    for i in range(300):
        for j, vector in enumerate(words_vectors):
            if j != window_size + 1:
                context_vector[i] += vector[i] * ((window_size - math.fabs(window_size + 1 - j))/window_size)
    return(context_vector)

In [18]:
# примерчик
sample_context = ["ребенок_NOUN", "страдать_VERB", "врожденный_ADJ", "порок_NOUN", "различный_ADJ", 
                  "орган_NOUN", 
                  "новообразование_NOUN", "другой_ADJ", "тяжелый_ADJ", "недуг_NOUN", "можно_ADV"]
res = get_context_vector(sample_context, 5, 300, model)

### Классификация! 

In [48]:
import nltk

In [49]:
from nltk.cluster import KMeansClusterer

In [50]:
dict_with_cluster_sem_correspondence = dict()

for key in contexts_dict:
    files = os.listdir(path + key + "\\")
    NUM_CLUSTERS = len(files)
    list_with_vectors = [instance.vector for instance in contexts_dict[key]]  # все векторы
    list_with_sems = [instance.sem_set for instance in contexts_dict[key]]  # все семы
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    list_with_cluster_labels = kclusterer.cluster(list_with_vectors, assign_clusters=True)
    dict_with_cluster_sem_correspondence[key] = zip(list_with_sems, list_with_cluster_labels)


In [52]:
for key in dict_with_cluster_sem_correspondence:
    print(list(dict_with_cluster_sem_correspondence[key])[:20])

[("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 3), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 3), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 1), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 3), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t