In [3]:
import sys
import gensim, logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [29]:
import math
import numpy

In [31]:
import os
import codecs

### Выделение контекстов, нахождение векторов для них

In [4]:
import zipfile
model_file = "C:\\Users\\boss\\Documents\\Diploma\\180.zip"
with zipfile.ZipFile(model_file, 'r') as archive:
    stream = archive.open('model.bin')
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

2019-04-25 02:34:16,665 : INFO : loading projection weights from <zipfile.ZipExtFile name='model.bin' mode='r' compress_type=deflate>
2019-04-25 02:34:25,544 : INFO : loaded (189193, 300) matrix from <zipfile.ZipExtFile [closed]>


In [17]:
keys = ["dom", "glava", "luk", "organ", "vid"]
path = "C:\\Users\\boss\\Documents\\Diploma\\RNC_Subcorpus\\!raznoje\\preprocessed\\without_some_POS\\"

In [25]:
translit_raznoje = {"dom":"дом",
"glava":"глава",
"luk":"лук",
"organ":"орган",
"vid":"вид"}

In [38]:
from collections import Iterable

def flatten_list(items):
    """Yield items from any nested iterable; see Reference."""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten_list(x):
                yield sub_x
        else:
            yield x

In [40]:
# класс для контекстов
class ContextClass(object):
    
    def __init__(self, context, sem_set, target_word):
        self.context = context
        self.sem_set = sem_set
        self.target_word = target_word
        self.vector = None  # пока что 
    
    def __repr__(self):
        list_to_return = [self.sem_set, self.target_word, str(self.context)]
        return " - ".join(list_to_return)

In [44]:
# получаем список контекстов

contexts_dict = dict()
windows_size = 5
vector_dimensions = 300

for key in translit_raznoje:
    # сначала просто список контекстов
    contexts_dict[key] = []
    target_word = translit_raznoje[key] + "_NOUN"
    files = os.listdir(path + key + "\\")
    for file in files:
        sems = file[4:-4]  # without the "NEW_" and ".txt"
        with codecs.open(path + key + "\\" + file, "r", "utf-8") as input_file:
            lines = input_file.read().split("\n")
        for line in lines:
            if len(line.split()) < 3:  # совсем короткие строки нам не нужны
                continue
            else:
                contexts = get_contexts(target_word, windows_size, line)
                for context in contexts:
                    contexts_dict[key].append(ContextClass(context, sems, target_word))
                    
with codecs.open(path + "DICT_contexts.txt", "w", "utf-8") as output_dict:
    output_dict.write(str(contexts_dict))  # записали, потому что вектора всё равно там не отражаются

In [46]:
# теперь добавляем вектора
for key in contexts_dict:
    for i, context_instance in enumerate(contexts_dict[key]):
        contexts_dict[key][i].vector = get_context_vector(context_instance.context, windows_size, vector_dimensions, model)

In [42]:
def get_contexts(target_word, windows_size, line):
    # line is with POS tags, so is the target word (дом_NOUN)
    list_of_contexts = []
    
    splitted_line = line.split()
    for i, word in enumerate(splitted_line):
        if word == target_word:
            context = []
            
            # проверка слева
            for counter in range(windows_size):
                index = i - windows_size + counter
                if index < 0:
                    # добаляем слева инстансы, которые в последствии дадут пустые вектора
                    context.append("_#_#_#_")  # something that is definetely not in the model and will return an empty vector
                else:
                    context.append(splitted_line[index])
            
            context.append(splitted_line[i])
                           
            # теперь проверяем правую сторону
            for counter in range(windows_size):
                try:
                    context.append(splitted_line[i + counter + 1])
                except:
                    context.append("_#_#_#_")
            list_of_contexts.append(context) 
                
    return(list_of_contexts)      
    

In [43]:
# все контексты в таком виде, что целевое слово посередине, а если слов слева/справа не хватает, там стоит 
# последовательность "_#_#_#_", которая даст пустые вектора (тк её точно нет в модели)

def get_context_vector(context, window_size, vector_dimensions, model):
    words_vectors = []
    context_vector = numpy.zeros(vector_dimensions)
    
    if len(context) != window_size * 2 + 1:
        raise ValueError("Контекст неправильной размерности")

    for word in context:
        if (word in model):
            words_vectors.append(model[word])
        else: 
            words_vectors.append(numpy.zeros(vector_dimensions))


    for i in range(300):
        for j, vector in enumerate(words_vectors):
            if j != window_size + 1:
                context_vector[i] += vector[i] * ((window_size - math.fabs(window_size + 1 - j))/window_size)
    return(context_vector)

In [18]:
# примерчик
sample_context = ["ребенок_NOUN", "страдать_VERB", "врожденный_ADJ", "порок_NOUN", "различный_ADJ", 
                  "орган_NOUN", 
                  "новообразование_NOUN", "другой_ADJ", "тяжелый_ADJ", "недуг_NOUN", "можно_ADV"]
res = get_context_vector(sample_context, 5, 300, model)

### Классификация! 

In [48]:
import nltk

In [49]:
from nltk.cluster import KMeansClusterer

In [68]:
# словарь, значение которого для каждого целевого слова - зип файл с парами (сема, присвоенный номер кластера) 
dict_with_cluster_sem_correspondence = dict()

for key in contexts_dict:
    files = os.listdir(path + key + "\\")
    NUM_CLUSTERS = len(files)
    print(key)
    print(NUM_CLUSTERS)
    list_with_vectors = [instance.vector for instance in contexts_dict[key]]  # все векторы
    list_with_sems = [instance.sem_set for instance in contexts_dict[key]]  # все семы
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    list_with_cluster_labels = kclusterer.cluster(list_with_vectors, assign_clusters=True)
    dict_with_cluster_sem_correspondence[key] = zip(list_with_sems, list_with_cluster_labels)


dom
8
glava
4
luk
2
organ
4
vid
6


In [69]:
for key in dict_with_cluster_sem_correspondence:
    dict_with_cluster_sem_correspondence[key] = list(dict_with_cluster_sem_correspondence[key])

In [70]:
for key in dict_with_cluster_sem_correspondence:
    print(dict_with_cluster_sem_correspondence[key][:10])

[("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 4), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 4), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5)]
[("{'r-concr_der-shift_dt-partb'}", 3), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 3), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 1), ("{'r-concr_der-shift_dt-partb'}", 1), ("{'r-concr_der-shift_dt-partb'}", 3), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 1)]
[("{'r-concr_t-plant_t-fruit

In [71]:
for key in dict_with_cluster_sem_correspondence:
    print(key)
    print(len(dict_with_cluster_sem_correspondence[key]))

dom
3629
glava
733
luk
2642
organ
1136
vid
3033


In [73]:
for key in dict_with_cluster_sem_correspondence:
    print(key)
    print(len(dict_with_cluster_sem_correspondence[key][5]))

dom
2
glava
2
luk
2
organ
2
vid
2


In [75]:
# смотрим, сколько в исходных файлых контекстов на каждое значение
for key in dict_with_cluster_sem_correspondence:
    files = os.listdir(path + key + "\\")
    senses = []
    for file in files:
        senses.append(file[4:-4])
    
    num_true_contexts = [0]*len(senses)
    for i, sense in enumerate(senses):
        for pair in dict_with_cluster_sem_correspondence[key]:
            if pair[0] == sense:
                num_true_contexts[i] += 1
    print(key)
    print("Всего правильных контекстов")
    print(list(zip(senses, num_true_contexts)))
        
    
    #print(list(dict_with_cluster_sem_correspondence[key])[:20])

dom
Всего правильных контекстов
[("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 269), ("{'r-concr_t-constr_top-contain',_'r-concr_t-space'}", 1003), ("{'r-concr_t-constr_top-contain'}", 2105), ("{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}", 46), ("{'r-concr_t-group_pt-set_sc-hum'}", 23), ("{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}", 71), ("{'r-concr_t-org'}", 13), ("{'r-concr_t-space'}", 99)]
glava
Всего правильных контекстов
[("{'r-concr_der-shift_dt-partb'}", 139), ("{'r-concr_pt-partb_pc-hum'}", 8), ("{'r-concr_t-hum'}", 311), ("{'r-concr_t-text_pt-part_pc-text'}", 275)]
luk
Всего правильных контекстов
[("{'r-concr_t-plant_t-fruit_t-food_pt-aggr'}", 1979), ("{'r-concr_t-tool-weapon_top-arc'}", 663)]
organ
Всего правильных контекстов
[("{'r-concr_der-shift_dt-partb'}", 12), ("{'r-concr_pt-partb_pc-hum_pc-animal_hi-class'}", 170), ("{'r-concr_t-org_hi-class'}", 924), ("{'r-concr_t-tool-mus'}", 30)]
vid
Всего пр

In [None]:
# новый словарь - значения - тоже словари, где ключами в свою очередь являются различные смыслы целевого слова, а 

In [76]:
new_dict_with_all_assigned_clusters = dict()

for key in dict_with_cluster_sem_correspondence:
    new_dict_with_all_assigned_clusters[key] = dict()
    for pair in dict_with_cluster_sem_correspondence[key]:
        new_dict_with_all_assigned_clusters[key].setdefault(pair[0], [])
        new_dict_with_all_assigned_clusters[key][pair[0]].append(pair[1])

# посмотрели, что здесь всего столько же контекстов получается - уже хорошо
for key in new_dict_with_all_assigned_clusters:
    print(key)
    for sema in new_dict_with_all_assigned_clusters[key]:
        print(sema, len(new_dict_with_all_assigned_clusters[key][sema]))
        

dom
{'r-concr_t-constr_top-contain',_'r-concr_t-org'} 269
{'r-concr_t-constr_top-contain',_'r-concr_t-space'} 1003
{'r-concr_t-constr_top-contain'} 2105
{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'} 46
{'r-concr_t-group_pt-set_sc-hum'} 23
{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'} 71
{'r-concr_t-org'} 13
{'r-concr_t-space'} 99
glava
{'r-concr_der-shift_dt-partb'} 139
{'r-concr_pt-partb_pc-hum'} 8
{'r-concr_t-hum'} 311
{'r-concr_t-text_pt-part_pc-text'} 275
luk
{'r-concr_t-plant_t-fruit_t-food_pt-aggr'} 1979
{'r-concr_t-tool-weapon_top-arc'} 663
organ
{'r-concr_der-shift_dt-partb'} 12
{'r-concr_pt-partb_pc-hum_pc-animal_hi-class'} 170
{'r-concr_t-org_hi-class'} 924
{'r-concr_t-tool-mus'} 30
vid
{'r-abstr_der-shift'} 1154
{'r-abstr_r-concr_pt-set_sc-X'} 656
{'r-abstr_t-ment'} 13
{'r-abstr_t-perc_der-v'} 1193
{'r-concr_t-doc'} 7
{'r-concr_t-workart'} 10


In [77]:
for key in new_dict_with_all_assigned_clusters:
    print(key)
    for sema in new_dict_with_all_assigned_clusters[key]:
        print(sema)
        print(new_dict_with_all_assigned_clusters[key][sema][:10])

dom
{'r-concr_t-constr_top-contain',_'r-concr_t-org'}
[5, 5, 6, 5, 5, 4, 5, 4, 5, 5]
{'r-concr_t-constr_top-contain',_'r-concr_t-space'}
[1, 5, 7, 7, 4, 4, 3, 6, 0, 0]
{'r-concr_t-constr_top-contain'}
[1, 2, 1, 2, 3, 2, 0, 7, 5, 1]
{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
[3, 3, 4, 3, 5, 6, 4, 4, 1, 1]
{'r-concr_t-group_pt-set_sc-hum'}
[3, 7, 4, 4, 1, 5, 4, 6, 3, 7]
{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
[5, 5, 4, 3, 3, 3, 6, 4, 4, 5]
{'r-concr_t-org'}
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
{'r-concr_t-space'}
[0, 3, 3, 5, 1, 6, 5, 3, 3, 3]
glava
{'r-concr_der-shift_dt-partb'}
[3, 2, 2, 3, 2, 1, 1, 3, 2, 1]
{'r-concr_pt-partb_pc-hum'}
[1, 1, 1, 1, 0, 1, 0, 1]
{'r-concr_t-hum'}
[0, 3, 3, 2, 0, 3, 2, 3, 0, 1]
{'r-concr_t-text_pt-part_pc-text'}
[1, 3, 3, 3, 3, 3, 3, 3, 3, 1]
luk
{'r-concr_t-plant_t-fruit_t-food_pt-aggr'}
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
{'r-concr_t-tool-weapon_top-arc'}
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
organ
{'

In [78]:
for key in new_dict_with_all_assigned_clusters:
    print(key)
    for sema in new_dict_with_all_assigned_clusters[key]:
        print(sema)
        small_dict = dict()
        for number in new_dict_with_all_assigned_clusters[key][sema]:
            small_dict.setdefault(number, 0)
            small_dict[number] += 1
        for element in small_dict:
            print(element, small_dict[element])

dom
{'r-concr_t-constr_top-contain',_'r-concr_t-org'}
5 77
6 40
4 81
2 21
3 14
0 14
1 11
7 11
{'r-concr_t-constr_top-contain',_'r-concr_t-space'}
1 90
5 71
7 140
4 215
3 238
6 170
0 46
2 33
{'r-concr_t-constr_top-contain'}
1 270
2 366
3 208
0 140
7 198
5 218
4 526
6 179
{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
3 10
4 13
5 4
6 4
1 6
7 8
2 1
{'r-concr_t-group_pt-set_sc-hum'}
3 4
7 3
4 8
1 3
5 2
6 3
{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
5 12
4 19
3 10
6 7
7 6
2 9
1 6
0 2
{'r-concr_t-org'}
5 11
4 1
2 1
{'r-concr_t-space'}
0 11
3 28
5 9
1 3
6 16
4 12
7 19
2 1
glava
{'r-concr_der-shift_dt-partb'}
3 25
2 43
1 53
0 18
{'r-concr_pt-partb_pc-hum'}
1 6
0 2
{'r-concr_t-hum'}
0 71
3 59
2 153
1 28
{'r-concr_t-text_pt-part_pc-text'}
1 83
3 71
0 109
2 12
luk
{'r-concr_t-plant_t-fruit_t-food_pt-aggr'}
1 1780
0 199
{'r-concr_t-tool-weapon_top-arc'}
1 654
0 9
organ
{'r-concr_der-shift_dt-partb'}
1 6
3 1
0 5
{'r-concr_pt-partb_pc-h