In [1]:
import sys
import gensim, logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [2]:
import math
import numpy 

In [3]:
import os
import codecs

In [4]:
import sklearn

### Выделение контекстов, нахождение векторов для них

In [4]:
import zipfile
model_file = "C:\\Users\\boss\\Documents\\Diploma\\180.zip"
with zipfile.ZipFile(model_file, 'r') as archive:
    stream = archive.open('model.bin')
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

2019-05-03 16:38:57,327 : INFO : loading projection weights from <zipfile.ZipExtFile name='model.bin' mode='r' compress_type=deflate>
2019-05-03 16:39:07,852 : INFO : loaded (189193, 300) matrix from <zipfile.ZipExtFile [closed]>


In [6]:
import zipfile
model_file = "C:\\Users\\boss\\Documents\\Diploma\\182.zip"
with zipfile.ZipFile(model_file, 'r') as archive:
    stream = archive.open('model.bin')
    model_skipgram = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

2019-05-03 22:28:24,070 : INFO : loading projection weights from <zipfile.ZipExtFile name='model.bin' mode='r' compress_type=deflate>
2019-05-03 22:28:37,279 : INFO : loaded (248978, 300) matrix from <zipfile.ZipExtFile [closed]>


In [7]:
keys = ["dom", "glava", "luk", "organ", "vid"]
path = "C:\\Users\\boss\\Documents\\Diploma\\RNC_Subcorpus\\!raznoje\\preprocessed\\without_some_POS\\"

In [8]:
translit_raznoje = {"dom":"дом",
"glava":"глава",
"luk":"лук",
"organ":"орган",
"vid":"вид"}

In [9]:
from collections import Iterable

def flatten_list(items):
    """Yield items from any nested iterable; see Reference."""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten_list(x):
                yield sub_x
        else:
            yield x

In [10]:
# класс для контекстов
class ContextClass(object):
    
    def __init__(self, context, sem_set, target_word):
        self.context = context
        self.sem_set = sem_set
        self.target_word = target_word
        self.vector = None  # пока что 
    
    def __repr__(self):
        list_to_return = [self.sem_set, self.target_word, str(self.context)]
        return " - ".join(list_to_return)

In [11]:
def get_contexts(target_word, windows_size, line):
    # line is with POS tags, so is the target word (дом_NOUN)
    list_of_contexts = []
    
    splitted_line = line.split()
    for i, word in enumerate(splitted_line):
        if word == target_word:
            context = []
            
            # проверка слева
            for counter in range(windows_size):
                index = i - windows_size + counter
                if index < 0:
                    # добаляем слева инстансы, которые в последствии дадут пустые вектора
                    context.append("_#_#_#_")  # something that is definetely not in the model and will return an empty vector
                else:
                    context.append(splitted_line[index])
            
            context.append(splitted_line[i])
                           
            # теперь проверяем правую сторону
            for counter in range(windows_size):
                try:
                    context.append(splitted_line[i + counter + 1])
                except:
                    context.append("_#_#_#_")
            list_of_contexts.append(context) 
                
    return(list_of_contexts)      
    

In [12]:
# получаем список контекстов

contexts_dict = dict()
windows_size = 5
vector_dimensions = 300

for key in translit_raznoje:
    # сначала просто список контекстов
    contexts_dict[key] = []
    target_word = translit_raznoje[key] + "_NOUN"
    files = os.listdir(path + key + "\\")
    for file in files:
        sems = file[4:-4]  # without the "NEW_" and ".txt"
        with codecs.open(path + key + "\\" + file, "r", "utf-8") as input_file:
            lines = input_file.read().split("\n")
        for line in lines:
            if len(line.split()) < 3:  # совсем короткие строки нам не нужны
                continue
            else:
                contexts = get_contexts(target_word, windows_size, line)
                for context in contexts:
                    contexts_dict[key].append(ContextClass(context, sems, target_word))
                    
with codecs.open(path + "DICT_contexts.txt", "w", "utf-8") as output_dict:
    output_dict.write(str(contexts_dict))  # записали, потому что вектора всё равно там не отражаются

In [18]:
# здесь можно было бы сделать кэш слов, чтобы не каждый раз заново находились вектора

# все контексты в таком виде, что целевое слово посередине, а если слов слева/справа не хватает, там стоит 
# последовательность "_#_#_#_", которая даст пустые вектора (тк её точно нет в модели)

def get_context_vector(context, window_size, vector_dimensions, model):
    words_vectors = []
    context_vector = numpy.zeros(vector_dimensions)
    non_zero_vectors_num = 0
    
    if len(context) != window_size * 2 + 1:
        raise ValueError("Контекст неправильной размерности")

    for word in context:
        if (word in model):
            non_zero_vectors_num += 1
            words_vectors.append(model[word])
        else: 
            words_vectors.append(numpy.zeros(vector_dimensions))
    
    if non_zero_vectors_num-1 <= 0:
        print(context)
    
    # fractional decay weighting
    for i in range(300):
        for j, vector in enumerate(words_vectors):
            if j != window_size + 1:
                context_vector[i] += vector[i] * ((window_size - math.fabs(window_size + 1 - j))/window_size)
                
    # + нормализация: после того как получили вектор контекста, делим все его числа на количество непустых векторов:
    # так как там идёт суммирование весов для всех слов в контексте, если есть пустые слова, то сумма неоправданно меньше  
    # это у нас считал non_zero_vectors_num, но нужно вычесть единицу, так она отвечает за целевое слово
    if non_zero_vectors_num <= 1:
        return None
    else:
        for i, digit in enumerate(context_vector):
            context_vector[i] = digit/(non_zero_vectors_num-1)
        return(context_vector)

In [19]:
# !! выведены контексты, где помимо целевого слова остальных нет в модели

# теперь добавляем вектора
for key in contexts_dict:
    for i, context_instance in enumerate(contexts_dict[key]):
        # в первом варианте была просто model 
        contexts_dict[key][i].vector = get_context_vector(context_instance.context, windows_size, vector_dimensions, model_skipgram)

['_#_#_#_', '_#_#_#_', '_#_#_#_', 'он_PRON', 'они_PRON', 'дом_NOUN', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_']
['_#_#_#_', '_#_#_#_', '_#_#_#_', 'он_PRON', 'я_PRON', 'дом_NOUN', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_']
['_#_#_#_', '_#_#_#_', '_#_#_#_', 'это_PRON', 'ваш_DET', 'дом_NOUN', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_']
['_#_#_#_', '_#_#_#_', '_#_#_#_', 'стеречь_PROPN', 'наш_DET', 'дом_NOUN', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_']
['_#_#_#_', '_#_#_#_', '_#_#_#_', 'это_PRON', 'он_PRON', 'дом_NOUN', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_']
['_#_#_#_', '_#_#_#_', 'гиацинть_PROPN', 'иридодиктиа_NUM', 'ксифиа_NOUN', 'лук_NOUN', 'мускари_NOUN', 'птицемлечникать_NOUN', 'пушкиний_NOUN', 'пролескый_NOUN', 'хионодокс_NOUN']
['_#_#_#_', '_#_#_#_', '_#_#_#_', 'коровяк_PROPN', 'котовник_NOUN', 'лук_NOUN', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_']
['_#_#_#_', '_#_#_#_', '_#_#_#_', '_#_#_#_', 'ты_PRON', 'лук_N

In [31]:
for key in contexts_dict:
    print(key)
    print(len(contexts_dict[key]))

dom
3629
glava
733
luk
2642
organ
1136
vid
3033


In [37]:
# убираем из словаря такие контексты
for key in contexts_dict:
    for i, context_instance in enumerate(contexts_dict[key]):
        if context_instance.vector is None:
            contexts_dict[key].pop(i)

In [38]:
for key in contexts_dict:
    print(key)
    print(len(contexts_dict[key]))

dom
3624
glava
733
luk
2638
organ
1135
vid
3027


In [18]:
# примерчик
sample_context = ["ребенок_NOUN", "страдать_VERB", "врожденный_ADJ", "порок_NOUN", "различный_ADJ", 
                  "орган_NOUN", 
                  "новообразование_NOUN", "другой_ADJ", "тяжелый_ADJ", "недуг_NOUN", "можно_ADV"]
res = get_context_vector(sample_context, 5, 300, model)

### Кластеризация

In [20]:
import nltk

In [21]:
from nltk.cluster import KMeansClusterer

In [40]:
# словарь, значение которого для каждого целевого слова - зип файл с парами (сема, присвоенный номер кластера) 
dict_with_cluster_sem_correspondence = dict()

for key in contexts_dict:
    files = os.listdir(path + key + "\\")
    NUM_CLUSTERS = len(files)
    print(key)
    print(NUM_CLUSTERS)
    list_with_vectors = [instance.vector for instance in contexts_dict[key]]  # все векторы
    list_with_sems = [instance.sem_set for instance in contexts_dict[key]]  # все семы
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    list_with_cluster_labels = kclusterer.cluster(list_with_vectors, assign_clusters=True)
    dict_with_cluster_sem_correspondence[key] = zip(list_with_sems, list_with_cluster_labels)


dom
8
glava
4
luk
2
organ
4
vid
6


In [41]:
for key in dict_with_cluster_sem_correspondence:
    dict_with_cluster_sem_correspondence[key] = list(dict_with_cluster_sem_correspondence[key])

In [70]:
for key in dict_with_cluster_sem_correspondence:
    print(dict_with_cluster_sem_correspondence[key][:10])

[("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 6), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 4), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 4), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5), ("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 5)]
[("{'r-concr_der-shift_dt-partb'}", 3), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 3), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 1), ("{'r-concr_der-shift_dt-partb'}", 1), ("{'r-concr_der-shift_dt-partb'}", 3), ("{'r-concr_der-shift_dt-partb'}", 2), ("{'r-concr_der-shift_dt-partb'}", 1)]
[("{'r-concr_t-plant_t-fruit

In [23]:
for key in dict_with_cluster_sem_correspondence:
    print(key)
    print(len(dict_with_cluster_sem_correspondence[key]))

dom
3629
glava
733
luk
2642
organ
1136
vid
3033


In [73]:
# смотрим, что в рандомном - пятом - элементе два значения: это list_with_sems, list_with_cluster_labels
for key in dict_with_cluster_sem_correspondence:
    print(key)
    print(len(dict_with_cluster_sem_correspondence[key][5]))

dom
2
glava
2
luk
2
organ
2
vid
2


In [42]:
# смотрим, сколько в исходных файлых контекстов на каждое значение
for key in dict_with_cluster_sem_correspondence:
    files = os.listdir(path + key + "\\")
    senses = []
    for file in files:
        senses.append(file[4:-4])
    
    num_true_contexts = [0]*len(senses)
    for i, sense in enumerate(senses):
        for pair in dict_with_cluster_sem_correspondence[key]:
            if pair[0] == sense:
                num_true_contexts[i] += 1
    print(key)
    print("Всего правильных контекстов")
    print(list(zip(senses, num_true_contexts)))
        
    
    #print(list(dict_with_cluster_sem_correspondence[key])[:20])

dom
Всего правильных контекстов
[("{'r-concr_t-constr_top-contain',_'r-concr_t-org'}", 269), ("{'r-concr_t-constr_top-contain',_'r-concr_t-space'}", 999), ("{'r-concr_t-constr_top-contain'}", 2104), ("{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}", 46), ("{'r-concr_t-group_pt-set_sc-hum'}", 23), ("{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}", 71), ("{'r-concr_t-org'}", 13), ("{'r-concr_t-space'}", 99)]
glava
Всего правильных контекстов
[("{'r-concr_der-shift_dt-partb'}", 139), ("{'r-concr_pt-partb_pc-hum'}", 8), ("{'r-concr_t-hum'}", 311), ("{'r-concr_t-text_pt-part_pc-text'}", 275)]
luk
Всего правильных контекстов
[("{'r-concr_t-plant_t-fruit_t-food_pt-aggr'}", 1976), ("{'r-concr_t-tool-weapon_top-arc'}", 662)]
organ
Всего правильных контекстов
[("{'r-concr_der-shift_dt-partb'}", 12), ("{'r-concr_pt-partb_pc-hum_pc-animal_hi-class'}", 170), ("{'r-concr_t-org_hi-class'}", 923), ("{'r-concr_t-tool-mus'}", 30)]
vid
Всего пра

In [None]:
# новый словарь - значения - тоже словари, где ключами в свою очередь являются различные смыслы целевого слова, а 

In [43]:
new_dict_with_all_assigned_clusters = dict()

for key in dict_with_cluster_sem_correspondence:
    new_dict_with_all_assigned_clusters[key] = dict()
    for pair in dict_with_cluster_sem_correspondence[key]:
        new_dict_with_all_assigned_clusters[key].setdefault(pair[0], [])
        new_dict_with_all_assigned_clusters[key][pair[0]].append(pair[1])

# посмотрели, что здесь всего столько же контекстов получается - уже хорошо
for key in new_dict_with_all_assigned_clusters:
    print(key)
    for sema in new_dict_with_all_assigned_clusters[key]:
        print(sema, len(new_dict_with_all_assigned_clusters[key][sema]))
        

dom
{'r-concr_t-constr_top-contain',_'r-concr_t-org'} 269
{'r-concr_t-constr_top-contain',_'r-concr_t-space'} 999
{'r-concr_t-constr_top-contain'} 2104
{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'} 46
{'r-concr_t-group_pt-set_sc-hum'} 23
{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'} 71
{'r-concr_t-org'} 13
{'r-concr_t-space'} 99
glava
{'r-concr_der-shift_dt-partb'} 139
{'r-concr_pt-partb_pc-hum'} 8
{'r-concr_t-hum'} 311
{'r-concr_t-text_pt-part_pc-text'} 275
luk
{'r-concr_t-plant_t-fruit_t-food_pt-aggr'} 1976
{'r-concr_t-tool-weapon_top-arc'} 662
organ
{'r-concr_der-shift_dt-partb'} 12
{'r-concr_pt-partb_pc-hum_pc-animal_hi-class'} 170
{'r-concr_t-org_hi-class'} 923
{'r-concr_t-tool-mus'} 30
vid
{'r-abstr_der-shift'} 1150
{'r-abstr_r-concr_pt-set_sc-X'} 656
{'r-abstr_t-ment'} 13
{'r-abstr_t-perc_der-v'} 1191
{'r-concr_t-doc'} 7
{'r-concr_t-workart'} 10


In [44]:
# смотрим для каждого значения - к каким кластерам отнесены его контексты (первые 10)
for key in new_dict_with_all_assigned_clusters:
    print(key)
    for sema in new_dict_with_all_assigned_clusters[key]:
        print(sema)
        print(new_dict_with_all_assigned_clusters[key][sema][:10])

dom
{'r-concr_t-constr_top-contain',_'r-concr_t-org'}
[1, 1, 2, 2, 2, 1, 2, 6, 1, 3]
{'r-concr_t-constr_top-contain',_'r-concr_t-space'}
[4, 1, 3, 3, 7, 7, 3, 0, 5, 5]
{'r-concr_t-constr_top-contain'}
[6, 6, 6, 6, 0, 6, 6, 7, 2, 5]
{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
[0, 4, 0, 0, 1, 0, 7, 3, 0, 4]
{'r-concr_t-group_pt-set_sc-hum'}
[7, 0, 7, 7, 7, 1, 7, 0, 4, 7]
{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
[4, 2, 7, 0, 4, 0, 0, 6, 7, 1]
{'r-concr_t-org'}
[2, 2, 2, 2, 1, 2, 1, 2, 1, 1]
{'r-concr_t-space'}
[3, 0, 0, 5, 4, 7, 3, 4, 4, 4]
glava
{'r-concr_der-shift_dt-partb'}
[2, 2, 2, 2, 2, 2, 2, 2, 0, 2]
{'r-concr_pt-partb_pc-hum'}
[2, 3, 3, 3, 3, 2, 2, 3]
{'r-concr_t-hum'}
[3, 2, 2, 0, 1, 2, 1, 0, 1, 2]
{'r-concr_t-text_pt-part_pc-text'}
[2, 2, 2, 2, 0, 0, 0, 0, 0, 3]
luk
{'r-concr_t-plant_t-fruit_t-food_pt-aggr'}
[0, 1, 1, 0, 1, 1, 1, 1, 1, 0]
{'r-concr_t-tool-weapon_top-arc'}
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
organ
{'

In [45]:
for key in new_dict_with_all_assigned_clusters:
    print(key)
    for sema in new_dict_with_all_assigned_clusters[key]:
        print(sema)
        small_dict = dict()
        for number in new_dict_with_all_assigned_clusters[key][sema]:
            small_dict.setdefault(number, 0)
            small_dict[number] += 1
        for element in small_dict:
            print(element, small_dict[element])

dom
{'r-concr_t-constr_top-contain',_'r-concr_t-org'}
1 59
2 59
6 19
3 37
7 44
4 20
5 11
0 20
{'r-concr_t-constr_top-contain',_'r-concr_t-space'}
4 183
1 44
3 265
7 215
0 194
5 61
6 13
2 24
{'r-concr_t-constr_top-contain'}
6 325
0 197
7 313
2 64
5 366
3 407
4 256
1 176
{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
0 13
4 2
1 2
7 13
3 11
2 1
5 3
6 1
{'r-concr_t-group_pt-set_sc-hum'}
7 10
0 5
1 1
4 4
6 2
3 1
{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}
4 9
2 6
7 13
0 10
6 10
1 7
3 11
5 5
{'r-concr_t-org'}
2 6
1 5
0 1
5 1
{'r-concr_t-space'}
3 34
0 25
5 2
4 15
7 14
2 6
1 3
glava
{'r-concr_der-shift_dt-partb'}
2 71
0 20
3 13
1 35
{'r-concr_pt-partb_pc-hum'}
2 3
3 5
{'r-concr_t-hum'}
3 35
2 72
0 94
1 110
{'r-concr_t-text_pt-part_pc-text'}
2 129
0 45
3 87
1 14
luk
{'r-concr_t-plant_t-fruit_t-food_pt-aggr'}
0 620
1 1356
{'r-concr_t-tool-weapon_top-arc'}
0 661
1 1
organ
{'r-concr_der-shift_dt-partb'}
0 1
1 1
2 1
3 9
{'r-concr_pt-pa

# Подобие классификации

In [366]:
dom_mean_vectors = []
for sem in dict_for_dom:
    array = np.array(dict_for_dom[sem])
    mean = np.mean(array,axis=0)
    
    dom_mean_vectors.append([sem, mean.tolist()])    

In [367]:
glava_mean_vectors = []
for sem in dict_for_glava:
    array = np.array(dict_for_glava[sem])
    mean = np.mean(array,axis=0)
    
    glava_mean_vectors.append([sem, mean.tolist()])    

In [368]:
luk_mean_vectors = []
for sem in dict_for_luk:
    array = np.array(dict_for_luk[sem])
    mean = np.mean(array,axis=0)
    
    luk_mean_vectors.append([sem, mean.tolist()])    

In [369]:
organ_mean_vectors = []
for sem in dict_for_organ:
    array = np.array(dict_for_organ[sem])
    mean = np.mean(array,axis=0)
    
    organ_mean_vectors.append([sem, mean.tolist()])    

In [370]:
vid_mean_vectors = []
for sem in dict_for_vid:
    array = np.array(dict_for_vid[sem])
    mean = np.mean(array,axis=0)
    
    vid_mean_vectors.append([sem, mean.tolist()])     

In [356]:
translit = {"dom":"Дом",
"glava":"Глава",
"luk":"Лук",
"organ":"Орган",
"vid":"Вид"}

In [355]:
path = "C:\\Users\\boss\\Documents\\Diploma\\RNC_Subcorpus\\!raznoje\\raw_texts\\without_punctuation\\lemmatized_bad\\without_some_POS\\"


In [None]:
# отдельно плохие контексты для каждого слова (для 'дома' нет)

In [358]:
contexts_dict = dict()
windows_size = 5
vector_dimensions = 300

files = os.listdir(path)

for key in translit:
    # сначала просто список контекстов
    contexts_dict[key] = []
    target_word = translit_raznoje[key] + "_NOUN"
    for file in files:
        if key in file:
            with codecs.open(path + file, "r", "utf-8") as input_file:
                lines = input_file.read().split("\n")
            for line in lines:
                if len(line.split()) < 3:  # совсем короткие строки нам не нужны
                    continue
                else:
                    contexts = get_contexts(target_word, windows_size, line)
                    for context in contexts:
                        contexts_dict[key].append(ContextClass(context, sems, target_word))

In [359]:
for key in contexts_dict:
    print(key, len(contexts_dict[key]))

dom 0
glava 44
luk 83
organ 9
vid 176


In [360]:
# добавляем вектора
for key in contexts_dict:
    for i, context_instance in enumerate(contexts_dict[key]):
        # в первом варианте была просто model 
        contexts_dict[key][i].vector = get_context_vector(context_instance.context, windows_size, vector_dimensions, model_skipgram)

In [362]:
for key in contexts_dict:
    print(key, len(contexts_dict[key]))

dom 0
glava 44
luk 83
organ 9
vid 176


In [363]:
# убираем из словаря контексты, где все слова в окне получили нулевой вектор (видимо, таких нет)
for key in contexts_dict:
    for i, context_instance in enumerate(contexts_dict[key]):
        if context_instance.vector is None:
            contexts_dict[key].pop(i)

In [364]:
for key in contexts_dict:
    print(key, len(contexts_dict[key]))

dom 0
glava 44
luk 83
organ 9
vid 176


In [353]:
from sklearn.metrics.pairwise import cosine_similarity as cos

In [371]:
dict_with_mean_vectors['dom'] = dom_mean_vectors # словарь, пары [сема, вектор]
dict_with_mean_vectors['glava'] = glava_mean_vectors
dict_with_mean_vectors['luk'] = luk_mean_vectors
dict_with_mean_vectors['organ'] = organ_mean_vectors
dict_with_mean_vectors['vid'] = vid_mean_vectors

In [383]:
path = "C:\\Users\\boss\\Documents\\Diploma\\RNC_Subcorpus\\!raznoje\\raw_texts\\without_punctuation\\lemmatized_bad\\"

for key in contexts_dict:
    for instance in contexts_dict[key]:
        vector = np.array(instance.vector).reshape(1,300)
        cosine_values = []
        for pair in dict_with_mean_vectors[key]:
            sem_vector = np.array(pair[1]).reshape(1,300)
            cosine_values.append(cos(vector,sem_vector).tolist())

        cosine_values = list(flatten_list(cosine_values))
        index_max_cosine = cosine_values.index(max(cosine_values))
        the_sem = dict_with_mean_vectors[key][index_max_cosine][0]
        
        # это мы просто записали сему, к которой отнесён контекст, и сам контекст - но в неудобном виде
        # with codecs.open(path+"attributed_contexts_for_" + key + ".txt", "a", "utf-8") as output:
            # output.write(the_sem + " " + str(instance.context) + "\n")
        if key != 'organ':
            with codecs.open(path+"readable_contexts_for_" + key + ".txt", "a", "utf-8") as output:
                output.write(the_sem + "\n")

# SVM классификация 

In [None]:
# Context_Class entity in contexts_dict: context, sem_set, target_word, vector

In [64]:
import sklearn.svm as svm
from sklearn import metrics
import pandas as pd
import numpy as np

In [47]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.utils import shuffle

### Для значений дома 

In [264]:
# значений - 8

# словарь, в котором ключи - встречающиеся семы, значение - лист с векторами
dict_for_dom = dict()
for entity in contexts_dict["dom"]:
    dict_for_dom.setdefault(entity.sem_set, [])
    dict_for_dom[entity.sem_set].append(entity.vector)

In [49]:
for key in dict_for_dom:
    print(key, len(dict_for_dom[key]))

{'r-concr_t-constr_top-contain',_'r-concr_t-org'} 269
{'r-concr_t-constr_top-contain',_'r-concr_t-space'} 999
{'r-concr_t-constr_top-contain'} 2104
{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'} 46
{'r-concr_t-group_pt-set_sc-hum'} 23
{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'} 71
{'r-concr_t-org'} 13
{'r-concr_t-space'} 99


In [50]:
# сначала для первой семы: {'r-concr_t-constr_top-contain',_'r-concr_t-org'}
dfr_positive = pd.DataFrame(dict_for_dom["{'r-concr_t-constr_top-contain',_'r-concr_t-org'}"])
dfr_negative = pd.DataFrame(dict_for_dom["{'r-concr_t-constr_top-contain',_'r-concr_t-space'}"] + 
                            dict_for_dom["{'r-concr_t-constr_top-contain'}"] + 
                            dict_for_dom["{'r-concr_t-group_pt-set_sc-hum',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}"] + 
                            dict_for_dom["{'r-concr_t-group_pt-set_sc-hum'}"] +
                            dict_for_dom["{'r-concr_t-org',_'r-concr_t-constr_top-contain',_'r-concr_t-space'}"] +
                            dict_for_dom["{'r-concr_t-org'}"] + 
                            dict_for_dom["{'r-concr_t-space'}"])

In [51]:
print(dfr_negative.shape)
print(dfr_positive.shape)

(3355, 300)
(269, 300)


In [52]:
print(dfr_negative.head())

        0         1         2         3         4         5         6    \
0 -0.112655 -0.015101  0.062594 -0.082058  0.003030  0.083630 -0.039967   
1  0.015530 -0.070638  0.066764 -0.055343  0.004817  0.090311  0.000892   
2  0.083353 -0.007032  0.137873 -0.121726  0.017375  0.097917 -0.117281   
3 -0.040353 -0.066811  0.122380 -0.017725 -0.030651  0.074216 -0.076020   
4 -0.056214  0.024573  0.018005 -0.027554 -0.025000  0.056884 -0.015050   

        7         8         9      ...          290       291       292  \
0  0.046675  0.007770 -0.131968    ...     0.016537  0.042334 -0.088649   
1  0.036203 -0.027513 -0.091848    ...     0.012817  0.057783 -0.055656   
2  0.076937  0.090479 -0.074229    ...     0.032286  0.092625 -0.019842   
3  0.072242  0.004307 -0.112502    ...     0.042509 -0.039092 -0.045481   
4  0.039105 -0.015092 -0.026991    ...    -0.002306  0.047317 -0.026562   

        293       294       295       296       297       298       299  
0  0.122610  0.064857  0

In [53]:
# добавим столбец с целевой переменной
dfr_positive['class'] = pd.Series(np.ones(dfr_positive.shape[0], dtype=int), index=dfr_positive.index)
dfr_negative['class'] = pd.Series(np.zeros(dfr_negative.shape[0], dtype=int), index=dfr_negative.index)

In [54]:
frames = [dfr_negative, dfr_positive]
all_data = pd.concat(frames)
all_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,class
0,-0.112655,-0.015101,0.062594,-0.082058,0.00303,0.08363,-0.039967,0.046675,0.00777,-0.131968,...,0.042334,-0.088649,0.12261,0.064857,0.036174,0.0232,-0.016464,0.155182,0.124007,0
1,0.01553,-0.070638,0.066764,-0.055343,0.004817,0.090311,0.000892,0.036203,-0.027513,-0.091848,...,0.057783,-0.055656,0.001092,0.080691,-0.014756,-0.022548,0.081506,0.142143,0.117946,0
2,0.083353,-0.007032,0.137873,-0.121726,0.017375,0.097917,-0.117281,0.076937,0.090479,-0.074229,...,0.092625,-0.019842,0.101258,0.026017,0.045355,0.04264,-0.094668,0.178436,0.130513,0
3,-0.040353,-0.066811,0.12238,-0.017725,-0.030651,0.074216,-0.07602,0.072242,0.004307,-0.112502,...,-0.039092,-0.045481,0.106486,0.108514,-0.058858,-0.029696,-0.096235,0.077846,0.13269,0
4,-0.056214,0.024573,0.018005,-0.027554,-0.025,0.056884,-0.01505,0.039105,-0.015092,-0.026991,...,0.047317,-0.026562,0.038275,0.045017,0.025227,0.025171,-0.022316,0.025044,0.05488,0


In [55]:
data_shuffled = shuffle(all_data, random_state=11)
print(data_shuffled.shape)

(3624, 301)


In [None]:
X = data_shuffled[data_shuffled.columns[:-1]]
y = data_shuffled['class']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [58]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
530,-0.057357,-0.006248,0.190454,-0.035989,-0.028563,0.046286,-0.040437,-0.039086,-0.028037,-0.133084,...,0.071813,0.092946,-0.038081,0.141563,-0.057276,0.042845,-0.014302,-0.093607,0.137064,0.166098
3203,-0.078542,0.02466,0.061703,-0.036626,0.021298,0.029976,-0.014374,-0.045481,-0.045661,-0.008023,...,-0.025971,0.095802,-0.02834,0.072536,0.02261,-0.000515,0.023441,-0.056011,0.031867,0.119171
2207,-0.07824,0.026426,0.158875,-0.096029,0.004387,0.058469,-0.021011,0.083042,-0.079625,-0.001572,...,-0.014439,0.122838,-0.07568,0.154673,0.173832,0.036977,-0.07982,0.016101,0.155697,0.113778
2278,-0.046505,0.068337,0.053857,-0.034735,-0.000132,0.012133,-0.03297,-0.112619,-0.046146,-0.111943,...,-0.047041,0.159223,-0.004111,0.091835,0.029936,0.031678,-0.031677,-0.049278,0.137576,0.009447
3052,-0.071837,-0.027748,0.036022,-0.017426,-0.031876,0.038657,-0.051708,0.033583,-0.040096,0.020691,...,0.003473,0.061388,-0.035305,0.052678,0.030727,-0.013694,0.004543,-0.065728,0.006634,0.109456


### Собственно классификация. Сначала только для одной семы

In [59]:
# NOT BAD, вроде по убыванию
classifier = svm.LinearSVC()
classifier3 = svm.SVC(kernel='rbf', gamma='scale', class_weight='balanced', max_iter=3000) 
classifier8 = svm.LinearSVC(C=0.1, class_weight='balanced', max_iter=3000) 

In [65]:
# BAD
classifier1 = svm.SVC(kernel='rbf', class_weight='balanced') # хуже
classifier2 = svm.SVC(C=0.1, kernel='rbf', class_weight='balanced') # хуже
classifier5 = svm.SVC(kernel='poly', degree=2, gamma='scale', class_weight='balanced') # очень плохо
classifier6 = svm.SVC(kernel='sigmoid', gamma='scale', class_weight='balanced') # хуже
classifier7 = svm.LinearSVC(class_weight='balanced', max_iter=2000) # хуже

In [195]:
classifier4 = svm.LinearSVC(C=1.2) 

In [196]:
classifier4.fit(X_train, y_train)
predicted4 = classifier4.predict(X_test)

In [197]:
print(metrics.classification_report(y_test, predicted4))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       668
           1       0.86      0.32      0.46        57

   micro avg       0.94      0.94      0.94       725
   macro avg       0.90      0.66      0.72       725
weighted avg       0.94      0.94      0.93       725



In [198]:
scorer4 = metrics.make_scorer(metrics.f1_score)
cv_strategy4 = StratifiedKFold(n_splits=8, shuffle=True, random_state=11)
lr_scoring4 = cross_val_score(classifier4, X, y, scoring = scorer4, cv = cv_strategy4)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring4.mean(), lr_scoring4.max(), lr_scoring4.min(), lr_scoring4.std()))

Log mean:0.47984019694834457, max:0.6, min:0.30434782608695654, std:0.0889623078072546


In [116]:
# результаты для третьего

              precision    recall  f1-score   support

           0       0.97      0.90      0.93       668
           1       0.36      0.67      0.46        57

   micro avg       0.88      0.88      0.88       725
   macro avg       0.66      0.78      0.70       725
weighted avg       0.92      0.88      0.89       725



In [115]:
# резульаты для третьего

Log mean:0.4588176238945727, max:0.5263157894736843, min:0.3564356435643565, std:0.0457661598609683


In [61]:
# результаты для нулевого

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       668
           1       0.86      0.32      0.46        57

   micro avg       0.94      0.94      0.94       725
   macro avg       0.90      0.66      0.72       725
weighted avg       0.94      0.94      0.93       725



In [63]:
# результаты для нулевого

Log mean:0.47611470675226614, max:0.6, min:0.30434782608695654, std:0.08825071119361323


## Multi-Class

In [None]:
# здесь массивы:
# data = [[вектора для первой семы], [вектора для второй семы], ...]
# labels = [[0,0,...], [1,1,...], [2,2,...], ...]

In [248]:
# общая часть
scorer = metrics.make_scorer(metrics.f1_score, average='micro')  # specify average here for it not to be 'binary'
cv_strategy = StratifiedKFold(n_splits=8, shuffle=True, random_state=11)

In [201]:
# общая часть
classifier = svm.LinearSVC()
classifier3 = svm.SVC(kernel='rbf', gamma='scale', class_weight='balanced', max_iter=3000) 
classifier8 = svm.LinearSVC(C=0.1, class_weight='balanced', max_iter=3000) 

In [281]:
# отдельно данные со всеми семами, а не первая VS остальные
data = []
for sema in dict_for_dom:
    data.append(dict_for_dom[sema])

labels = []
for i, sema in enumerate(dict_for_dom):
    labels.append([i]*len(dict_for_dom[sema]))

In [286]:
for element in data:
    print(len(element))

269
999
2104
46
23
71
13
99


In [287]:
for el in labels:
    print(len(el))

269
999
2104
46
23
71
13
99


In [282]:
flattend_data = []
for vectors_list in data:
    flattend_data += vectors_list

In [290]:
len(flattend_data)

3624

In [288]:
flattend_labels = list(flatten_list(labels))

In [289]:
print(len(flattend_labels))

3624


In [270]:
# для 0 - micro average f1-score
lr_scoring = cross_val_score(classifier, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.6697211651832666, max:0.6873614190687362, min:0.6497797356828194, std:0.013366649484743719


In [268]:
# для 3 
lr_scoring = cross_val_score(classifier3, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.26102984049097105, max:0.2926829268292683, min:0.22787610619469026, std:0.022293182298738393


In [256]:
# для 8
lr_scoring = cross_val_score(classifier8, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.6051555095271108, max:0.6263736263736264, min:0.5912087912087912, std:0.012227919872749743


In [291]:
X_train, X_test, y_train, y_test = train_test_split(flattend_data, flattend_labels, test_size=0.3, random_state=11)

In [292]:
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.65      0.45      0.53        82
           1       0.54      0.49      0.51       298
           2       0.72      0.85      0.78       639
           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00        20
           6       0.00      0.00      0.00         5
           7       0.00      0.00      0.00        27

   micro avg       0.67      0.67      0.67      1088
   macro avg       0.24      0.22      0.23      1088
weighted avg       0.62      0.67      0.64      1088



  'precision', 'predicted', average, warn_for)


### Для значений главы

In [262]:
# значений - 4
# словарь, в котором ключи - встречающиеся семы, значение - лист с векторами
dict_for_glava = dict()
for entity in contexts_dict["glava"]:
    dict_for_glava.setdefault(entity.sem_set, [])
    dict_for_glava[entity.sem_set].append(entity.vector)

In [263]:
for key in dict_for_glava:
    print(key, len(dict_for_glava[key]))

{'r-concr_der-shift_dt-partb'} 139
{'r-concr_pt-partb_pc-hum'} 8
{'r-concr_t-hum'} 311
{'r-concr_t-text_pt-part_pc-text'} 275


In [293]:
data = []
for sema in dict_for_glava:
    data.append(dict_for_glava[sema])

labels = []
for i, sema in enumerate(dict_for_glava):
    labels.append([i]*len(dict_for_glava[sema]))

In [294]:
flattend_data = []
for vectors_list in data:
    flattend_data += vectors_list

In [295]:
flattend_labels = list(flatten_list(labels))

In [276]:
# для 0 - micro average f1-score
lr_scoring = cross_val_score(classifier, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.766768580881484, max:0.8131868131868132, min:0.7252747252747253, std:0.028206760449533155


In [277]:
# для 3 
lr_scoring = cross_val_score(classifier3, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.5771131198550554, max:0.6222222222222222, min:0.5164835164835165, std:0.0320968957602986


In [278]:
# для 8
lr_scoring = cross_val_score(classifier8, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.7776994958446571, max:0.8021978021978022, min:0.7634408602150536, std:0.012914587953852913


In [296]:
X_train, X_test, y_train, y_test = train_test_split(flattend_data, flattend_labels, test_size=0.3, random_state=11)

In [297]:
classifier8.fit(X_train, y_train)
predicted = classifier8.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.72      0.57      0.64        40
           1       1.00      0.50      0.67         2
           2       0.82      0.78      0.80        95
           3       0.79      0.93      0.86        83

   micro avg       0.80      0.80      0.80       220
   macro avg       0.83      0.70      0.74       220
weighted avg       0.79      0.80      0.79       220



### Для значений лука

In [298]:
# значений - 2
dict_for_luk = dict()
for entity in contexts_dict["luk"]:
    dict_for_luk.setdefault(entity.sem_set, [])
    dict_for_luk[entity.sem_set].append(entity.vector)

In [299]:
for key in dict_for_luk:
    print(key, len(dict_for_luk[key]))

{'r-concr_t-plant_t-fruit_t-food_pt-aggr'} 1976
{'r-concr_t-tool-weapon_top-arc'} 662


In [300]:
data = []
for sema in dict_for_luk:
    data.append(dict_for_luk[sema])

labels = []
for i, sema in enumerate(dict_for_luk):
    labels.append([i]*len(dict_for_luk[sema]))

flattend_data = []
for vectors_list in data:
    flattend_data += vectors_list    

flattend_labels = list(flatten_list(labels))

In [301]:
# для 0 - micro average f1-score
lr_scoring = cross_val_score(classifier, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.9454038868932486, max:0.9636363636363636, min:0.9209726443768997, std:0.011700631422731686


In [302]:
# для 3 
lr_scoring = cross_val_score(classifier3, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.9074940130791194, max:0.9272727272727272, min:0.8848484848484849, std:0.016324960022168142


In [303]:
# для 8
lr_scoring = cross_val_score(classifier8, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.9336603113198858, max:0.9484848484848485, min:0.9212121212121213, std:0.008907821206948005


In [304]:
X_train, X_test, y_train, y_test = train_test_split(flattend_data, flattend_labels, test_size=0.3, random_state=11)

In [305]:
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       599
           1       0.88      0.88      0.88       193

   micro avg       0.94      0.94      0.94       792
   macro avg       0.92      0.92      0.92       792
weighted avg       0.94      0.94      0.94       792



### Для значений органа

In [306]:
# значений - 4
# словарь, в котором ключи - встречающиеся семы, значение - лист с векторами
dict_for_organ = dict()
for entity in contexts_dict["organ"]:
    dict_for_organ.setdefault(entity.sem_set, [])
    dict_for_organ[entity.sem_set].append(entity.vector)

for key in dict_for_organ:
    print(key, len(dict_for_organ[key]))

{'r-concr_der-shift_dt-partb'} 12
{'r-concr_pt-partb_pc-hum_pc-animal_hi-class'} 170
{'r-concr_t-org_hi-class'} 923
{'r-concr_t-tool-mus'} 30


In [307]:
data = []
for sema in dict_for_organ:
    data.append(dict_for_organ[sema])

labels = []
for i, sema in enumerate(dict_for_organ):
    labels.append([i]*len(dict_for_organ[sema]))

flattend_data = []
for vectors_list in data:
    flattend_data += vectors_list    

flattend_labels = list(flatten_list(labels))

In [308]:
# для 0 - micro average f1-score
lr_scoring = cross_val_score(classifier, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.9295865973191229, max:0.9574468085106385, min:0.8680555555555556, std:0.026612554083787477


In [309]:
# для 3 
lr_scoring = cross_val_score(classifier3, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.8255338698485613, max:0.8723404255319149, min:0.8014184397163121, std:0.024681661831673626


In [310]:
# для 8
lr_scoring = cross_val_score(classifier8, flattend_data, flattend_labels, scoring = scorer, cv = cv_strategy)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.9241936512506073, max:0.9513888888888888, min:0.9027777777777778, std:0.015713531998846807


In [311]:
X_train, X_test, y_train, y_test = train_test_split(flattend_data, flattend_labels, test_size=0.3, random_state=11)
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.79      0.76      0.78        55
           2       0.95      0.97      0.96       279
           3       1.00      0.40      0.57         5

   micro avg       0.92      0.92      0.92       341
   macro avg       0.69      0.53      0.58       341
weighted avg       0.92      0.92      0.92       341



  'precision', 'predicted', average, warn_for)


### Для значений вида 

In [312]:
# значений - 6
# словарь, в котором ключи - встречающиеся семы, значение - лист с векторами
dict_for_vid = dict()
for entity in contexts_dict["vid"]:
    dict_for_vid.setdefault(entity.sem_set, [])
    dict_for_vid[entity.sem_set].append(entity.vector)

for key in dict_for_vid:
    print(key, len(dict_for_vid[key]))

{'r-abstr_der-shift'} 1150
{'r-abstr_r-concr_pt-set_sc-X'} 656
{'r-abstr_t-ment'} 13
{'r-abstr_t-perc_der-v'} 1191
{'r-concr_t-doc'} 7
{'r-concr_t-workart'} 10


In [318]:
data = []
for sema in dict_for_vid:
    data.append(dict_for_vid[sema])

labels = []
for i, sema in enumerate(dict_for_vid):
    labels.append([i]*len(dict_for_vid[sema]))

flattend_data = []
for vectors_list in data:
    flattend_data += vectors_list    

flattend_labels = list(flatten_list(labels))

In [323]:
# вынуждены написать заново и изменить количество сплитов: их было 8, а в одном классе < 8 инстансов
scorer_vid = metrics.make_scorer(metrics.f1_score, average='micro')  # specify average here for it not to be 'binary'
cv_strategy_vid = StratifiedKFold(n_splits=6, shuffle=True, random_state=11)

In [324]:
# для 0 - micro average f1-score
lr_scoring = cross_val_score(classifier, flattend_data, flattend_labels, scoring = scorer_vid, cv = cv_strategy_vid)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.7277202331967865, max:0.7598425196850394, min:0.7111553784860557, std:0.01722749517254849


In [325]:
# для 3 
lr_scoring = cross_val_score(classifier3, flattend_data, flattend_labels, scoring = scorer_vid, cv = cv_strategy_vid)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.5480939158184526, max:0.5634920634920635, min:0.5197628458498024, std:0.014121923119055106


In [326]:
# для 8
lr_scoring = cross_val_score(classifier8, flattend_data, flattend_labels, scoring = scorer_vid, cv = cv_strategy_vid)
print('Log mean:%s, max:%s, min:%s, std:%s'%(lr_scoring.mean(), lr_scoring.max(), lr_scoring.min(), lr_scoring.std()))

Log mean:0.7237971531364235, max:0.7480314960629921, min:0.7023809523809523, std:0.014870203881450111


In [327]:
X_train, X_test, y_train, y_test = train_test_split(flattend_data, flattend_labels, test_size=0.3, random_state=11)
classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.67      0.68      0.68       325
           1       0.69      0.62      0.65       210
           2       0.00      0.00      0.00         3
           3       0.74      0.79      0.77       365
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         5

   micro avg       0.71      0.71      0.71       909
   macro avg       0.35      0.35      0.35       909
weighted avg       0.70      0.71      0.70       909



  'precision', 'predicted', average, warn_for)
