In [1]:
from data_load import *
# load data - > data_list
data_list = get_data("k_means_data.csv")

In [2]:
##vocabulary.py
from data_load import *
from collections import Counter
import jieba


#获得切完的词，和频率限制
def get_words(txt,min_frequency):
    vocab=[]
    c = Counter()
    for item in txt:
        seg_list = jieba.cut(item)
        for x in seg_list:
            if  x != '\r\n':
                c[x] += 1    
    for w, f in c.most_common():
        if f > 0:
            vocab.append(w)
    return vocab

#去除stopword，获得wordlist
def delete_stop_words(wordmax, stop_word_file='stopword_chinese.txt'):
    wordlist = []
    with open(stop_word_file,"r") as fp:
        words = fp.read()
        result = jieba.cut(words)
    new_words = []
    for r in result:
        new_words.append(r)
    stopword_set = set(new_words)
    for w in wordmax:
        if w not in stopword_set:
            wordlist.append(w)
    print("The trimed vocabulary is: {}".format(len(wordlist)))
    return wordlist


#构建属于自己的字典
def vocabulary(newlist):
    vocab = ["<pad>", "<unk>"]
    for w in newlist:
        vocab.append(w)
    print("The total size of our vocabulary is: {}".format(len(vocab)))
    return vocab

def get_vocabulary(data):
    list=get_words(data,1)
    #new_list = delete_stop_words(list)
    vocab_list=vocabulary(list)
    return vocab_list


if __name__ == '__main__':
    data = get_data("k_means_data.csv")
    vocab= get_vocabulary(data)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/vr/3b_f9c5j0yvgq7_41pp6_ktm0000gn/T/jieba.cache
Loading model cost 0.997 seconds.
Prefix dict has been built succesfully.


The total size of our vocabulary is: 1000


In [3]:
from data_load import *
from vocabulary import *
import tqdm
import numpy as np
import tensorflow as tf
#构建映射
def get_index(vocab):
    # 单词到编码的映射，例如不错-> 45
    word_to_token = {word: token for token, word in enumerate(vocab)}
    # 编码到单词的映射，例如6 -> 股市
    token_to_word = {token: word for word, token in word_to_token.items()}
    return word_to_token

#拆分句子，形成词组组成的句子
def dividesentence(data):
    sentence = []
    seg_list = jieba.cut(data, cut_all=False)
    for item in seg_list:
        sentence.append(item)
    return sentence
  #把句子变成对应的token数字，且固定句子的长度，统一格式  
def convert_text_to_token(sentence, limit_size,word_to_token_map):
    """
    根据单词-编码映射表将单个句子转化为token
    
    @param sentence: 句子，str类型
    @param word_to_token_map: 单词到编码的映射
    @param limit_size: 句子最大长度。超过该长度的句子进行截断，不足的句子进行pad补全
    
    return: 句子转换为token后的列表
    """
    # 获取unknown单词和pad的token
    unk_id = word_to_token_map["<unk>"]
    pad_id = word_to_token_map["<pad>"]
    
    # 对句子进行token转换，对于未在词典中出现过的词用unk的token填充
    tokens = [word_to_token_map.get(word, unk_id) for word in dividesentence(sentence)]
    
    # Pad
    if len(tokens) < limit_size:
        tokens.extend([0] * (limit_size - len(tokens)))
    # Trunc
    else:
        tokens = tokens[:limit_size]
    
    return tokens

#构建句子token
def get_sentence_token(data, SENTENCE_LIMIT_SIZE,word_to_token):
    sentence_token=[]
    for sentence in tqdm.tqdm(data):
        tokens = convert_text_to_token(sentence, SENTENCE_LIMIT_SIZE,word_to_token)
        sentence_token.append(tokens)
    return sentence_token

 #使用知乎预先训练的词向量
def loadword2vec(vocab):
    with open("/Users/james/Downloads/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5", 'r') as f:
        words = set()
        word_to_vec = {}
        for line in f:
            line = line.strip().split()
            # 当前单词
            curr_word = line[0]
            words.add(curr_word)
            # 当前词向量
            word_to_vec[curr_word] = np.array(line[1:], dtype=np.float32)
        print("have pretrained-vectors in vocab is: {}".format(len(set(vocab)&set(words))))
        print("do not have pretrained-vectors in vocab is : {}".format(len(set(vocab))-len(set(vocab)&set(words))))
        return word_to_vec


# 构建词向量矩阵
def word_matrix(VOCAB_SIZE, EMBEDDING_SIZE,word_to_token,word_to_vec):
     # 初始化词向量矩阵（这里命名为static是因为这个词向量矩阵用预训练好的填充，无需重新训练）
    static_embeddings = np.zeros([VOCAB_SIZE, EMBEDDING_SIZE])
    for word, token in tqdm.tqdm(word_to_token.items()):
    # 用glove词向量填充，如果没有对应的词向量，则用随机数填充
        word_vector = word_to_vec.get(word, 0.2 * np.random.random(EMBEDDING_SIZE) - 0.1)
        static_embeddings[token, :] = word_vector

    # 重置PAD为0向量
    pad_id = word_to_token["<pad>"]
    static_embeddings[pad_id, :] = np.zeros(EMBEDDING_SIZE)
    return static_embeddings


#构成数据集-句向量矩阵（20*300）,通过句向量求和来合成句向量，输出句向量numpy格式
def get_data_tokens(static_embeddings, sentence_token):
    embed = tf.nn.embedding_lookup(static_embeddings, sentence_token)
    # 相加词向量得到句子向量
    sum_embed = tf.reduce_sum(embed, axis=1)
    sess = tf.Session()
    with sess.as_default():
        result=sum_embed.eval()
    return result


def get_complete_data(data,vocab,sentence_length,VOCAB_SIZE, EMBEDDING_SIZE):
    word_to_token=get_index(vocab)
    sentence_token=get_sentence_token(data,sentence_length,word_to_token)
    word_to_vec=loadword2vec(vocab)
    static_embeddings=word_matrix(VOCAB_SIZE,300,word_to_token,word_to_vec)
    print("static embeddings shape is : {}".format(static_embeddings.shape))
    input=get_data_tokens(static_embeddings,sentence_token)
    return  sentence_token, input
if __name__ == '__main__':
    #data_list 是用list形式，可以后续用来查询输出的问句
    #data_list = get_data("k_means_data.csv")
    # 获得词汇表
    #vocab=get_vocabulary(data_list)
    VOCAB_SIZE = len(vocab)
    EMBEDDING_SIZE= 300
    #获得输入的句向量
    sentence_token,input_data=get_complete_data(data_list,vocab,20,VOCAB_SIZE,EMBEDDING_SIZE)
    print("input size is : {}".format(input_data.shape))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
100%|██████████| 266/266 [00:00<00:00, 4489.32it/s]
100%|██████████| 1000/1000 [00:00<00:00, 42804.26it/s]

have pretrained-vectors in vocab is: 885
do not have pretrained-vectors in vocab is : 115
static embeddings shape is : (1000, 300)





input size is : (266, 300)


In [4]:
#获得句向量对应的index，用来输出句子的
answer_index_list=list(input_data)

In [5]:
#获得最后的结果，通过index
def get_answer_index(data_list,answer_index_list,sentence):
    length = len(answer_index_list)
    for i in range (0,length):
        if (sentence==answer_index_list[i]).all():
            print(i)
            return i,data_list[i]

In [63]:
get_answer_index(data_list,answer_index_list,answer_index_list[53])

53


(53, '连这种传言都信的人，也配出来论？那个纯粹是民间无知愚昧人士根据封建那一套想象出来的但是安营如果活着，确实可能性不小')

In [9]:
from sklearn.cluster import KMeans

In [10]:
#  构造聚类器
estimator = KMeans(n_clusters=3)
s =estimator.fit(input_data)

In [11]:
#获得每个句子的簇
s.labels_

array([0, 0, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 2, 0, 0, 2, 1, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 0, 2, 1, 2, 1, 2, 0, 0, 2, 2, 1, 2, 1, 1, 0, 2, 1, 1, 0, 2,
       1, 2, 2, 2, 2, 0, 1, 1, 2, 0, 0, 0, 2, 2, 1, 1, 0, 2, 1, 0, 2, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 2, 2, 1, 1, 1, 1, 1,
       2, 0, 1, 0, 2, 0, 2, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 2, 1, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 1, 2,
       1, 2, 0, 0, 0, 2, 2, 1, 1, 1, 0, 2, 0, 1, 0, 2, 1, 0, 2, 2, 2, 0,
       1, 2, 2, 0, 0, 0, 2, 0, 1, 1, 0, 2, 1, 1, 2, 0, 2, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 2, 0,
       2, 1], dtype=int32)

In [68]:
#获得比较k值的方法，用calinski_harabaz分数，值越大，表示效果越好
from sklearn import metrics
metrics.calinski_harabaz_score(input_data, s.labels_)  

94.69743812103479

In [13]:
#迭代器，比较聚内平均距离
for i in range(5,150,5):
    clf = KMeans(n_clusters=i).fit(input_data)

    print(i , clf.inertia_) 

5 44923.08013445907
6 43278.4261240268
7 42296.962979043536
8 40935.32202233313
9 40167.07604947268
10 39282.04507459506
11 38403.11659847312
12 37827.97562953392
13 36943.09684889734
14 36623.678432261324
15 35384.809354404395
16 34570.361877373056
17 34043.411623686756
18 33289.2083515297
19 32648.4822540667
20 31867.966197566424
21 31291.553885295074
22 30737.623641536287
23 29855.74671659542
24 29356.751687867156
25 29051.857215439337
26 27449.459160273847
27 27631.05149385948
28 27215.823396640146
29 26783.5622308888
30 25514.17488961103
31 25363.335110352982
32 24814.27569433394
33 24585.275654475878
34 23889.681583548958
35 23456.40200944695
36 22858.8693796908
37 22584.97518240666
38 21889.06811709075
39 21367.320370143763
40 21262.459794468694
41 20707.08164884008
42 20233.47275207853
43 19593.68082606832
44 19308.54843246861
45 18937.020971453934
46 18491.33705340013
47 17870.086264529502
48 17656.480703375208
49 17389.146084752178
50 17181.45770215269
51 16490.980202401326
5

  return_n_iter=True)


164 4.863165702572209e-30


  return_n_iter=True)


165 4.863165702572209e-30


  return_n_iter=True)


166 4.863165702572209e-30


  return_n_iter=True)


167 4.863165702572209e-30


  return_n_iter=True)


168 4.863165702572209e-30


  return_n_iter=True)


169 4.863165702572209e-30


  return_n_iter=True)


170 4.863165702572209e-30


  return_n_iter=True)


171 4.863165702572209e-30


  return_n_iter=True)


172 4.863165702572209e-30


  return_n_iter=True)


173 4.863165702572209e-30


  return_n_iter=True)


174 4.863165702572209e-30


  return_n_iter=True)


175 4.863165702572209e-30


  return_n_iter=True)


176 4.863165702572209e-30


  return_n_iter=True)


177 4.863165702572209e-30


  return_n_iter=True)


178 4.863165702572209e-30


  return_n_iter=True)


179 4.863165702572209e-30


  return_n_iter=True)


180 4.863165702572209e-30


  return_n_iter=True)


181 4.863165702572209e-30


  return_n_iter=True)


182 4.863165702572209e-30


  return_n_iter=True)


183 4.863165702572209e-30


  return_n_iter=True)


184 4.863165702572209e-30


  return_n_iter=True)


185 4.863165702572209e-30


  return_n_iter=True)


186 4.863165702572209e-30


  return_n_iter=True)


187 4.863165702572209e-30


  return_n_iter=True)


188 4.863165702572209e-30


  return_n_iter=True)


189 4.863165702572209e-30


  return_n_iter=True)


190 4.863165702572209e-30


  return_n_iter=True)


191 4.863165702572209e-30


  return_n_iter=True)


192 4.863165702572209e-30


  return_n_iter=True)


193 4.863165702572209e-30


  return_n_iter=True)


194 4.863165702572209e-30


  return_n_iter=True)


195 4.863165702572209e-30


  return_n_iter=True)


196 4.863165702572209e-30


  return_n_iter=True)


197 4.863165702572209e-30


  return_n_iter=True)


198 4.863165702572209e-30


  return_n_iter=True)


199 4.863165702572209e-30


  return_n_iter=True)


200 4.863165702572209e-30


  return_n_iter=True)


201 4.863165702572209e-30


  return_n_iter=True)


202 4.863165702572209e-30


  return_n_iter=True)


203 4.863165702572209e-30


  return_n_iter=True)


204 4.863165702572209e-30


  return_n_iter=True)


205 4.863165702572209e-30


  return_n_iter=True)


206 4.863165702572209e-30


  return_n_iter=True)


207 4.863165702572209e-30


  return_n_iter=True)


208 4.863165702572209e-30


  return_n_iter=True)


209 4.863165702572209e-30


  return_n_iter=True)


210 4.863165702572209e-30


  return_n_iter=True)


211 0.0


  return_n_iter=True)


212 0.0


  return_n_iter=True)


213 0.0


  return_n_iter=True)


214 0.0


  return_n_iter=True)


215 0.0


  return_n_iter=True)


216 0.0


  return_n_iter=True)


217 0.0


  return_n_iter=True)


218 0.0


  return_n_iter=True)


219 0.0


  return_n_iter=True)


220 0.0


  return_n_iter=True)


221 0.0


  return_n_iter=True)


222 0.0


  return_n_iter=True)


223 0.0
224 0.0


  return_n_iter=True)


In [26]:
#迭代器，比较calinski_harabaz分数
for i in range(158,164,1):
    clf = KMeans(n_clusters=i).fit(input_data)
    a=metrics.calinski_harabaz_score(input_data, clf.labels_)  
    print(i , a)

158 741.1473863973046
159 1054.0650676602409
160 1523.7126864689274
161 3224.066571948478
162 9530.638011862746
163 4.767210551143021e+33


In [69]:
#获得每个质心，然后找到最近的余弦点
np.shape(s.cluster_centers_)[0]

3

In [70]:
s.labels_

array([0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 1, 2, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 2, 2, 1, 1, 1, 0, 1, 1, 1, 2, 2, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 2, 0, 0, 2, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 0, 2, 1, 2, 1, 2, 0, 1, 2, 2, 1, 2, 1, 1, 0, 2, 1, 1, 0, 2,
       1, 2, 2, 2, 2, 0, 1, 1, 0, 0, 0, 0, 2, 2, 1, 1, 1, 2, 1, 0, 2, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 0, 2, 1, 1, 1, 1, 1,
       2, 0, 1, 0, 2, 0, 2, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 2, 2, 1, 2, 1, 0, 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 2, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 1, 2,
       1, 2, 0, 1, 0, 2, 2, 1, 1, 1, 0, 2, 0, 1, 0, 2, 1, 1, 2, 2, 2, 0,
       1, 2, 0, 0, 0, 0, 2, 0, 1, 1, 1, 2, 1, 1, 2, 0, 2, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 2, 0,
       2, 1], dtype=int32)

In [30]:
def get_cossimi(x,y):
    myx=np.array(x) #将列表转化为数组，更好的数学理解是向量
    myy=np.array(y) #将列表转化为数组，更好的数学理解是向量
    cos1=np.sum(myx*myy) #cos(a,b)=a*b/(|a|+|b|)
    cos21=np.sqrt(sum(myx*myx))
    cos22=np.sqrt(sum(myy*myy))
    return cos1/(float(cos21*cos22))


In [31]:
input_data[1]

array([-7.92458568e-02,  1.51802451e+00,  1.94196821e+00, -1.77186991e+00,
       -1.55591530e+00,  5.78934763e-01,  2.77724613e+00, -1.61457787e+00,
       -8.24363977e-01, -5.03943779e-01,  2.12878467e-01,  1.98830049e+00,
        2.83480695e+00,  2.82064379e+00, -1.31438138e+00,  8.64689571e-02,
        1.66321806e-01,  1.74179419e+00, -4.92142439e-01, -9.47234458e-01,
        1.78792463e+00, -2.10403770e+00,  2.65796839e-01, -1.13086977e-01,
       -2.35414854e+00,  3.76555696e+00,  2.47335220e+00,  3.93738790e+00,
       -2.77649963e-01, -1.50720212e-01,  2.54479603e+00, -9.29546436e-01,
       -1.42212332e+00,  8.50759219e-01,  2.09704793e+00,  6.34209707e-01,
        5.86274842e-01,  9.94656472e-01,  2.29648440e+00,  1.01480737e+00,
        1.18241357e+00,  8.02506438e-01, -4.95399224e-01, -2.68267610e+00,
        1.32411661e+00, -1.40049101e+00, -1.76784715e+00, -1.82391748e+00,
        4.52321786e-01, -1.14899606e+00,  4.00698935e+00,  1.58145852e+00,
       -7.91745573e-01, -

In [37]:
s.cluster_centers_[0]

array([ 2.44758547e-03,  9.24500430e-01,  1.15451855e+00, -1.29211043e+00,
       -1.45596855e+00,  4.91889666e-01,  1.92733880e+00, -1.78689181e+00,
       -7.91658566e-01,  5.38666809e-01,  8.23159508e-02,  1.15500532e+00,
        1.62244141e+00,  1.73319439e+00, -2.48987150e-01, -4.42853959e-01,
        2.80746980e-01,  7.33729432e-01,  8.64080024e-01, -2.69097914e-01,
        1.73238675e+00, -1.11757199e+00, -7.64594165e-01, -1.18314452e+00,
       -2.59510267e+00,  2.43209974e+00,  1.30043058e+00,  2.68484936e+00,
        4.44119618e-01,  8.10533470e-01,  1.27491106e+00, -6.67449493e-01,
       -1.09164275e+00,  9.06760525e-01,  1.40561438e+00,  8.08707802e-01,
        1.82318096e-01,  5.69673969e-01,  2.06793129e+00,  2.57056634e-01,
        4.22817422e-01,  2.73528108e-01, -6.45634356e-01, -1.57838642e+00,
        1.33684388e+00, -9.60186185e-01, -2.03306071e+00, -5.31882557e-01,
        7.08207115e-01, -9.52003624e-01,  3.04944894e+00,  1.10099416e+00,
        1.41439020e-01, -

In [38]:
len(input_data)

266

In [61]:
# 获得最接近的值，从数组中获得相关cos最大值
# 输出最接近的点

def get_max_cos(cluster,input_data):
    temp = 0
    a=0
    for i in input_data:
        if a < get_cossimi(cluster,i):
            a=get_cossimi(cluster,i)
            temp= i
    print(temp,a)
    return temp,a 

In [60]:
get_max_cos(s.cluster_centers_[0],input_data)

[-0.14514322  1.78785112  2.27519053 -1.66879513 -2.00052395 -0.06474336
  4.82958741 -1.90504948 -1.10369524  0.65133724 -1.00914583  2.96413337
  2.36997687  3.35831753 -0.61974663  0.13711979  1.64265158  2.0771303
  0.32199995  0.04923273  2.91874369 -1.72681911 -1.09984724 -1.84802167
 -4.28460097  4.96053053  1.60942303  3.31080918 -0.41039399  1.37185609
  3.8686929  -2.44731619 -2.11953144  1.41042194  2.39569283  2.75817739
  1.52847599 -0.19277446  1.75352129  2.98657009  0.35717511  0.77840959
 -0.71085591 -4.01879468  1.7440832  -0.94726283 -4.20468825  0.30867605
  0.60517148 -3.46434295  5.30843215  1.0968339  -0.04206695 -0.86133197
  1.27122234 -1.796129   -0.80524631 -1.67317214 -5.95734995  1.10715923
  2.17561861 -1.66124653 -2.49424541  2.68204607 -0.10958093  0.59728569
 -0.83438196 -1.70822391  0.5145805  -1.21205044 -4.25691302 -1.14641307
  0.86216378 -2.92337047  1.07571298 -3.28651592  1.2046419   0.93894882
 -0.45988042 -1.64820241 -2.19860975 -2.97696198  0.

(array([-0.14514322,  1.78785112,  2.27519053, -1.66879513, -2.00052395,
        -0.06474336,  4.82958741, -1.90504948, -1.10369524,  0.65133724,
        -1.00914583,  2.96413337,  2.36997687,  3.35831753, -0.61974663,
         0.13711979,  1.64265158,  2.0771303 ,  0.32199995,  0.04923273,
         2.91874369, -1.72681911, -1.09984724, -1.84802167, -4.28460097,
         4.96053053,  1.60942303,  3.31080918, -0.41039399,  1.37185609,
         3.8686929 , -2.44731619, -2.11953144,  1.41042194,  2.39569283,
         2.75817739,  1.52847599, -0.19277446,  1.75352129,  2.98657009,
         0.35717511,  0.77840959, -0.71085591, -4.01879468,  1.7440832 ,
        -0.94726283, -4.20468825,  0.30867605,  0.60517148, -3.46434295,
         5.30843215,  1.0968339 , -0.04206695, -0.86133197,  1.27122234,
        -1.796129  , -0.80524631, -1.67317214, -5.95734995,  1.10715923,
         2.17561861, -1.66124653, -2.49424541,  2.68204607, -0.10958093,
         0.59728569, -0.83438196, -1.70822391,  0.5

In [13]:
# 把点分到自己的相应的簇里面
label_list=list(s.labels_)
answer_index_list

[array([ 0.11983931,  1.32457309,  1.71132332, -2.4418267 , -2.34361808,
         1.45894349,  3.33205095, -2.19681989, -0.9233544 ,  1.38131509,
         1.72483127,  2.28144835,  1.51522469,  2.09573339, -0.21533816,
        -0.87411033,  1.26369235,  2.15247279,  0.16501201, -0.61063088,
         1.90680235, -3.35881002,  0.38145066, -1.05106322, -2.22741685,
         2.86688533,  1.94726701,  3.81425235,  0.07299277, -0.26878763,
         0.32593501, -0.58590259,  0.33923525,  0.25602556,  1.85529752,
         0.42557939, -1.29927943,  0.41820542,  1.64499246, -1.79987617,
        -0.43918035, -0.50698732, -0.72701349, -1.49708307,  1.14377687,
        -1.27406785, -2.94947973, -1.03189457,  1.09873038, -0.84618943,
         2.02629803,  1.04802767,  0.26076186, -0.73731023, -0.34545685,
        -0.49764589, -1.3246956 ,  1.36076113, -2.78654616,  0.03864638,
         2.95155376, -1.99253497,  0.28016661,  1.16875353,  1.84376524,
        -0.54559035, -0.37995524, -0.36666526, -1.2

In [14]:
type(label_list), type(answer_index_list)

(list, list)

In [71]:
len(answer_index_list)

266

In [7]:
#把
test_answer_list=[]
for i in answer_index_list:
    test_answer_list.append(list(i))
    

In [17]:
def make_index_map(test_label,test_list):
    map={}
    length = len(test_label)
    print(length)
    for i in range(0,length):
        key = str(test_label[i])
        print(key)
        if map.get(key) is not None:
            print("dwqdwqdwqodwqodhwqohdoqw")
            num=str(test_list[i])
            templist=map.setdefault(num)
            templist.append(test_list[i])
            dicttemp={num:templist}
            map.update(dicttemp)
            print(map)
        else:
            print("next")
            newlist=[]
            newlist.append(test_list[i])
            newkey=str(test_label[i])
            dict={newkey:newlist}
            map.update(dict)
        print(map)
    return map

        


In [24]:
test_label1=[0,1,2,2,3,3,3,3,3,3,3,1,2,2]

In [25]:
test_label1

[0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 2, 2]

In [26]:
type(label_list),type(answer_index_list)

(list, list)

In [36]:
test_list_5=test_answer_list[:5]
label_list_5 = label_list[:5]

In [37]:
map_test=make_index_map(label_list_5,test_list_5)

5
0
next
{'0': [[0.11983930619058775, 1.3245730908774958, 1.7113233229401712, -2.4418266953183725, -2.343618080539793, 1.4589434863734283, 3.3320509531866374, -2.1968198916302004, -0.9233544037165313, 1.3813150934422571, 1.7248312693426664, 2.281448352694574, 1.5152246871476833, 2.0957333925867165, -0.21533816071229822, -0.8741103316732622, 1.2636923524084644, 2.152472791475641, 0.16501200659961324, -0.6106308779063736, 1.9068023517583854, -3.358810020594622, 0.38145066270316275, -1.0510632192384262, -2.227416849079938, 2.8668853277373123, 1.9472670102818328, 3.814252346793187, 0.07299276909924923, -0.2687876319787796, 0.32593500520316376, -0.5859025890320142, 0.33923525180071556, 0.2560255630061896, 1.8552975238577405, 0.4255793903589342, -1.2992794308302043, 0.4182054219450545, 1.6449924557837017, -1.7998761698398889, -0.43918035466637484, -0.5069873240071616, -0.7270134929316592, -1.497083066753537, 1.1437768653301923, -1.2740678453572016, -2.9494797343669763, -1.0318945721499952, 1

KeyError: '[-0.26741705823366735, 1.622009319373614, 2.1089649650298896, -1.9232310139500708, -1.6157865455513964, 0.6188123010517877, 2.6553363231757174, -1.724009344096714, -0.7530132846517619, -0.7640862259089124, 0.2969341734953135, 2.236796113419394, 2.872240961218911, 2.8969347663395957, -1.1854266507637181, 0.03801621644590525, 0.16293439330267923, 1.5414611270848773, -0.22358143746105896, -0.8799455738230297, 1.7669267176903243, -1.9964739023114362, 0.42786168361329224, -0.3263336936027089, -2.3148998414922506, 3.585416277570447, 2.7047610333619616, 4.180989040601935, -0.4372350514203438, -0.21454213977005665, 2.529600795599024, -0.8993377698818847, -1.1308844458875595, 0.9005801576894434, 2.091618397342746, 0.6655346517069938, 0.7079119528315319, 1.0296886846703737, 2.3852277874098453, 0.9445893765541655, 1.0598932124211575, 0.6589803889475468, -0.5340975198786397, -2.796188157620572, 1.1126045326407878, -1.2740733973078582, -1.8656078669084366, -1.9704255986083123, 0.6006354871459219, -1.216590482713446, 4.082441956013078, 1.6852209009875643, -0.7267810094493041, -1.6132284919792075, 0.8322092243587211, -1.7458523174937595, -1.3890867269903528, -0.38270870792986145, -3.8922253526192367, 0.48387153869148625, 2.5241766867285333, -0.68994502829765, -1.180138001684136, 1.2088219557764233, 0.2501384299056315, -1.5931580909589087, -1.4416687041584626, -1.1554902773355495, -0.05396827247009872, 0.10736013585049442, -3.970234782229392, 0.11552154859696984, 1.6697554247520554, -2.093130954925635, 0.27201943403837514, -1.7343012007380365, 2.374600211369789, 1.9875796488045614, 0.09275757930489359, -0.14234860816133432, -0.7195987591228226, -1.4135864742815247, 1.0139398267966724, -0.11400630311559379, -2.5046668306785804, 1.5678816168169358, 0.4642693666945553, 2.270247448036014, -1.53335186086262, -1.6979427949459298, 1.0864966733526162, 0.181902956465388, -1.9901523005524195, -2.6406460306389574, -1.884292238394194, -0.7703991418698682, -4.852295760147109, -1.7485191831732125, 2.302885462710731, 0.13259889297214777, 1.7837545703046835, -1.6456506344933994, -1.214460200824193, -1.8140607542363778, 1.5742708117459776, -2.218705850443785, 0.22342276047703546, 0.4685631877143249, 0.26739789428351346, 4.152261574471684, -0.7964641149210556, 0.4612364835617585, -0.4009637045808288, -2.0837443786833827, 1.5979298745883388, -1.8502737411396069, -1.1738051537893823, 1.021247297551143, 1.2089210201250735, -1.5826496324720118, -2.974120062110024, 1.493176338449923, 1.3434513952867668, -1.1867635139765604, -1.5688391436422684, 2.1010982915536545, 1.0183844107218685, 1.6762241879996678, 1.220925999708858, 3.7132742643074605, -0.4489049181022545, -0.512035544745943, -2.294228367942776, -0.575360231935049, 1.2685323939598807, 1.6594689895769779, -0.1077820900868765, -2.768461546935805, -0.4065524731948491, 1.42455104920883, 0.7984579314123832, -0.8586657420716135, -1.089901432670639, -0.38803665575041135, -3.942606868161532, 2.262641743522028, 1.9531383970040472, 0.14740333525824012, -0.49347010690866444, 2.453122173735072, 0.6657200712085911, 0.40436228079768444, 0.8488290810243482, 1.173426663307659, -0.19192419414494583, -0.03985833388129567, 0.7994761164539075, -0.8103767955681991, 0.716361128312275, -1.193824854684296, -2.10343921348427, -1.4496990106574734, -0.6161259232219403, -0.43184256331337445, -4.204655710942932, 0.6282899930024438, 0.9871250747009175, 2.702263539306732, -1.2208298119457335, -0.5301608454572978, 1.5790944009143373, -1.8394905656370257, -1.0846333146545883, -1.6335284803047274, 2.4121420020955373, -2.2253382181354384, 0.4356187472303483, -0.9950452722481593, 1.808155725754473, -0.4214610574873834, -1.7476748308687227, -0.6340571097526434, -1.665423693538198, -2.002207236409268, 0.559378128771957, -0.2767908610607892, -1.3342482075861501, 1.3569928790013952, -2.3969895325362867, 3.0537956457120328, -2.1451636453823597, -1.7876458000286761, 3.136681372588062, -2.2547351424300226, -2.659378396789411, -1.774275757746968, -0.20471030491501363, -1.0146864289514985, -3.0729000475030963, 0.31175065541641145, -2.10574861588354, 2.2832502126399836, 0.7526003688587708, -0.2550064530243443, -0.817666987537063, -3.073658649298782, 0.9872303406598596, 1.1783662960785324, 1.141852893551626, -2.233650835311792, 2.5470463547453193, 1.03459536284962, 0.03550943338311585, -1.3659137699640447, 0.019618390912021555, 0.03924588229220055, -1.967979454374891, -1.9104188456098226, 4.071924279036973, -1.3377610434136464, -3.717069984251214, -1.8244966181800764, -0.9362308319811158, 1.502680729659219, 1.291369642540094, 1.9932767177440245, 0.7924863755255749, -1.586005564738926, 0.7042342465704452, -0.5293877759917001, -1.330216714481155, 1.6102356455294053, -2.3871334769764134, 0.5508556719481328, 0.7896023071736678, 0.6473041224353757, 2.965638379242812, -3.3286998796808547, -0.45229440718446867, 0.2789394236736732, -0.17708483752691961, 2.827073468713357, -0.8894009190730794, 1.5188288328871598, -3.2176365175075228, -0.3220848133385648, -3.150246188533248, 0.5403295379278403, -2.1975045836230658, -2.2382079347682238, 0.08471559186880084, -3.9927475901825376, -1.9736546907128574, -1.0279432238432777, -0.49657200728246553, -1.3943152356755775, -1.496453648965173, -0.7029503691162471, 1.8801909053162102, -2.542048084837022, 1.430542883403695, 0.35372763126057793, -0.8007410185048625, 1.5547634533796355, 0.6103512159937957, 2.780963391592753, 2.891348311553826, 0.5648778136878996, 1.517877539700283, 0.8767898423179783, 0.06552280779749664, 3.21408610608767, -2.3999850900807154, -2.336651971698973, 0.9816781324096134, 2.3899887118606586, 0.6016586005919526, 0.34250438206700085, -0.6748373626640383, -1.3202816489306528, -0.4717844104256803, -0.47664929157161173, -1.993017059476716, -2.6302088583351475, 3.9529772575654714, 1.0611848373813721, -1.9583455967226813, 0.76282107621569, 1.4439692626963212, -2.4294628454501006, 1.0166347136074705, 3.502003217908901, 1.61374750362249, 2.425760695655789, 0.35951371776076924, -0.6457293197025518, -2.4307902990227834, 2.282257156739056, -1.2868328466351246, -3.731845966770702]'

In [34]:
print(map_test)

{'0': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], '2': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], '1': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [30]:
map_test

{'0': [0], '1': [1, 1], '2': [2, 2, 2, 2], '3': [3, 3, 3, 3, 3, 3, 3]}

In [32]:
map_test['1']

[1, 1]

In [89]:
map_test={"a":test_list1}

In [35]:
templist = map_test["a"]
templist.append(0)
map_test={"a":templist}

In [44]:
a=0
c=str(a)

In [45]:
c

'0'

In [22]:
map_123={"a":"1","b":"1"}

In [23]:
dict={"c":"1"}
map_123.update(dict)

In [24]:
map_123

{'a': '1', 'b': '1', 'c': '1'}