In [4]:
import logging
import random
import pandas as pd

import numpy as np
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')


In [3]:
import torch

seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)  # 这样设置后每次的初始化参数都是固定的
torch.manual_seed(seed)

<torch._C.Generator at 0x25de7f4fde0>

In [38]:
# split data to fold
fold_num = 10
data_file = 'E:/Dataset/新闻文本分类/train_set.csv'

def all_data2fold(fold_num, num=10000):
    fold_data = []
    f = pd.read_csv(data_file, sep='\t', encoding='UTF-8', nrows=20000)

    texts = f['text'].tolist()[:num]
    labels = f['label'].tolist()[:num]

    total = len(labels)
    index = list(range(total))

    # 随机打乱index [0,1,2,3...] -> [366, 3664, 223...]
    np.random.shuffle(index)  
    all_texts = []
    all_labels = []
    # 随机打乱样本
    for i in index:
        all_texts.append(texts[i])
        all_labels.append(labels[i])

    # all_texts = texts 顺序不同
    label2id = {}
    # 统计文本属于第几个label
    # {'6': [0,20,32,51,64...], '1':[...], ...}
    for i in range(total):
        label = str(all_labels[i])
        if label not in label2id:
            label2id[label] = [i]
        else:
            label2id[label].append(i)

    # [[], [], [], [], [], [], [], [], [], []]
    all_index = [[] for _ in range(fold_num)]  # 抽取哪些样本
    # 分层抽样
    for label, data in label2id.items():
        # print(label, len(data))
        batch_size = int(len(data) / fold_num)  # ('6':505) => batch_size:505/10=50, other:505-500=5
        other = len(data) - batch_size * fold_num
        for i in range(fold_num):
            cur_batch_size = batch_size + 1 if i < other else batch_size  # [51, 51, 51, 51, 51, 50, 50, 50, 50, 50]
            batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
            all_index[i].extend(batch_data)

    # all_index 抽样索引
    batch_size = int(total / fold_num)  # 
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    for fold in range(fold_num):
        num = len(all_index[fold])  
        print(num)
        texts = [all_texts[i] for i in all_index[fold]]  # 每折的样本
        labels = [all_labels[i] for i in all_index[fold]]

        if num > batch_size:
            fold_texts = texts[:batch_size]
            other_texts.extend(texts[batch_size:])
            fold_labels = labels[:batch_size]
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
        elif num < batch_size:
            end = start + batch_size - num
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            start = end
        else:
            fold_texts = texts
            fold_labels = labels

        assert batch_size == len(fold_labels)

        # shuffle
        index = list(range(batch_size))
        np.random.shuffle(index)

        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])

        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        fold_data.append(data)
    
    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))
    return fold_data

In [39]:
fold_data = all_data2fold(10)

2021-09-05 22:07:02,405 INFO: Fold lens [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]


1006
1005
1004
1003
1002
999
997
997
994
993


In [43]:
fold_id = 9

train_texts = []
for i in range(0, fold_id):
    data = fold_data[i]
    train_texts.extend(data['text'])
    
logging.info('Total %d docs.' % len(train_texts))

2021-09-05 22:09:17,176 INFO: Total 9000 docs.


In [51]:
logging.info('Start training...')
from gensim.models.word2vec import Word2Vec

num_features = 100     # Word vector dimensionality
num_workers = 8       # Number of threads to run in parallel

train_texts = list(map(lambda x: list(x.split()), train_texts))
model = Word2Vec(train_texts, workers=num_workers, size=num_features)
model.init_sims(replace=True)

# save model
model.save("./word2vec.bin")

2021-09-05 22:16:21,209 INFO: Start training...
2021-09-05 22:16:25,990 INFO: collecting all words and their counts
2021-09-05 22:16:25,991 INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-09-05 22:16:26,812 INFO: collected 5266 word types from a corpus of 8158891 raw words and 9000 sentences
2021-09-05 22:16:26,812 INFO: Loading a fresh vocabulary
2021-09-05 22:16:26,871 INFO: effective_min_count=5 retains 4323 unique words (82% of original 5266, drops 943)
2021-09-05 22:16:26,871 INFO: effective_min_count=5 leaves 8156992 word corpus (99% of original 8158891, drops 1899)
2021-09-05 22:16:26,883 INFO: deleting the raw counts dictionary of 5266 items
2021-09-05 22:16:26,884 INFO: sample=0.001 downsamples 61 most-common words
2021-09-05 22:16:26,884 INFO: downsampling leaves estimated 7041426 word corpus (86.3% of prior 8156992)
2021-09-05 22:16:26,894 INFO: estimated required memory for 4323 words and 100 dimensions: 5619900 bytes
2021-09-05 22:16:26,895 INF

In [52]:
# load model
model = Word2Vec.load("./word2vec.bin")

# convert format
model.wv.save_word2vec_format('./word2vec.txt', binary=False)

# 所以要怎么分类呢？

2021-09-05 22:17:05,133 INFO: loading Word2Vec object from ./word2vec.bin
2021-09-05 22:17:05,167 INFO: loading wv recursively from ./word2vec.bin.wv.* with mmap=None
2021-09-05 22:17:05,168 INFO: setting ignored attribute vectors_norm to None
2021-09-05 22:17:05,168 INFO: loading vocabulary recursively from ./word2vec.bin.vocabulary.* with mmap=None
2021-09-05 22:17:05,169 INFO: loading trainables recursively from ./word2vec.bin.trainables.* with mmap=None
2021-09-05 22:17:05,169 INFO: setting ignored attribute cum_table to None
2021-09-05 22:17:05,170 INFO: loaded ./word2vec.bin
2021-09-05 22:17:05,179 INFO: storing 4323x100 projection weights into ./word2vec.txt
