In [3]:
import logging
import random

import numpy as np
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# set seed 
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

# set cuda
gpu = 0
use_cuda = gpu >= 0 and torch.cuda.is_available()
if use_cuda:
    torch.cuda.set_device(gpu)
    device = torch.device("cuda", gpu)
else:
    device = torch.device("cpu")
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)



2020-07-28 14:47:51,902 INFO: Use cuda: True, gpu id: 0.


In [5]:
fold_num = 10
data_file = '/home/hy/Documents/tianchi/train_set.csv'
import pandas as pd


def all_data2fold(fold_num, num=10000):
    fold_data = []
    f = pd.read_csv(data_file, sep='\t', encoding='UTF-8')
    texts = f['text'].tolist()[:num]
    labels = f['label'].tolist()[:num]

    total = len(labels)

    index = list(range(total))
    np.random.shuffle(index)

    all_texts = []
    all_labels = []
    for i in index:
        all_texts.append(texts[i])
        all_labels.append(labels[i])

    label2id = {}
    for i in range(total):
        label = str(all_labels[i])
        if label not in label2id:
            label2id[label] = [i]
        else:
            label2id[label].append(i)

    all_index = [[] for _ in range(fold_num)]
    for label, data in label2id.items():
        # print(label, len(data))
        batch_size = int(len(data) / fold_num)
        other = len(data) - batch_size * fold_num
        for i in range(fold_num):
            cur_batch_size = batch_size + 1 if i < other else batch_size
            # print(cur_batch_size)
            batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
            all_index[i].extend(batch_data)

    batch_size = int(total / fold_num)
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    for fold in range(fold_num):
        num = len(all_index[fold])
        texts = [all_texts[i] for i in all_index[fold]]
        labels = [all_labels[i] for i in all_index[fold]]

        if num > batch_size:
            fold_texts = texts[:batch_size]
            other_texts.extend(texts[batch_size:])
            fold_labels = labels[:batch_size]
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
        elif num < batch_size:
            end = start + batch_size - num
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            start = end
        else:
            fold_texts = texts
            fold_labels = labels

        assert batch_size == len(fold_labels)

        # shuffle
        index = list(range(batch_size))
        np.random.shuffle(index)

        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])

        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        fold_data.append(data)

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data


fold_data = all_data2fold(10)

2020-07-28 14:48:48,479 INFO: Fold lens [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]


In [6]:
# build train data for word2vec
fold_id = 9

train_texts = []
for i in range(0, fold_id):
    data = fold_data[i]
    train_texts.extend(data['text'])
    
logging.info('Total %d docs.' % len(train_texts))

2020-07-28 17:09:54,157 INFO: Total 9000 docs.


In [8]:
logging.info('Start training...')
from gensim.models.word2vec import Word2Vec

num_features = 100     # Word vector dimensionality
num_workers = 8       # Number of threads to run in parallel

train_texts = list(map(lambda x: list(x.split()), train_texts))
model = Word2Vec(train_texts, workers=num_workers, size=num_features)
model.init_sims(replace=True)

# save model
model.save("./word2vec.bin")

2020-07-28 17:12:54,045 INFO: Start training...
2020-07-28 17:12:55,871 INFO: 'pattern' package not found; tag filters are not available for English
2020-07-28 17:12:56,980 INFO: collecting all words and their counts
2020-07-28 17:12:56,981 INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-28 17:12:58,207 INFO: collected 5295 word types from a corpus of 8191447 raw words and 9000 sentences
2020-07-28 17:12:58,207 INFO: Loading a fresh vocabulary
2020-07-28 17:12:58,223 INFO: effective_min_count=5 retains 4335 unique words (81% of original 5295, drops 960)
2020-07-28 17:12:58,224 INFO: effective_min_count=5 leaves 8189498 word corpus (99% of original 8191447, drops 1949)
2020-07-28 17:12:58,238 INFO: deleting the raw counts dictionary of 5295 items
2020-07-28 17:12:58,239 INFO: sample=0.001 downsamples 61 most-common words
2020-07-28 17:12:58,239 INFO: downsampling leaves estimated 7070438 word corpus (86.3% of prior 8189498)
2020-07-28 17:12:58,251 INFO: e

In [9]:
# load model
model = Word2Vec.load("./word2vec.bin")

# convert format
model.wv.save_word2vec_format('./word2vec.txt', binary=False)

2020-07-28 17:13:55,469 INFO: loading Word2Vec object from ./word2vec.bin
2020-07-28 17:13:55,499 INFO: loading wv recursively from ./word2vec.bin.wv.* with mmap=None
2020-07-28 17:13:55,499 INFO: setting ignored attribute vectors_norm to None
2020-07-28 17:13:55,500 INFO: loading vocabulary recursively from ./word2vec.bin.vocabulary.* with mmap=None
2020-07-28 17:13:55,501 INFO: loading trainables recursively from ./word2vec.bin.trainables.* with mmap=None
2020-07-28 17:13:55,501 INFO: setting ignored attribute cum_table to None
2020-07-28 17:13:55,502 INFO: loaded ./word2vec.bin
2020-07-28 17:13:55,509 INFO: storing 4335x100 projection weights into ./word2vec.txt


In [None]:
model = model.wv.load_word2vec_format('/home/hy/Documents/tianchi/word2vec.txt', binary=False)


In [None]:
model.wv['6758']

2020-07-28 17:17:37,386 INFO: loading projection weights from /home/hy/Documents/tianchi/word2vec.txt
2020-07-28 17:17:38,048 INFO: loaded (5976, 100) matrix from /home/hy/Documents/tianchi/word2vec.txt


<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f7df71f1978>

In [11]:
model.wv['6758']

array([ 0.06302957, -0.0816782 ,  0.12224683,  0.05743894,  0.09141383,
        0.08220726,  0.133415  , -0.02033619,  0.12548573,  0.14604083,
       -0.06880651, -0.04293273, -0.07917397,  0.08346818,  0.05931692,
        0.01149672, -0.13207817,  0.03288741, -0.00528292,  0.1524673 ,
       -0.08425879,  0.00353841,  0.09706791,  0.14164776,  0.14601718,
        0.12156782,  0.05306974, -0.03622913,  0.03124568, -0.13916202,
       -0.03997386,  0.023746  , -0.08548971, -0.09077025,  0.08922168,
       -0.12293343, -0.11359778,  0.05834633, -0.05858888, -0.05669861,
        0.02146043, -0.09480041, -0.04206865,  0.00633098,  0.02165185,
        0.04812855,  0.08169613,  0.03259791,  0.01674422, -0.07448601,
        0.1639229 ,  0.02134857,  0.05822015, -0.07742783,  0.0232669 ,
        0.21945013, -0.11466961,  0.04866209,  0.08020634, -0.09274981,
        0.04626997,  0.12170841, -0.12567827, -0.06670913, -0.11654548,
        0.0465796 ,  0.08097442, -0.1162102 , -0.258183  ,  0.04