# word2vec 做分类
[参考地址](https://tianchi.aliyun.com/competition/entrance/531810/forum)


In [2]:
import pandas as pd
from sklearn.metrics import f1_score

## 1. 载入数据

In [3]:
split_ratio = 0.95

df_all = pd.read_csv('data/train_set.csv', sep='\t')
df_all['label_ft'] = '__label__' + df_all['label'].astype(str)

n_all = len(df_all)
n_train = int(n_all * split_ratio)
n_valid = n_all - n_train
df_train = df_all[['text', 'label_ft']].head(n_train)
df_valid = df_all.tail(n_valid)
df_train.to_csv('data/train.csv', index=None, header=None, sep='\t')
df_valid[['text', 'label_ft']].to_csv('data/valid.csv', index=None, header=None, sep='\t')
print('All:{}, Train:{}, Valid:{}'.format(n_all, n_train, n_valid))
df_train

All:200000, Train:190000, Valid:10000


Unnamed: 0,text,label_ft
0,2967 6758 339 2021 1854 3731 4109 3792 4149 15...,__label__2
1,4464 486 6352 5619 2465 4802 1452 3137 5778 54...,__label__11
2,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...,__label__3
3,7159 948 4866 2109 5520 2490 211 3956 5520 549...,__label__2
4,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...,__label__3
...,...,...
189995,1970 5036 1815 5036 2538 6093 2693 4986 1324 2...,__label__1
189996,5602 6250 6065 264 4876 6639 314 1152 1264 364...,__label__0
189997,669 4293 3099 1940 5917 4128 669 4293 3099 174...,__label__8
189998,2400 6811 2986 2252 1457 4893 62 2376 2490 219...,__label__4


## 2. word2vec 模型

In [10]:
from gensim.models.word2vec import Word2Vec

num_features = 100     # Word vector dimensionality
num_workers = 8       # Number of threads to run in parallel

train_texts = df_train.head(10)['text'].tolist()
train_texts = list(map(lambda x: list(x.split()), train_texts))

model = Word2Vec(train_texts, workers=num_workers, vector_size=num_features)
model.init_sims(replace=True)
#
# # save model
model.save("./word2vec.bin")

  model.init_sims(replace=True)


## 3. 评估、保存模型

In [5]:
now_str = pd.Timestamp.now().strftime('%Y%m%d%H%M%S')
model.save_model('model/fasttext_{}.mdl'.format(now_str))

df_valid = df_valid.copy()
df_valid['label_ft'] = '__label__' + df_valid['label'].astype(str)

val_pred = [model.predict(x)[0][0].split('__')[-1] for x in df_valid['text']]
y_true = df_valid['label'].to_numpy().astype(str)
y_pred = val_pred
score = f1_score(y_true, y_pred, average='macro')
score


NameError: name 'model' is not defined

## 4. 使用模型预测测试集

In [None]:
df_test = pd.read_csv('data/test_a.csv', sep='\t')
test_pred = [model.predict(x)[0][0].split('__')[-1] for x in df_test['text']]
df_submit = pd.DataFrame(test_pred, columns=['label'])
df_submit.to_csv('data/submit_fasttext.csv', index=None)
df_submit.head()