In [1]:
import os
os.chdir("D:/DataSet/自然语言")

文本表示方法
- one-hot 将每一个单词使用一个离散的向量表示，将每个字/词编码成一个索引，然后根据索引进行赋值。
- bags of words 每个文档的字/词可以使用其出现次数来进行表示
- N-gram 与Count Vectors类似，不过加入了相邻单词组合为新的单词，并进行计数。
- tf-idf 第一部分是词语频率，第二部分是逆文档频率

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)

In [None]:
#Count Vectors + RidgeClassifier
vectorizer = CountVectorizer(max_features=3000)
train_test = vectorizer.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

In [None]:
# TF-IDF + RidgeClassifier
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

In [None]:
#正则化参数对模型的影响
sample = train_df[0:5000]
n = int(2*len(sample)/3)
tfidf = TfidfVectorizer(ngram_range=(2,3), max_features=2500)
train_test = tfidf.fit_transform(sample['text'])
train_x = train_test[:n]
train_y = sample['label'].values[:n]
test_x = train_test[n:]
test_y = sample['label'].values[n:]

f1 = []
for i in range(10):
    clf = RidgeClassifier(alpha = 0.15*(i+1), solver = 'sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))

plt.plot([0.15*(i+1) for i in range(10)], f1)
plt.xlabel('alpha')
plt.ylabel('f1_score')
plt.show()

In [None]:
#max_features对模型的影响
f1 = []
features = [1000,2000,3000,4000]
for i in range(4):
    tfidf = TfidfVectorizer(ngram_range=(2,3), max_features=features[i])
    train_test = tfidf.fit_transform(sample['text'])
    train_x = train_test[:n]
    train_y = sample['label'].values[:n]
    test_x = train_test[n:]
    test_y = sample['label'].values[n:]
    clf = RidgeClassifier(alpha = 0.1*(i+1), solver = 'sag')
    clf.fit(train_x, train_y)
    val_pred = clf.predict(test_x)
    f1.append(f1_score(test_y, val_pred, average='macro'))

plt.plot(features, f1)
plt.xlabel('max_features')
plt.ylabel('f1_score')
plt.show()

In [None]:
#ngram_range对模型的影响
#n-gram提取词语字符数的下边界和上边界，考虑到中文的用词习惯，ngram_range可以在(1,4)之间选取
f1 = []
tfidf = TfidfVectorizer(ngram_range=(1,1), max_features=2000)
train_test = tfidf.fit_transform(sample['text'])
train_x = train_test[:n]
train_y = sample['label'].values[:n]
test_x = train_test[n:]
test_y = sample['label'].values[n:]
clf = RidgeClassifier(alpha = 0.1*(i+1), solver = 'sag')
clf.fit(train_x, train_y)
val_pred = clf.predict(test_x)
f1.append(f1_score(test_y, val_pred, average='macro'))

tfidf = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
train_test = tfidf.fit_transform(sample['text'])
train_x = train_test[:n]
train_y = sample['label'].values[:n]
test_x = train_test[n:]
test_y = sample['label'].values[n:]
clf = RidgeClassifier(alpha = 0.1*(i+1), solver = 'sag')
clf.fit(train_x, train_y)
val_pred = clf.predict(test_x)
f1.append(f1_score(test_y, val_pred, average='macro'))

tfidf = TfidfVectorizer(ngram_range=(3,3), max_features=2000)
train_test = tfidf.fit_transform(sample['text'])
train_x = train_test[:n]
train_y = sample['label'].values[:n]
test_x = train_test[n:]
test_y = sample['label'].values[n:]
clf = RidgeClassifier(alpha = 0.1*(i+1), solver = 'sag')
clf.fit(train_x, train_y)
val_pred = clf.predict(test_x)
f1.append(f1_score(test_y, val_pred, average='macro'))

tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=2000)
train_test = tfidf.fit_transform(sample['text'])
train_x = train_test[:n]
train_y = sample['label'].values[:n]
test_x = train_test[n:]
test_y = sample['label'].values[n:]
clf = RidgeClassifier(alpha = 0.1*(i+1), solver = 'sag')
clf.fit(train_x, train_y)
val_pred = clf.predict(test_x)
f1.append(f1_score(test_y, val_pred, average='macro'))

In [None]:
from sklearn import linear_model

tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=5000)
train_test = tfidf.fit_transform(train_df['text']) # 词向量 15000*max_features

reg = linear_model.LogisticRegression(penalty='l2', C=1.0,solver='liblinear')
reg.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = reg.predict(train_test[10000:])
print('预测结果中各类新闻数目')
print(pd.Series(val_pred).value_counts())
print('\n F1 score为')
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=5000)
train_test = tfidf.fit_transform(train_df['text']) # 词向量 15000*max_features

reg = linear_model.SGDClassifier(loss="log", penalty='l2', alpha=0.0001,l1_ratio=0.15)
reg.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = reg.predict(train_test[10000:])
print('预测结果中各类新闻数目')
print(pd.Series(val_pred).value_counts())
print('\n F1 score为')
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

In [None]:
from sklearn import svm
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=5000)
train_test = tfidf.fit_transform(train_df['text']) # 词向量 15000*max_features

reg = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',decision_function_shape='ovr')
reg.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = reg.predict(train_test[10000:])
print('预测结果中各类新闻数目')
print(pd.Series(val_pred).value_counts())
print('\n F1 score为')
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))

模型搭建遵循以下步骤：
添加输入层（embedding层）。Embedding层的输入是一批文档，每个文档由一个词汇索引序列构成。例如：[10, 30, 80, 1000] 可能表示“我 昨天 来到 达观数据”这个短文本，其中“我”、“昨天”、“来到”、“达观数据”在词汇表中的索引分别是10、30、80、1000；Embedding层将每个单词映射成EMBEDDING_DIM维的向量。于是：input_shape=(BATCH_SIZE, MAX_WORDS), output_shape=(BATCH_SIZE,MAX_WORDS, EMBEDDING_DIM)；
添加隐含层（投影层）。投影层对一个文档中所有单词的向量进行叠加平均。keras提供的GlobalAveragePooling1D类可以帮我们实现这个功能。这层的input_shape是Embedding层的output_shape，这层的output_shape=( BATCH_SIZE, EMBEDDING_DIM)；
添加输出层（softmax层）。真实的fastText这层是Hierarchical Softmax，因为keras原生并没有支持Hierarchical Softmax，所以这里用Softmax代替。这层指定了CLASS_NUM，对于一篇文档，输出层会产生CLASS_NUM个概率值，分别表示此文档属于当前类的可能性。这层的output_shape=(BATCH_SIZE, CLASS_NUM)
指定损失函数、优化器类型、评价指标，编译模型。损失函数我们设置为categorical_crossentropy，它就是我们上面所说的softmax回归的损失函数；优化器我们设置为SGD，表示随机梯度下降优化器；评价指标选择accuracy，表示精度。

用训练数据feed模型时，你需要：
将文档分好词，构建词汇表。词汇表中每个词用一个整数（索引）来代替，并预留“未知词”索引，假设为0；
对类标进行onehot化。假设我们文本数据总共有3个类别，对应的类标分别是1、2、3，那么这三个类标对应的onehot向量分别是[1, 0,0]、[0, 1, 0]、[0, 0, 1]；
对一批文本，将每个文本转化为词索引序列，每个类标转化为onehot向量。就像之前的例子，“我 昨天 来到 达观数据”可能被转化为[10, 30, 80, 1000]；它属于类别1，它的类标就是[1, 0, 0]。由于我们设置了MAX_WORDS=500，这个短文本向量后面就需要补496个0，即[10, 30, 80, 1000, 0, 0, 0, …, 0]。因此，batch_xs的 维度为( BATCH_SIZE,MAX_WORDS)，batch_ys的维度为（BATCH_SIZE, CLASS_NUM）。

In [None]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.layers import Dense

VOCAB_SIZE = 2000
EMBEDDING_DIM = 100
MAX_WORDS = 500
CLASS_NUM = 5


def build_fastText():
    model = Sequential()
    # 将词汇数VOCAB_SIZE映射为EMBEDDING_DIM维
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_WORDS))
    # 平均文档中所有词的embedding
    model.add(GlobalAveragePooling1D())
    # softmax分类
    model.add(Dense(CLASS_NUM, activation='softmax'))
    # 定义损失函数、优化器、分类度量指标
    model.compile(loss='categorical_crossentropy', optimizer='SGD', metrics=['accuracy'])
    return model


model = build_fastText()
print(model.summary())

主要超参数：
lr: 学习率
dim: 词向量的维度
epoch: 每轮的个数
wordNgrams: 词的n-gram，一般设置为2或3
loss: 损失函数 ns(negative sampling, 负采样)、hs(hierarchical softmax, 分层softmax)、softmax、ova(One-VS-ALL)

In [None]:
import time
import numpy as np
import fasttext
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold

def fasttext_model(nrows, train_num, lr=1.0, wordNgrams=2, minCount=1, epoch=25, loss='hs', dim=100):
    start_time = time.time()

    # 转换为FastText需要的格式
    train_df = pd.read_csv('../input/train_set.csv', 
                           sep='\t', nrows=nrows)

    # shuffle
    train_df = shuffle(train_df, random_state=666)

    train_df['label_ft'] = '__label__' + train_df['label'].astype('str')
    train_df[['text', 'label_ft']].iloc[:train_num].to_csv(
        '../input/fastText_train.csv', index=None, header=None, sep='\t')

    model = fasttext.train_supervised(
        '../input/fastText_train.csv', lr=lr, wordNgrams=wordNgrams,
        verbose=2, minCount=minCount, epoch=epoch, loss=loss, dim=dim)

    train_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[:train_num]['text']]
    print('Train f1_score:', f1_score(train_df['label'].values[:train_num].astype(str), train_pred, average='macro'))
    val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[train_num:]['text']]
    print('Val f1_score:', f1_score(train_df['label'].values[train_num:].astype(str), val_pred, average='macro'))
    train_time = time.time()
    print('Train time: {:.2f}s'.format(train_time - start_time))

     # 预测并保存
    test_df = pd.read_csv('../input/test_a.csv')

    test_pred = [model.predict(x)[0][0].split('__')[-1] for x in test_df['text']]
    test_pred = pd.DataFrame(test_pred, columns=['label'])
    test_pred.to_csv('../input/test_fastText_ridgeclassifier.csv', index=False)
    print('Test predict saved.')
    end_time = time.time()
    print('Predict time:{:.2f}s'.format(end_time - train_time))


if __name__ == '__main__':  
    nrows = 200000
    train_num = int(nrows * 0.7)
    lr=0.01
    wordNgrams=2
    minCount=1
    epoch=25
    loss='hs'

    fasttext_model(nrows, train_num)

In [None]:
def fasttext_kfold_model(nrows, train_num, n_splits, lr=1.0, wordNgrams=2, minCount=1, epoch=25, loss='hs', dim=100):
    start_time = time.time()

    # 转换为FastText需要的格式
    train_df = pd.read_csv('../input/train_set.csv', sep='\t', nrows=nrows)

    # shuffle
    train_df = shuffle(train_df, random_state=666)

    train_df['label_ft'] = '__label__' + train_df['label'].astype('str')

    models = []
    train_scores = []
    val_scores = []

    # K折交叉验证
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=666)
    for train_index, test_index in skf.split(train_df['text'], train_df['label_ft']):
        train_df[['text', 'label_ft']].iloc[train_index].to_csv('../input/fastText_train.csv', index=None, header=None, sep='\t')

        model = fasttext.train_supervised('../input/fastText_train.csv', lr=lr, wordNgrams=wordNgrams, verbose=2, 
                                          minCount=minCount, epoch=epoch, loss=loss)
        models.append(model)

        train_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[train_index]['text']]
        train_score = f1_score(train_df['label'].values[train_index].astype(str), train_pred, average='macro')
        # print('Train length: ', len(train_pred))
        print('Train score: ', train_score)
        train_scores.append(train_score)

        val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[test_index]['text']]
        val_score = f1_score(train_df['label'].values[test_index].astype(str), val_pred, average='macro')
        # print('Val length: ', len(val_pred))
        print('Val score', val_score)
        val_scores.append(val_score)

    print('mean train score: ', np.mean(train_scores))
    print('mean val score: ', np.mean(val_scores))
    train_time = time.time()
    print('Train time: {:.2f}s'.format(train_time - start_time))

    return models

def fasttext_kfold_predict(models, n_splits):

    pred_list = []

    start_time = time.time()
    # 预测并保存
    test_df = pd.read_csv('../input/test_a.csv')

    # 消耗时间较长
    for model in models:
        test_pred = [model.predict(x)[0][0].split('__')[-1] for x in test_df['text']]
        pred_list.append(test_pred)

    test_pred_label = pd.DataFrame(pred_list).T.apply(lambda row: np.argmax(np.bincount([row[i] for i in range(n_splits)])), axis=1)
    test_pred_label.columns='label'

    test_pred_label.to_csv('../input/test_fastText_ridgeclassifier.csv', index=False)
    print('Test predict saved.')
    end_time = time.time()
    print('Predict time:{:.2f}s'.format(end_time - start_time))


if __name__ == '__main__':
    nrows = 200000
    train_num = int(nrows * 0.7)
    n_splits = 3
    lr=0.1
    wordNgrams=2
    minCount=1
    epoch=25
    loss='hs'
    dim=200

    """
    Train score:  0.9635013320936988
    Val score 0.9086640111428032
    Train score:  0.9623510782430645
    Val score 0.9094998879044359
    Train score:  0.9628121318772955
    Val score 0.9096191534698315
    mean train score:  0.9628881807380196
    mean val score:  0.9092610175056901
    Train time: 740.60s
    """   

    models = fasttext_kfold_model(nrows, train_num, n_splits, lr=lr, wordNgrams=wordNgrams, minCount=minCount, epoch=epoch, loss=loss, dim=dim)
    fasttext_kfold_predict(models, n_splits=n_splits)