In [2]:
%load_ext tensorboard
import ast
import datetime
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn import preprocessing
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.optimizers import *
#import tensorflow_addons as tfa

warnings.filterwarnings('ignore')

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [9]:
# Print the installed TensorFlow version so the environment used for this run is recorded in the output.
print(tf.__version__)

2.3.0


(None, [])

# 1 读取数据

In [3]:
# Load the labelled training set, the unlabelled test set and the held-back
# test labels, then stack train on top of test so both share one preprocessing pass.
train, test, test_true = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_set.txt', './test_set.txt', './test_true.txt')
)
data = pd.concat([train, test])

In [4]:
# Missing sequences are replaced by the textual placeholder list "[0]" so that
# they parse to a one-element list further down.
data['tagid'] = data['tagid'].fillna('[0]')
data['time'] = data['time'].fillna('[0]')

# Every column except the label: fill remaining gaps with -1 and cast to string.
for col in [x for x in data.columns if x not in ['label']]:
    data[col] = data[col].fillna(-1)
    data[col] = data[col].astype('str')

# Turn the textual lists back into Python lists. ast.literal_eval only accepts
# literals, so unlike eval() it cannot execute code embedded in the input files.
data['tagid'] = data['tagid'].apply(ast.literal_eval)
data['time'] = data['time'].apply(ast.literal_eval)  # str -> list

# Tag ids are used as "words" downstream, so every element is stored as a string.
data['tagid'] = data['tagid'].apply(lambda x: [str(i) for i in x])

# 2 特征处理：处理序列化数据

In [10]:
# --- Configuration for the tag-sequence representation ---------------------
embed_size = 100            # dimensionality of each Word2Vec tag vector
MAX_NB_WORDS = 224254       # vocabulary cap passed to the Keras tokenizer
MAX_SEQUENCE_LENGTH = 128   # every sequence is padded/truncated to this length

# Train Word2Vec on the tag-id sequences of train + test together.
# (Pre-trained alternatives such as ELMo or BERT could be considered here.)
w2v_model = Word2Vec(
    sentences=data['tagid'].tolist(),
    vector_size=embed_size,
    window=5,
    min_count=1,
    epochs=10,
)

# `data` is train stacked on top of test, so a positional split recovers both parts.
X_train = data.iloc[:len(train)]['tagid']
X_test = data.iloc[len(train):]['tagid']

# Build the vocabulary with the tf.keras tokenizer (integer-encodes each tag id),
# then encode and pad both splits to a fixed length.
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts([*X_train, *X_test])
X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQUENCE_LENGTH
)
X_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQUENCE_LENGTH
)
word_index = tokenizer.word_index

# Number of distinct tokens plus one extra row for index 0.
nb_words = len(word_index) + 1
print('Total %s word vectors.' % nb_words)

# Embedding matrix fed to the model: row i holds the Word2Vec vector of the token
# with tokenizer index i; tokens without a vector keep an all-zero row.
embedding_matrix = np.zeros((nb_words, embed_size))
for token, row_idx in word_index.items():
    try:
        token_vector = w2v_model.wv.get_vector(token)
    except KeyError:
        continue
    if token_vector is None:
        continue
    embedding_matrix[row_idx] = token_vector

# Training targets taken from the label column of the training frame.
y_categorical = train['label'].values

Total 204298 word vectors.


# 3 模型：LSTM

In [11]:
def my_model():
    """Build and compile the binary classifier used for every CV fold.

    Architecture: integer token ids -> frozen Embedding initialised from
    ``embedding_matrix`` -> LSTM(128) -> BatchNormalization -> Dropout(0.2)
    -> single sigmoid unit.

    Reads the notebook-level names ``MAX_SEQUENCE_LENGTH``, ``nb_words``,
    ``embed_size`` and ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Embedding layer seeded with the pre-trained vectors; trainable=False keeps them fixed.
    x = Embedding(
        nb_words,
        embed_size,
        input_length=MAX_SEQUENCE_LENGTH,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    x = LSTM(128)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    probability = Dense(1, activation='sigmoid')(x)

    network = Model(inputs=token_ids, outputs=probability)
    network.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# 4 训练与预测

In [None]:
# 5-fold stratified cross-validation. Each fold trains a fresh model, stores its
# predictions for the held-out rows, and contributes 1/n_splits of the test prediction.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
validations = np.zeros([len(train), 1])   # out-of-fold predictions for the training rows
predictions = np.zeros([len(test), 1])    # test predictions averaged over the folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
    print("fold n{}".format(fold_ + 1))
    model = my_model()
    if fold_ == 0:
        model.summary()

    X_tra, X_val = X_train[trn_idx], X_train[val_idx]
    y_tra, y_val = y_categorical[trn_idx], y_categorical[val_idx]

    # NOTE(review): early stopping watches val_accuracy while the checkpoint below
    # uses ModelCheckpoint's default monitor, so the weights reloaded after
    # training are chosen on a different criterion — confirm this is intended.
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
    bst_model_path = "./{}.h10".format(fold_)
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
    model.fit(X_tra, y_tra,
              validation_data=(X_val, y_val),
              epochs=128, batch_size=256, shuffle=True,
              callbacks=[early_stopping, model_checkpoint])
    # Restore the checkpointed weights before predicting.
    model.load_weights(bst_model_path)

    validations[val_idx] = model.predict(X_val)
    predictions += model.predict(X_test) / folds.n_splits

    # `del` only drops the Python reference; clear_session() also releases the
    # global Keras state, which otherwise grows with every model built in the loop.
    del model
    tf.keras.backend.clear_session()

fold n1
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 100)          20429800  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 20,547,689
Trainable params: 117,633
Non-trainable params: 20,430,056
____________________________

# 5 评估

In [None]:
# F1 on the training set, computed from the out-of-fold predictions.
# Rows are labelled by rank rather than by a fixed probability cut-off:
# the lower-ranked half gets 0, everything else gets 1.
train['predict'] = validations.ravel()   # flatten (n, 1) -> (n,) before storing as a column
train['rank'] = train['predict'].rank()
train['p'] = 1
train.loc[train['rank'] <= train.shape[0] * 0.5, 'p'] = 0

bst_f1_tmp = f1_score(train['label'].values, train['p'].values)
print('train f1 score:',bst_f1_tmp)

# F1 on the test set, using the fold-averaged predictions.
# .copy() gives `submit` its own data, so the column assignments below are not
# chained assignments on a slice of `test`.
submit = test[['pid']].copy()
submit['tmp'] = predictions.ravel()
submit.columns = ['user_id', 'tmp']
submit['rank'] = submit['tmp'].rank()
submit['category_id'] = 1
submit.loc[submit['rank'] <= int(submit.shape[0] * 0.5), 'category_id'] = 0

# NOTE(review): compared positionally — assumes test_true rows are in the same
# order as test; confirm against the data files.
bst_f1_test = f1_score(test_true['label'].values, submit['category_id'].values)
print('test f1 score:', bst_f1_test)

#submit[['user_id', 'category_id']].to_csv('sorted_lstm_{}.csv'.format(str(bst_f1_tmp).split('.')[1]), index=False)