In [37]:
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout, Dense,Input,Embedding,Flatten, Bidirectional
from tensorflow.keras.models import Model
import numpy as np
from sklearn import metrics
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [38]:
from textaugment import EDA
columns = ['学号', '性别', '生源地', '总分', '幻觉、妄想症状', '自杀意图', '焦虑指标总分', '抑郁指标总分', '偏执指标总分', '自卑指标总分',
               '敏感指标总分', '社交恐惧指标总分', '躯体化指标总分', '依赖指标总分', '敌对攻击指标总分', '冲动指标总分', '强迫指标总分',
               '网络成瘾指标总分', '自伤行为指标总分', '进食问题指标总分', '睡眠困扰指标总分', '学校适应困难指标总分', '人际关系困扰指标总分',
               '学业压力指标总分', '就业压力指标总分', '恋爱困扰指标总分']
data = pd.read_csv('student_data.csv', encoding='utf-8')
data.drop(columns=columns, inplace=True)

In [39]:
data.loc[data['可能问题'] != 0, '可能问题'] = 1
data['可能问题'].value_counts()

0    106
1     72
Name: 可能问题, dtype: int64

In [40]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=8)


In [41]:
texts = np.array(pd.concat([train_df['text'], test_df['text']]))
len(texts)

178

In [42]:
MAX_NB_WORDS = 5000
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
len(sequences)
# sequences即将每个句子中的每个单词使用词典序表示的形式



178

In [43]:
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 6375 unique tokens


In [44]:
MAX_SEQUENCE_LENGTH = 500
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [45]:
labels = to_categorical(np.array(pd.concat([train_df['可能问题'], test_df['可能问题']])))

In [46]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (178, 500)
Shape of label tensor: (178, 2)


In [47]:
indices = np.arange(data.shape[0])
data = data[indices]
labels = labels[indices]

X_train = data[:len(train_df)]
X_test = data[len(train_df):]
y_train = np.array(train_df['可能问题'])
y_test = np.array(test_df['可能问题'])

print('number of sequences in X_train: ', len(X_train))
print('number of sequences in X_test: ', len(X_test))
print(len(y_train))
print(len(y_test))

number of sequences in X_train:  142
number of sequences in X_test:  36
142
36


In [48]:
# Preparing the Embedding Layer
embedding_index = {}
f = open('./glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embedding_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embedding_index))

Found 400000 word vectors.


In [49]:
# embedding matrix
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, index in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[index]) != len(embedding_vector):
                print("could not broadcast input array from shape",str(len(embedding_matrix[index])),
                                 "into shape",str(len(embedding_vector))," Please make sure your"
                                 " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)

            embedding_matrix[index] = embedding_vector

In [50]:
from tensorflow.keras.layers import Embedding
embedding_layer = Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [51]:
metadata_input = Input(shape=(train_metadata.shape[1],), dtype='float32')
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_model = Dropout(0.5)(embedded_sequences)
l_model = Bidirectional(keras.layers.LSTM(128, recurrent_dropout=0.2))(l_model)
l_model = Dropout(0.5)(l_model)

c_dense = keras.layers.Dense(1024, activation='relu')(l_model)
preds = keras.layers.Dense(4)(c_dense)
preds = keras.layers.Activation('softmax')(preds)

model = Model([sequence_input, metadata_input], preds)
model.compile(loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

In [52]:
# keras.utils.plot_model(model, "LSTM_metadata_model.png", show_shapes=True)

In [53]:
model.fit([X_train, train_metadata], y_train,
          validation_data=([X_test, test_metadata], y_test),
          epochs=45,
          batch_size=100,
          verbose=2)
predicted = model.predict([X_test, test_metadata])

print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))
print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

Train on 142 samples, validate on 36 samples
Epoch 1/20
142/142 - 31s - loss: 1.3311 - accuracy: 0.3239 - val_loss: 0.9499 - val_accuracy: 0.6111
Epoch 2/20
142/142 - 28s - loss: 0.9571 - accuracy: 0.6056 - val_loss: 0.7453 - val_accuracy: 0.6111
Epoch 3/20
142/142 - 28s - loss: 0.7578 - accuracy: 0.5845 - val_loss: 0.6871 - val_accuracy: 0.6111
Epoch 4/20
142/142 - 31s - loss: 0.6959 - accuracy: 0.5915 - val_loss: 0.6890 - val_accuracy: 0.5556
Epoch 5/20
142/142 - 31s - loss: 0.6821 - accuracy: 0.5775 - val_loss: 0.6862 - val_accuracy: 0.5556
Epoch 6/20
142/142 - 30s - loss: 0.6559 - accuracy: 0.6268 - val_loss: 0.6796 - val_accuracy: 0.6111
Epoch 7/20
142/142 - 31s - loss: 0.6943 - accuracy: 0.6127 - val_loss: 0.6866 - val_accuracy: 0.6111
Epoch 8/20
142/142 - 32s - loss: 0.7117 - accuracy: 0.5845 - val_loss: 0.6771 - val_accuracy: 0.5833
Epoch 9/20
142/142 - 33s - loss: 0.6721 - accuracy: 0.5915 - val_loss: 0.6911 - val_accuracy: 0.5556
Epoch 10/20
142/142 - 32s - loss: 0.6595 - acc