In [1]:
import pandas as pd
from tensorflow import keras
from textaugment import EDA
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout, Dense,Input,Embedding,Flatten
from tensorflow.keras.models import Model
import numpy as np
from sklearn import metrics
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
from sklearn.preprocessing import FunctionTransformer
columns = ['学号', '性别', '生源地', '总分', '幻觉、妄想症状', '自杀意图', '焦虑指标总分', '抑郁指标总分', '偏执指标总分', '自卑指标总分',
               '敏感指标总分', '社交恐惧指标总分', '躯体化指标总分', '依赖指标总分', '敌对攻击指标总分', '冲动指标总分', '强迫指标总分',
               '网络成瘾指标总分', '自伤行为指标总分', '进食问题指标总分', '睡眠困扰指标总分', '学校适应困难指标总分', '人际关系困扰指标总分',
               '学业压力指标总分', '就业压力指标总分', '恋爱困扰指标总分']
data = pd.read_csv('student_data.csv', encoding='utf-8')
data.drop(columns=columns, inplace=True)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=8)

In [3]:
# augment training data
train_class_0 = data[data['可能问题']==0]
train_class_1 = data[data['可能问题']==1]
train_class_2 = data[data['可能问题']==2]
train_class_3 = data[data['可能问题']==3]

t = EDA(random_state=8)
new_text_1 = [t.random_swap(t.random_insertion(t.random_deletion(t.synonym_replacement(sent), p=0.4))) for sent in train_class_1['text']]
new_text_2 = [t.random_swap(t.random_insertion(t.random_deletion(t.synonym_replacement(sent), p=0.4))) for sent in train_class_2['text']]
new_text_3 = [t.random_swap(t.random_insertion(t.random_deletion(t.synonym_replacement(sent), p=0.4))) for sent in train_class_3['text']]

new_class_1 = train_class_1
new_class_1['text'] = new_text_1
train_1 = pd.concat([train_class_1, new_class_1])

new_class_2 = train_class_2
new_class_2['text'] = new_text_2
train_2 = pd.concat([train_class_2, new_class_2, new_class_2, new_class_2, train_class_2])

new_class_3 = train_class_3
new_class_3['text'] = new_text_3
train_3 = pd.concat([train_class_3, new_class_3, new_class_3, new_class_3, train_class_3])

train = pd.concat([train_class_0, train_1, train_2, train_3])
train['可能问题'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0    106
1     88
2     75
3     65
Name: 可能问题, dtype: int64

In [4]:
texts = np.array(pd.concat([train['text'], test_df['text']]))
len(texts)

370

In [5]:
MAX_NB_WORDS = 5000
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
len(sequences)
# sequences即将每个句子中的每个单词使用词典序表示的形式



370

In [6]:
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 6023 unique tokens


In [7]:
MAX_SEQUENCE_LENGTH = 500
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [8]:
labels = to_categorical(np.array(pd.concat([train['可能问题'], test_df['可能问题']])))

In [9]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (370, 500)
Shape of label tensor: (370, 4)


In [10]:
indices = np.arange(data.shape[0])
data = data[indices]
labels = labels[indices]

X_train = data[:len(train)]
X_test = data[len(train):]
y_train = np.array(train['可能问题'])
y_test = np.array(test_df['可能问题'])

print('number of sequences in X_train: ', len(X_train))
print('number of sequences in X_test: ', len(X_test))
print(len(y_train))
print(len(y_test))

number of sequences in X_train:  334
number of sequences in X_test:  36
334
36


In [11]:
# Preparing the Embedding Layer
embedding_index = {}
f = open('./glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embedding_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embedding_index))

Found 400000 word vectors.


In [12]:
# embedding matrix
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, index in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[index]) != len(embedding_vector):
                print("could not broadcast input array from shape",str(len(embedding_matrix[index])),
                                 "into shape",str(len(embedding_vector))," Please make sure your"
                                 " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)

            embedding_matrix[index] = embedding_vector

In [13]:
# Load embedding matrix into an Embedding Layer
from tensorflow.keras.layers import Embedding
embedding_layer = Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [14]:
convs = []
filter_sizes = []
layer = 3
for fl in range(0,layer):
    filter_sizes.append((fl+2))

In [15]:
dropout = 0.5
node = 128
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# emb = Reshape((MAX_SEQUENCE_LENGTH,10,10), input_shape=(MAX_SEQUENCE_LENGTH,100))(embedded_sequences)

for fsz in filter_sizes:
    l_conv = keras.layers.Conv1D(node, kernel_size=fsz, activation='relu')(embedded_sequences)
    l_pool = keras.layers.MaxPooling1D(3)(l_conv)
    # l_pool = Dropout(0.25)(l_pool)
    convs.append(l_pool)

l_merge = keras.layers.Concatenate(axis=1)(convs)
l_cov1 = keras.layers.Conv1D(node, 5, activation='relu')(l_merge)
l_pool1 = keras.layers.MaxPooling1D(3)(l_cov1)
l_cov2 = keras.layers.Conv1D(node, 5, activation='relu')(l_pool1)
l_pool2 = keras.layers.MaxPooling1D(20)(l_cov2)
l_cov2 = Dropout(dropout)(l_pool2)
l_flat = Flatten()(l_cov2)
# l_dense = Dense(1024, activation='relu')(l_flat)
# l_dense = Dropout(dropout)(l_dense)
l_dense = Dense(512, activation='relu')(l_flat)
l_dense = Dropout(dropout)(l_dense)
l_dense = Dense(128, activation='relu')(l_dense)
l_dense = Dropout(dropout)(l_dense)

preds = Dense(4, activation='softmax')(l_dense)
model = Model(sequence_input, preds)
model.compile(loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

In [16]:
# keras.utils.plot_model(model, "CNN_augmented_model.png", show_shapes=True)

In [17]:
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          epochs=20,
          batch_size=100,
          verbose=2)
predicted = model.predict(X_test)
predicted = np.argmax(predicted, axis=1)

print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))
print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

Train on 334 samples, validate on 36 samples
Epoch 1/20
334/334 - 2s - loss: 1.5905 - accuracy: 0.3054 - val_loss: 1.0785 - val_accuracy: 0.6111
Epoch 2/20
334/334 - 2s - loss: 1.3171 - accuracy: 0.3802 - val_loss: 0.9901 - val_accuracy: 0.6111
Epoch 3/20
334/334 - 2s - loss: 1.2918 - accuracy: 0.4251 - val_loss: 0.9744 - val_accuracy: 0.6944
Epoch 4/20
334/334 - 2s - loss: 1.1997 - accuracy: 0.4641 - val_loss: 0.9196 - val_accuracy: 0.6944
Epoch 5/20
334/334 - 2s - loss: 1.1964 - accuracy: 0.4790 - val_loss: 0.9302 - val_accuracy: 0.6944
Epoch 6/20
334/334 - 2s - loss: 1.1124 - accuracy: 0.5329 - val_loss: 0.9408 - val_accuracy: 0.6944
Epoch 7/20
334/334 - 2s - loss: 1.1405 - accuracy: 0.5090 - val_loss: 0.9599 - val_accuracy: 0.6111
Epoch 8/20
334/334 - 2s - loss: 1.0869 - accuracy: 0.5269 - val_loss: 0.9137 - val_accuracy: 0.6389
Epoch 9/20
334/334 - 2s - loss: 1.0589 - accuracy: 0.5479 - val_loss: 0.8894 - val_accuracy: 0.6111
Epoch 10/20
334/334 - 2s - loss: 1.0270 - accuracy: 0.5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
