In [17]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense, ReLU, Embedding, BatchNormalization, Concatenate, Conv1D, GlobalMaxPooling1D, Dropout, Input
from tensorflow.keras.models import Sequential, Model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [18]:
# Load the train / test / validation splits. Every file is parsed as
# ';'-separated text and the two column names are supplied explicitly.
df_train, df_test, df_val = [
    pd.read_csv(path, sep=';', names=['sentence', 'label'])
    for path in ('emotion_data/train.txt', 'emotion_data/test.txt', 'emotion_data/val.txt')
]

In [19]:
# Balance a split by undersampling every label down to the rarest label's size
def dataframe_processing(df, excluded_labels=('love', 'surprise'), random_state=42):
    """Drop excluded labels, then undersample so all remaining labels are equally frequent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentence' column and a 'label' column.
    excluded_labels : iterable of str, optional
        Labels whose rows are removed before balancing.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for each label.

    Returns
    -------
    (pandas.Series, pandas.Series)
        The 'sentence' and 'label' columns of the balanced frame. Rows keep
        their original index values and are grouped label by label, most
        frequent label first.
    """
    df_filtered = df[~df['label'].isin(list(excluded_labels))]
    label_counts = df_filtered['label'].value_counts()

    # Nothing left to balance (empty input or every row excluded): return the
    # empty columns instead of failing on a column lookup further down.
    if label_counts.empty:
        return df_filtered['sentence'], df_filtered['label']

    min_count = int(label_counts.min())

    # Sample each label once and concatenate in a single call rather than
    # growing a DataFrame inside the loop.
    df_balanced = pd.concat([
        df_filtered[df_filtered['label'] == label].sample(n=min_count, random_state=random_state)
        for label in label_counts.index
    ])

    return df_balanced['sentence'], df_balanced['label']

# Balance each split independently (train, then test, then validation).
(X_train, y_train), (X_test, y_test), (X_val, y_val) = [
    dataframe_processing(split) for split in (df_train, df_test, df_val)
]

In [20]:
# Encode the string labels as integer class ids. The encoder is fitted on the
# training labels and then reused unchanged for the test and validation labels.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded, y_val_encoded = (
    encoder.transform(labels) for labels in (y_test, y_val)
)

In [21]:
# Tokenise the sentences, pad them to a fixed length and one-hot encode the labels
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)  # the vocabulary is fitted on the training sentences only

# Fix the one-hot width to the number of classes the encoder knows about.
# Without it, to_categorical sizes each matrix from the largest id present in
# that particular split, so the splits could end up with different widths.
num_classes = len(encoder.classes_)


def texts_to_padded(texts, maxlen=50):
    """Convert raw sentences to integer sequences padded/truncated to `maxlen`.

    Uses the module-level `tokenizer`, so it must be called after the
    tokenizer has been fitted.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)


X_train_seq = texts_to_padded(X_train)
y_train_cat = to_categorical(y_train_encoded, num_classes=num_classes)

X_test_seq = texts_to_padded(X_test)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

X_val_seq = texts_to_padded(X_val)
y_val_cat = to_categorical(y_val_encoded, num_classes=num_classes)

In [22]:
# Build the model
max_words = 10000      # embedding vocabulary size; same value as Tokenizer(num_words=...)
max_len = 50           # input sequence length; same value as pad_sequences(maxlen=...)
embedding_dim = 32
num_classes = len(encoder.classes_)  # one softmax unit per encoded label


def build_branch():
    """Return one text branch.

    Layer order: Embedding -> Conv1D (relu) -> BatchNormalization -> ReLU
    -> Dropout -> GlobalMaxPooling1D.
    """
    return Sequential([
        # Embedding layer
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        # Convolution layer
        Conv1D(64, 3, padding='same', activation='relu'),
        # Batch normalisation layer
        BatchNormalization(),
        # ReLU activation
        ReLU(),
        # Dropout to reduce overfitting
        Dropout(0.5),
        # Global max pooling over the sequence axis
        GlobalMaxPooling1D(),
    ])


# Two branches with the same architecture but separate weights; they are
# created in this order so the automatically generated layer names stay stable.
branch1 = build_branch()
branch2 = build_branch()

concatenated = Concatenate()([branch1.output, branch2.output])

hid_layer = Dense(128, activation='relu')(concatenated)
dropout = Dropout(0.3)(hid_layer)
output_layer = Dense(num_classes, activation='softmax')(dropout)

model = Model(inputs=[branch1.input, branch2.input], outputs=output_layer)

model.compile(optimizer='adamax',
              loss='categorical_crossentropy',
              metrics=['accuracy', Precision(), Recall()])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 embedding_input (InputLaye  [(None, 50)]                 0         []                            
 r)                                                                                               
                                                                                                  
 embedding_1_input (InputLa  [(None, 50)]                 0         []                            
 yer)                                                                                             
                                                                                                  
 embedding (Embedding)       (None, 50, 32)               320000    ['embedding_input[0][0]']     
                                                                                              

In [23]:
# Train the model. The same padded sequences are passed to both input
# branches, for the training data and for the validation data alike.
batch_size = 256
epochs = 25
history = model.fit(
    x=[X_train_seq, X_train_seq],
    y=y_train_cat,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val_seq, X_val_seq], y_val_cat),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [24]:
# Evaluate on the balanced test split. The four values are unpacked in the
# order: loss, then the metrics as listed in model.compile().
loss, accuracy, precision, recall = model.evaluate([X_test_seq, X_test_seq], y_test_cat)
print(f'Loss: {round(loss, 2)}, Accuracy: {round(accuracy, 2)}, Precision: {round(precision, 2)}, Recall: {round(recall, 2)}')

Loss: 0.2, Accuracy: 0.95, Precision: 0.96, Recall: 0.93


In [26]:
# Try the model on a few sentences.
# NOTE: df_test is the unfiltered frame, so these ten sentences may include
# 'love' / 'surprise' examples even though those labels were removed before training.
sample_sentences = df_test['sentence'].head(10)

# Same preprocessing as for training: tokenise, then pad to max_len.
sample_sequences = tokenizer.texts_to_sequences(sample_sentences)
sample_padded = pad_sequences(sample_sequences, maxlen=max_len)
predictions = model.predict([sample_padded, sample_padded])

# Highest-probability class id for each sentence (argmax over the class axis).
predicted_label = np.argmax(predictions, axis=1)

# Map the class ids back to their string labels.
encoder.inverse_transform(predicted_label)



array(['sadness', 'sadness', 'sadness', 'joy', 'sadness', 'fear', 'anger',
       'joy', 'joy', 'anger'], dtype=object)