In [1]:
from src.module.execution_db import DB
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
import sys
import jieba
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# pip install scikit-learn tensorflow-gpu==2.10.1 pandas jieba
# tensorflow更高版本不支持windows下GPU训练

# 调用模型

In [None]:
# Inference demo: load the saved classifier, preprocess one review the same
# way as the training corpus, and print the predicted sentiment.
model = tf.keras.models.load_model('sentiment_model.h5')
model.summary()  # show the model architecture

# These must match the values the model was trained with.
VOCAB_SIZE = 10000  # upper bound on the vocabulary size
MAX_LENGTH = 2000  # padded / truncated sequence length

# The word -> index mapping has to be the one fitted on the training corpus.
# A tokenizer fitted on placeholder texts maps every real word to <OOV>,
# which makes the prediction meaningless. Prefer a tokenizer persisted as
# JSON next to the model; JSON is used rather than pickle so that loading the
# file cannot execute arbitrary code.
try:
    with open('tokenizer.json', encoding='utf-8') as tokenizer_file:
        tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_file.read())
except FileNotFoundError:
    # Keep the cell runnable when no tokenizer has been exported yet, but say
    # loudly that the result below cannot be trusted.
    print("警告: 未找到 tokenizer.json，改用占位 tokenizer，预测结果不可信", file=sys.stderr)
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
    tokenizer.fit_on_texts(["1", "2", "3", "4", "5"])

text = '''最近央视一套和上海新闻综合频道都在热播《诺尔曼·白求恩》，当初决定看这部电视，一方面是是因为演白求恩的外籍演员还挺帅，一方面是因为看到介绍说将会展现他来到中国前的鲜为人知的前半生，勾起了一点我的好奇心。'''

# 1. Clean and segment: strip punctuation and newlines, then let jieba split
#    the text into space-separated words so the Keras tokenizer can split it.
text = re.sub(r'[^\w\s]', '', text).replace('\n', '').replace('  ', ' ')
text = " ".join(jieba.cut(text))

# 2. Convert to an index sequence and pad/truncate to the model input length.
seq = tokenizer.texts_to_sequences([text])
if not seq or len(seq[0]) == 0:
    raise ValueError("输入文本未生成有效的序列，请检查分词和 tokenizer 初始化")
padded = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post', truncating='post')

# 3. Predict. The class index is (star rank - 1): 0-1 are the two lowest
#    ranks, 2 is the middle rank, 3-4 are the two highest ranks.
pred = model.predict(padded)
label_idx = np.argmax(pred, axis=1)[0]
if label_idx < 2:
    print("预测结果: 负面情感")
elif label_idx == 2:
    print("预测结果: 中性情感")
else:
    print("预测结果: 正面情感")

# Full probability vector, then the probability of the winning class.
print(f"预测结果: {pred[0]}")
print(f"预测概率: {pred[0][label_idx]}")



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2000, 16)          160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 5)                 85        
                                                                 
Total params: 160,357
Trainable params: 160,357
Non-trainable params: 0
_________________________________________________________________
预测结果: 正面
预测结果: [1.7012000e-03 4.5050649e-16 2.4162831e-11 9.9679273e-01 1.5060488e-03]
预测概率: 0.996792733669281


# 清洗数据 

In [None]:
# sql = f'SELECT `comment_ID`, `comment_text` FROM `douban`.`douban_long_comment` WHERE clean_comment_text is null'
# flag, data = DB().select(sql)
# for i in data:
#     text = re.sub(r'[^\w\s]', '', i[1]).replace('\n', '').replace('  ', '')
#     print(text)
#     words = jieba.cut(text)
#     text = " ".join(words)
#     sql = f"UPDATE `douban`.`douban_long_comment` SET  `clean_comment_text` = '{text}' WHERE `comment_ID` = {i[0]} "
#     DB().update(sql)
#     print("-"* 100)
# Report the TensorFlow build and the physical devices it can see.
# Each probe is called right before its line is printed, so the output order
# is the same as printing the values one statement at a time.
environment_probes = [
    ("Built with CUDA:", tf.test.is_built_with_cuda),
    ("GPU Available:", lambda: tf.config.list_physical_devices('GPU')),
    ("Eager execution enabled:", tf.executing_eagerly),
    ("Version:", lambda: tf.__version__),
]
for probe_label, probe in environment_probes:
    print(probe_label, probe())

# Name of the default GPU device (printed without a label).
print(tf.test.gpu_device_name())


# 创建训练数据集

In [3]:
# Build the training / test tensors from rated long comments.
#
# Produces: df, train_texts/test_texts, tokenizer, train_padded/test_padded,
# train_labels/test_labels (0-4) and their one-hot encodings.

# Star label -> numeric rank. Any label not listed here falls back to rank 1.
STAR_TO_RANK = {'力荐': 5, '推荐': 4, '还行': 3, '较差': 2}

sql = 'SELECT `star`, `clean_comment_text` FROM `douban`.`douban_long_comment`  WHERE star is not null  limit 3000'
flag, data = DB().select(sql)

# Collect all rows first and build the DataFrame once; growing a DataFrame
# with pd.concat inside the loop copies the whole frame on every iteration.
# Rows whose cleaned text is still NULL are skipped because the Keras
# tokenizer cannot process None.
records = [
    {'rank': STAR_TO_RANK.get(row[0], 1), 'text': row[1]}
    for row in data
    if row[1] is not None
]
df = pd.DataFrame(records, columns=['rank', 'text'])
if df.empty:
    raise ValueError("no rated comments with cleaned text were returned by the query")

# Split into training and test sets.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].values,
    df['rank'].values,
    test_size=0.2,  # 20% of the rows go to the test set
    random_state=42
)

# Build the vocabulary on the training texts only, then map texts to
# integer sequences.
VOCAB_SIZE = 10000  # upper bound on the vocabulary size
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Fixed sequence length: longer texts are truncated at the end, shorter ones
# are padded at the end.
MAX_LENGTH = 2000
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

# Shift ranks 1-5 to class indices 0-4, then one-hot encode them for the
# categorical cross-entropy loss.
num_classes = 5
train_labels = train_labels - 1  # [1,5] -> [0,4]
test_labels = test_labels - 1  # same shift for the test labels

train_labels_onehot = tf.keras.utils.to_categorical(train_labels, num_classes=num_classes)
test_labels_onehot = tf.keras.utils.to_categorical(test_labels, num_classes=num_classes)

# 构建与训练模型

In [None]:
# Build, train and persist the sentiment classifier.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Architecture: embedding -> average over the sequence -> small dense head
# with a softmax over the rating classes.
embedding_dim = 16
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE,
                    output_dim=embedding_dim,
                    input_length=MAX_LENGTH))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

model.summary()

# Train the model.
# NOTE(review): validation_data is the held-out test split, so the reported
# validation metrics are also the only test metrics.
EPOCHS = 500  # number of training epochs
history = model.fit(
    train_padded,
    train_labels_onehot,
    epochs=EPOCHS,
    validation_data=(test_padded, test_labels_onehot),
    verbose=1
)

# Persist the model together with the tokenizer it was trained with; without
# the fitted tokenizer the word -> index mapping cannot be reproduced at
# inference time. JSON is used instead of pickle so the file is safe to load.
model.save('sentiment_model.h5')
with open('tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

# No sys.exit() here: inside a notebook it only raises SystemExit in the
# kernel instead of ending the cell cleanly.