In [None]:
import re
import os
import math
import nltk
import time
import jieba
import torch
import pickle
import zipfile
import warnings
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
import torch.nn.functional as F
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import cos_sim 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import BertTokenizer, BertForQuestionAnswering, BertModel
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator

warnings.filterwarnings("ignore")

In [None]:
# Review dimension name -> integer class id
# (location, facilities, service, cleanliness, other).
label_mapping = {
    name: class_id
    for class_id, name in enumerate(['位置', '设施', '服务', '卫生', '其他'])
}

# 训练分类模型

In [None]:
# Load the labelled Beijing review spreadsheet.
pd_beijing_man = pd.read_excel(r'../dataset/xiecheng/beijing_1211.xlsx')

# Series.map yields NaN for any 'dimension' value that is not a key of label_mapping.
pd_beijing_man['label'] = pd_beijing_man['dimension'].map(label_mapping)

# Keep only rows whose label is known and is not class 4.
# NaN must be excluded explicitly: `NaN != 4` is True, so unmapped rows would
# otherwise slip through and turn the label column into floats.
has_label = pd_beijing_man['label'].notna()
not_other = pd_beijing_man['label'] != 4
pd_beijing_man_clean = pd_beijing_man[has_label & not_other].copy()

# Integer class ids (int64) are what the classification loss expects.
pd_beijing_man_clean['label'] = pd_beijing_man_clean['label'].astype('int64')
combined_df = pd_beijing_man_clean

In [None]:
# Features are the raw review texts, targets are the integer dimension labels.
X = combined_df["reviews"].values
y = combined_df["label"].values
# Fixed seed so the evaluation cell can rebuild the identical validation split.
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

# The checkpoint name must be a string; unquoted it is parsed as the arithmetic
# expression `bert - base - chinese` and raises NameError.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# NOTE(review): this masked-LM model is replaced by the sequence classifier
# loaded in the training cell before it is ever used.
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

def encode_comments(comments):
    """Tokenize a collection of review texts into one batch of PyTorch tensors.

    Relies on the module-level ``tokenizer``. Padding and truncation are
    enabled with ``max_length=128``.
    """
    texts = list(comments)
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128,
    )
    return encoded


# Encode both splits once up front.
train_encodings, val_encodings = (
    encode_comments(split) for split in (train_texts, val_texts)
)

class HotelReviewDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: Mapping of field name -> indexable batch (tensor or nested
            list), where index ``i`` selects the i-th example.
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a 'labels' entry."""
        # torch.as_tensor accepts both tensors and plain sequences. Calling
        # torch.tensor() on an existing tensor copy-constructs it and emits a
        # UserWarning.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item
train_dataset = HotelReviewDataset(train_encodings, train_labels)
# Pair the validation encodings with the validation labels. Passing the
# encodings as labels makes len() and label lookup operate on the tokenizer output.
val_dataset = HotelReviewDataset(val_encodings, val_labels)

In [None]:
# Sequence classifier with 4 outputs, matching label ids 0-3 (rows labelled 4
# are filtered out when the training frame is built).
# The checkpoint name must be a quoted string.
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=4)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

In [None]:
# Write the current model and tokenizer to the same directory; the evaluation
# cell reloads both from this path with from_pretrained().
model.save_pretrained('../dataset/hotel_review_model_1211161')
tokenizer.save_pretrained('../dataset/hotel_review_model_1211161')

In [None]:
# Accuracy evaluation
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the train/validation split with the same column and seed used for
# training. The classifier was trained on "label", so the ground truth must be
# read from "label" as well.
X = combined_df["reviews"].values
y = combined_df["label"].values
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('../dataset/hotel_review_model_1211161')
tokenizer = BertTokenizer.from_pretrained('../dataset/hotel_review_model_1211161')

# Inference in mini-batches so the whole validation set is never pushed
# through the model in a single forward pass.
eval_batch_size = 64
val_text_list = val_texts.tolist()
batch_preds = []
model.eval()  # evaluation mode
with torch.no_grad():  # no gradients needed for inference
    for start in range(0, len(val_text_list), eval_batch_size):
        # Encode one slice of the validation texts
        inputs = tokenizer(val_text_list[start:start + eval_batch_size], padding=True, truncation=True, return_tensors="pt", max_length=128)
        outputs = model(**inputs)
        logits = outputs.logits
        batch_preds.append(torch.argmax(logits, dim=1))
val_preds = torch.cat(batch_preds).numpy()

# Accuracy and per-class report
accuracy = accuracy_score(val_labels, val_preds)
print(f"验证集准确率: {accuracy:.2f}")
print("分类报告:")
print(classification_report(val_labels, val_preds))

# Confusion matrix. Passing the full list of class ids keeps the matrix shape
# aligned with the tick labels even if a class is missing from the split.
class_ids = list(range(model.config.num_labels))
tick_labels = [model.config.id2label[i] for i in class_ids]
conf_matrix = confusion_matrix(val_labels, val_preds, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

# 训练语义匹配模型

In [None]:
# Features are the raw review texts, targets are the integer dimension labels.
X = combined_df["reviews"].values
y = combined_df["label"].values
# Fixed seed so the evaluation cell can rebuild the identical validation split.
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

# The checkpoint id must be a string; unquoted it is parsed as
# `chinese - macbert - base` and raises NameError.
# NOTE(review): "hfl/chinese-macbert-base" is the Hugging Face Hub id; swap in a
# local directory path if the weights are stored offline.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
# NOTE(review): this masked-LM model is replaced by the sequence classifier
# loaded in the training cell before it is ever used.
model = AutoModelForMaskedLM.from_pretrained("hfl/chinese-macbert-base")

def encode_comments(comments):
    """Tokenize a collection of review texts into one batch of PyTorch tensors.

    Relies on the module-level ``tokenizer``. Padding and truncation are
    enabled with ``max_length=128``.
    """
    texts = list(comments)
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128,
    )
    return encoded


# Encode both splits once up front.
train_encodings, val_encodings = (
    encode_comments(split) for split in (train_texts, val_texts)
)

class HotelReviewDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: Mapping of field name -> indexable batch (tensor or nested
            list), where index ``i`` selects the i-th example.
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a 'labels' entry."""
        # torch.as_tensor accepts both tensors and plain sequences. Calling
        # torch.tensor() on an existing tensor copy-constructs it and emits a
        # UserWarning.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item
train_dataset = HotelReviewDataset(train_encodings, train_labels)
# Pair the validation encodings with the validation labels. Passing the
# encodings as labels makes len() and label lookup operate on the tokenizer output.
val_dataset = HotelReviewDataset(val_encodings, val_labels)

In [None]:
# Sequence classifier with 4 outputs, matching label ids 0-3 (rows labelled 4
# are filtered out when the training frame is built).
# The checkpoint id must be a quoted string.
# NOTE(review): "hfl/chinese-macbert-base" is the Hugging Face Hub id; swap in a
# local directory path if the weights are stored offline.
model = BertForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=4)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

In [None]:
# Write the current model and tokenizer to the same directory; the evaluation
# cell reloads both from this path with from_pretrained().
model.save_pretrained('../dataset/mac_1213')
tokenizer.save_pretrained('../dataset/mac_1213')

In [None]:
# Accuracy evaluation
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the train/validation split with the same column and seed used for
# training. The classifier was trained on "label", so the ground truth must be
# read from "label" as well.
X = combined_df["reviews"].values
y = combined_df["label"].values
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('../dataset/mac_1213')
tokenizer = BertTokenizer.from_pretrained('../dataset/mac_1213')

# Inference in mini-batches so the whole validation set is never pushed
# through the model in a single forward pass.
eval_batch_size = 64
val_text_list = val_texts.tolist()
batch_preds = []
model.eval()  # evaluation mode
with torch.no_grad():  # no gradients needed for inference
    for start in range(0, len(val_text_list), eval_batch_size):
        # Encode one slice of the validation texts
        inputs = tokenizer(val_text_list[start:start + eval_batch_size], padding=True, truncation=True, return_tensors="pt", max_length=128)
        outputs = model(**inputs)
        logits = outputs.logits
        batch_preds.append(torch.argmax(logits, dim=1))
val_preds = torch.cat(batch_preds).numpy()

# Accuracy and per-class report
accuracy = accuracy_score(val_labels, val_preds)
print(f"验证集准确率: {accuracy:.2f}")
print("分类报告:")
print(classification_report(val_labels, val_preds))

# Confusion matrix. Passing the full list of class ids keeps the matrix shape
# aligned with the tick labels even if a class is missing from the split.
class_ids = list(range(model.config.num_labels))
tick_labels = [model.config.id2label[i] for i in class_ids]
conf_matrix = confusion_matrix(val_labels, val_preds, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

# 情感分析模型

In [None]:
# Load the sentiment spreadsheet; from here on combined_df refers to this
# dataset and no longer to the dimension-labelled reviews.
combined_df = pd.read_excel(r'../dataset/xiecheng/sentiment1000_1214.xlsx')
X = combined_df["reviews"].values
# NOTE(review): "att" is presumably the 0/1 sentiment label (the classifier
# below uses num_labels=2) — confirm against the spreadsheet.
y = combined_df["att"].values
# Fixed seed so the evaluation cell can rebuild the identical validation split.
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

# The checkpoint id must be a string; unquoted it is parsed as
# `chinese - macbert - base` and raises NameError.
# NOTE(review): "hfl/chinese-macbert-base" is the Hugging Face Hub id; swap in a
# local directory path if the weights are stored offline.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
# NOTE(review): this masked-LM model is replaced by the sequence classifier
# loaded in the training cell before it is ever used.
model = AutoModelForMaskedLM.from_pretrained("hfl/chinese-macbert-base")

def encode_comments(comments):
    """Tokenize a collection of review texts into one batch of PyTorch tensors.

    Relies on the module-level ``tokenizer``. Padding and truncation are
    enabled with ``max_length=128``.
    """
    texts = list(comments)
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128,
    )
    return encoded


# Encode both splits once up front.
train_encodings, val_encodings = (
    encode_comments(split) for split in (train_texts, val_texts)
)

class HotelReviewDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: Mapping of field name -> indexable batch (tensor or nested
            list), where index ``i`` selects the i-th example.
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a 'labels' entry."""
        # torch.as_tensor accepts both tensors and plain sequences. Calling
        # torch.tensor() on an existing tensor copy-constructs it and emits a
        # UserWarning.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item
        
train_dataset = HotelReviewDataset(train_encodings, train_labels)
# Pair the validation encodings with the validation labels. Passing the
# encodings as labels makes len() and label lookup operate on the tokenizer output.
val_dataset = HotelReviewDataset(val_encodings, val_labels)

In [None]:
# Two-class sequence classifier for the sentiment labels.
# The checkpoint id must be a quoted string.
# NOTE(review): "hfl/chinese-macbert-base" is the Hugging Face Hub id; swap in a
# local directory path if the weights are stored offline.
model = BertForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=2)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

In [None]:
# Write the current model and tokenizer to the same directory; the evaluation
# cell reloads both from this path with from_pretrained().
model.save_pretrained('../dataset/mac_1214')
tokenizer.save_pretrained('../dataset/mac_1214')

In [None]:
# Accuracy evaluation
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the train/validation split with the same column ("att") and seed
# used when this sentiment model was trained.
X = combined_df["reviews"].values
y = combined_df["att"].values
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('../dataset/mac_1214')
tokenizer = BertTokenizer.from_pretrained('../dataset/mac_1214')

# Inference in mini-batches so the whole validation set is never pushed
# through the model in a single forward pass.
eval_batch_size = 64
val_text_list = val_texts.tolist()
batch_preds = []
model.eval()  # evaluation mode
with torch.no_grad():  # no gradients needed for inference
    for start in range(0, len(val_text_list), eval_batch_size):
        # Encode one slice of the validation texts
        inputs = tokenizer(val_text_list[start:start + eval_batch_size], padding=True, truncation=True, return_tensors="pt", max_length=128)
        outputs = model(**inputs)
        logits = outputs.logits
        batch_preds.append(torch.argmax(logits, dim=1))
val_preds = torch.cat(batch_preds).numpy()

# Accuracy and per-class report
accuracy = accuracy_score(val_labels, val_preds)
print(f"验证集准确率: {accuracy:.2f}")
print("分类报告:")
print(classification_report(val_labels, val_preds))

# Confusion matrix. Passing the full list of class ids keeps the matrix shape
# aligned with the tick labels even if a class is missing from the split.
class_ids = list(range(model.config.num_labels))
tick_labels = [model.config.id2label[i] for i in class_ids]
conf_matrix = confusion_matrix(val_labels, val_preds, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()