In [3]:
import collections
import math
import os
import pickle
import re
import time
import warnings
import zipfile

import jieba
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import cos_sim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import BertTokenizer, BertForQuestionAnswering, BertModel
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator

warnings.filterwarnings("ignore")

In [4]:
def _read_comment_workbook(xlsx_path):
    """Read one city's comment workbook and cast every cell to ``str``."""
    # NOTE(review): on pandas versions where ``astype(str)`` stringifies NaN,
    # empty cells become the literal text 'nan' and are no longer seen as
    # missing by ``dropna`` -- confirm this is intended.
    return pd.read_excel(xlsx_path).astype(str)


pd_beijing = _read_comment_workbook(r'../dataset/xiecheng_beijing_comment.xlsx')
pd_guangzhou = _read_comment_workbook(r'../dataset/xiecheng_guangzhou_comment.xlsx')
pd_shanghai = _read_comment_workbook(r'../dataset/xiecheng_shanghai_comment.xlsx')

In [None]:
# Report how many rows were loaded for each city.
for template, frame in (
    ('评论数目（北京）：%d', pd_beijing),
    ('评论数目（广州）：%d', pd_guangzhou),
    ('评论数目（上海）：%d', pd_shanghai),
):
    print(template % len(frame))

In [None]:
# Normalise the column names: the Chinese headers and the four unnamed columns
# (6-9) get short ASCII names. The same mapping is applied to all three cities.
COLUMN_RENAMES = {
    '酒店名字': 'name',
    '酒店分类': 'class',
    '酒店位置': 'location',
    '住客点评': 'total',
    'Unnamed: 6': 'loc',
    'Unnamed: 7': 'fac',
    'Unnamed: 8': 'ser',
    'Unnamed: 9': 'hyg',
}
pd_beijing = pd_beijing.rename(columns=COLUMN_RENAMES)
pd_guangzhou = pd_guangzhou.rename(columns=COLUMN_RENAMES)
pd_shanghai = pd_shanghai.rename(columns=COLUMN_RENAMES)

In [None]:
def _drop_header_and_blank_reviews(df):
    """Drop the row labelled 0 and every row without review text, then lower-case.

    The frames are cast to ``str`` when they are loaded, so a missing cell can
    arrive here as the literal text ``'nan'`` rather than as NaN; a plain
    ``dropna()`` then removes nothing. Rows are therefore treated as empty when
    their ``review`` value is NaN, blank, or the text ``'nan'``.

    Args:
        df: frame with a string ``review`` column and a row labelled ``0``.

    Returns:
        A new frame; ``df`` itself is not modified.

    Raises:
        KeyError: if ``df`` has no row labelled ``0`` or no ``review`` column.
    """
    # presumably row 0 is a secondary header row -- TODO confirm against the workbook
    body = df.drop(index=0)
    text = body['review']
    is_blank = text.isna() | text.str.strip().isin(['', 'nan'])
    body = body.loc[~is_blank].copy()
    # Convert all characters to lower case.
    body['review'] = body['review'].str.lower()
    return body


pd_beijing = _drop_header_and_blank_reviews(pd_beijing)
pd_guangzhou = _drop_header_and_blank_reviews(pd_guangzhou)
pd_shanghai = _drop_header_and_blank_reviews(pd_shanghai)

In [None]:
# Strip emoji and other symbols from a review.
_NON_TEXT_CHARS = re.compile(r'[^\w\s，。（）]')


def remove_emojis(text):
    """Delete every character that is not a word character, whitespace or one of ，。（）.

    Besides emoji this also removes all other punctuation and symbols.
    """
    return _NON_TEXT_CHARS.sub('', text)

# Clean the review text of every city in place.
for frame in (pd_beijing, pd_guangzhou, pd_shanghai):
    frame['review'] = frame['review'].apply(remove_emojis)

In [None]:
# Replace special characters (punctuation and newlines) in the text with spaces.
# A translation table handles every character in a single pass per frame and is
# always literal, so the result does not depend on the default of the ``regex``
# argument of ``Series.str.replace`` (which differs between pandas versions).
_SPECIAL_CHARS = '“”‘’：；/？！·~、【】!"#$%&*+,-./:;<=>?@[\\]^_\'{|};~\n'
_SPECIAL_TO_SPACE = str.maketrans({ch: ' ' for ch in _SPECIAL_CHARS})
for frame in (pd_beijing, pd_guangzhou, pd_shanghai):
    frame['review'] = frame['review'].str.translate(_SPECIAL_TO_SPACE)

In [None]:
# Split a review cell into its individual numbered comments.
text_column = 'review'


def split_text(text):
    """Split ``text`` at every full-width '（' and drop the numeric markers.

    A '（' closes the piece collected so far and opens a "marker" state. While in
    that state digits are skipped, and the first '）' is skipped and ends the
    state. Every other character is kept. The scan starts in the marker state,
    so digits that appear before the first '）' are dropped as well.

    Returns:
        list[str]: the stripped pieces, in order. A trailing piece that is empty
        after stripping is omitted.
    """
    pieces = []
    buffer = []
    in_marker = True
    for ch in text:
        if ch == '（':
            if buffer:
                pieces.append(''.join(buffer).strip())
                buffer = []
            in_marker = True
            continue
        if in_marker:
            if ch.isdigit():
                continue
            if ch == '）':
                in_marker = False
                continue
        buffer.append(ch)
    tail = ''.join(buffer).strip()
    if tail != '':
        pieces.append(tail)
    return pieces

In [None]:
# Add one empty column per sentence slot (15 slots: Sentence_1 ... Sentence_15).
new_columns = [f'Sentence_{slot}' for slot in range(1, 16)]
for col in new_columns:
    for frame in (pd_beijing, pd_guangzhou, pd_shanghai):
        frame[col] = ''


def split_text_into_columns(df):
    """Fill the ``Sentence_*`` columns of ``df`` from its review text, in place.

    Each review is split with ``split_text``; piece ``i`` is written to the
    ``i``-th sentence column. Pieces beyond the number of sentence columns are
    ignored, and unused columns keep their existing value.

    Returns:
        The same ``df`` object that was passed in.
    """
    for row_label, review in df[text_column].items():
        # zip stops at the shorter sequence, so extra pieces are discarded.
        for column, sentence in zip(new_columns, split_text(review)):
            df.at[row_label, column] = sentence
    return df


pd_beijing = split_text_into_columns(pd_beijing)
pd_guangzhou = split_text_into_columns(pd_guangzhou)
pd_shanghai = split_text_into_columns(pd_shanghai)

In [None]:
# Put every individual comment on its own row.
def process_reviews(df):
    """Unpivot the ``Sentence_1`` ... ``Sentence_15`` columns into long form.

    The ten descriptive columns are repeated on every output row, the source
    column name goes to ``review_col`` and its text to ``reviews``. Rows whose
    ``reviews`` value is the empty string are removed.

    Returns:
        A frame with the ten descriptive columns plus ``review_col`` and
        ``reviews``; the index of the melted frame is kept, so it has gaps.
    """
    kept_columns = ["id", "name", "class", "location", "review", "total", "loc", "fac", "ser", "hyg"]
    sentence_columns = [f"Sentence_{slot}" for slot in range(1, 16)]
    long_form = pd.melt(
        df,
        id_vars=kept_columns,
        value_vars=sentence_columns,
        var_name="review_col",
        value_name="reviews",
    )
    has_text = long_form['reviews'] != ""
    return long_form[has_text]
# One long-form frame per city, in the order Beijing, Guangzhou, Shanghai.
pd_beijing_reviews, pd_guangzhou_reviews, pd_shanghai_reviews = (
    process_reviews(frame) for frame in (pd_beijing, pd_guangzhou, pd_shanghai)
)

In [None]:
# Cut every comment into its individual sentences.
def split_reviews_by_sentence(df):
    """Return one row per non-empty '。'-delimited sentence of ``df['reviews']``.

    All other columns are repeated for each sentence. Sentences are stripped of
    surrounding whitespace and empty ones are removed. Rows are numbered 0..n-1
    before the empty sentences are filtered out, so the returned index has gaps.

    Built on ``str.split`` + ``explode`` rather than a Python loop over rows, so
    an empty input frame returns an empty frame with the same columns instead
    of raising ``KeyError``.

    Args:
        df: frame with a string column named ``reviews``.
    """
    exploded = (
        df.assign(reviews=df['reviews'].str.split('。'))
        .explode('reviews')
        .reset_index(drop=True)
    )
    exploded['reviews'] = exploded['reviews'].str.strip()
    return exploded[exploded['reviews'] != ""]
    
# Sentence-level frames, one per city.
(
    pd_beijing_reviews_expanded,
    pd_guangzhou_reviews_expanded,
    pd_shanghai_reviews_expanded,
) = (
    split_reviews_by_sentence(frame)
    for frame in (pd_beijing_reviews, pd_guangzhou_reviews, pd_shanghai_reviews)
)

In [None]:
def _keep_mid_length_sentences(reviews_df):
    """Add a ``text_length`` column and keep sentences of 5 to 30 characters.

    The column is written onto ``reviews_df`` itself; the filtered frame is
    returned.
    """
    reviews_df['text_length'] = reviews_df['reviews'].map(len)
    # Lengths are integers, so "<= 30 and > 4" is the inclusive range 5..30.
    return reviews_df[reviews_df['text_length'].between(5, 30)]


pd_beijing_reviews_expanded = _keep_mid_length_sentences(pd_beijing_reviews_expanded)
pd_guangzhou_reviews_expanded = _keep_mid_length_sentences(pd_guangzhou_reviews_expanded)
pd_shanghai_reviews_expanded = _keep_mid_length_sentences(pd_shanghai_reviews_expanded)

# 分类标签

In [None]:
# Dimensions, their numeric labels, and the keywords that identify each dimension.
# Fix: a comma was missing between "写字台" and "花潵", which silently fused them
# into the single keyword "写字台花潵" so that neither word could ever match.
label_mapping = {'位置': 0, '设施': 1, '服务': 2, '卫生': 3, '其他': 4}
dimension_keywords = {
    "位置": ["位置", "交通", "地铁", "机场", "周边", "近", "距离", "景点", "出行", "便利", "远", "步行", "火车", "机场", "周围", "边上", "地段", 
           "肯德基", "全聚德", "麦当劳", "商城", "商务区", "对面", "庆丰", "旁边", "颐和园", "紧邻", "超市", "走路", "胡同", "小区", "号线", "直行", 
           "博物馆", "景点", "天安门", "走着", "购物", "公交", "偏僻", "街", "西站", "南站", "北站", "打车", "海景", "逛街", "尖沙咀", "中环",
           "烟花", "王府井", "国贸", "街角", "星巴克", "巴士", "自行车", "单车"],
    "设施": ["设施", "设备", "齐全", "装修", "条件", "冰箱", "墙", "床", "窗", "健身房", "灯", "淋", "平方", "空调", "暖气", "电", "隔音","安静", 
          "舒适", "噪音", "热水", "车位", "隔壁", "安检", "门禁", "浴", "沙发", "泳", "车位", "地漏", "旧", "洗漱", "阳台", "布局", "内部", "插座", 
           "家具", "停车", "室内", "精致", "停水", "加湿器", "智能", "咖啡机", "不热", "洗澡", "纸巾", "音箱", "视野", "信号","吹风筒", "香皂", "梳子", "写字台",
          "花潵", "花洒", "灭火", "安全", "房卡", "硬件", "很大", "较大", "很小", "气派", "奢华", "温泉", "较小", "日常用品", "网络", "无线网","卡顿", 
          "衣柜", "水壶", "空间", "凉快", "院子", "牙膏", "护发素", "透气", "闷", "干湿", "宽敞", "暖和", "齐备", "马桶", "暖风", "走廊", "物件","太小", 
          "明亮", "wifi", "网线", "枕头", "挺大", "采光", "平米", "洗衣", "烘干", "有点小", "地毯", "地板", "破损", "温度", "略小", "温馨", "排气扇", "usb",
          "房间小", "贵族", "套房", "观景台", "牙刷", "吹风机", "插头", "被子", "毯子", "有点小", "大小", "桌", "椅", "厨房", "单间", "光线", "维修", "简易"],
    "服务": ["服务", "态度", "员工", "前台", "餐", "热情", "耐心", "勤务", "退房", "行李", "和气", "管理", "规范", "接待", "办理", "客气", 
          "免费", "升级", "上菜", "工作人员", "死板", "指导", "早点", "姑娘", "小哥", "小伙", "大姐", "赠饮", "赠送", "经理", "认真", "帮忙", "礼貌",
          "接送", "早饭", "保安"],
    "卫生": ["卫生", "干净", "洁", "异味", "灰尘", "臭", "味道很大", "烟味", "浴巾", "脏", "锈", "层灰", "毛发", "异味", "厚灰","污", "垃圾", 
           "血", "味道大", "汗味", "毛巾", "霉味", "难闻", "清扫", "下水道", "熏晕", "打扫", "粪便", "潮湿", "拖鞋", "垢"]}


def classify_comment(comment):
    """Return the first dimension that has a keyword occurring in ``comment``.

    Dimensions are tried in the insertion order of ``dimension_keywords``
    (位置, 设施, 服务, 卫生), so an earlier dimension wins when several match.
    Matching is plain substring containment.

    Returns:
        str: the dimension name, or "其他" when no keyword matches.
    """
    for dimension, keywords in dimension_keywords.items():
        if any(keyword in comment for keyword in keywords):
            return dimension
    return "其他"

# Tag every sentence with its dimension name and numeric label; the columns are
# written onto the existing frames.
for frame in (
    pd_beijing_reviews_expanded,
    pd_guangzhou_reviews_expanded,
    pd_shanghai_reviews_expanded,
):
    frame['dimension'] = frame['reviews'].apply(classify_comment)
    frame['label'] = frame['dimension'].map(label_mapping)

# The "_clean" frames exclude the catch-all class '其他' (label 4).
pd_beijing_reviews_expanded_clean = pd_beijing_reviews_expanded.loc[
    pd_beijing_reviews_expanded['label'].ne(4)
]
pd_guangzhou_reviews_expanded_clean = pd_guangzhou_reviews_expanded.loc[
    pd_guangzhou_reviews_expanded['label'].ne(4)
]
pd_shanghai_reviews_expanded_clean = pd_shanghai_reviews_expanded.loc[
    pd_shanghai_reviews_expanded['label'].ne(4)
]

In [None]:
# Stack the labelled sentences of all three cities and export them.
# Fixes: the list was missing a comma after the first item (a SyntaxError), and
# it referenced `pd_chongqing_reviews_expanded_clean`, which is never defined in
# this notebook. The Guangzhou frame is the third city that is loaded and
# labelled, so it is used here.
# NOTE(review): confirm Guangzhou (not a separate Chongqing dataset) is intended.
com_df = pd.concat([
    pd_beijing_reviews_expanded_clean,
    pd_guangzhou_reviews_expanded_clean,
    pd_shanghai_reviews_expanded_clean,
])
com_df.to_excel('../dataset/xiecheng/all_1215.xlsx')
# NOTE(review): this exports the Beijing frame that still contains label 4
# ('其他'), not the "_clean" one -- confirm this is deliberate.
pd_beijing_reviews_expanded.to_excel('../dataset/xiecheng/beijing_1215.xlsx')

In [None]:
def _rows_with_labels(frame, first, second):
    """Return the rows of ``frame`` whose ``label`` equals ``first`` or ``second``."""
    return frame[frame['label'].isin([first, second])]


# One subset for every pair of the four dimension labels (0-3).
com_df_01 = _rows_with_labels(com_df, 0, 1)
com_df_02 = _rows_with_labels(com_df, 0, 2)
com_df_03 = _rows_with_labels(com_df, 0, 3)
com_df_12 = _rows_with_labels(com_df, 1, 2)
com_df_13 = _rows_with_labels(com_df, 1, 3)
com_df_23 = _rows_with_labels(com_df, 2, 3)

for subset, xlsx_path in (
    (com_df_01, '../dataset/xiecheng/all_01_1215.xlsx'),
    (com_df_02, '../dataset/xiecheng/all_02_1215.xlsx'),
    (com_df_03, '../dataset/xiecheng/all_03_1215.xlsx'),
    (com_df_12, '../dataset/xiecheng/all_12_1215.xlsx'),
    (com_df_13, '../dataset/xiecheng/all_13_1215.xlsx'),
    (com_df_23, '../dataset/xiecheng/all_23_1215.xlsx'),
):
    subset.to_excel(xlsx_path)

# 提前计算文本嵌入embeddings

In [None]:
# Sentence embeddings for the label-0 / label-1 subset.
# This cell also defines `model`, `tokenizer`, `batch_size` and
# `get_batch_embeddings`. `AutoModel`, `DataLoader`, `tqdm`, `torch` and
# `pickle` come from the import cell at the top of the notebook.
comments = com_df_01["reviews"].tolist()
model = AutoModel.from_pretrained('../dataset/mac_1213')
tokenizer = AutoTokenizer.from_pretrained('../dataset/mac_1213')


def get_batch_embeddings(sentences, tokenizer, model, batch_size=32):
    """Embed ``sentences`` in batches by mean-pooling the last hidden state.

    The mean is taken over real tokens only (weighted by the tokenizer's
    attention mask). Averaging over every position would mix padding tokens
    into the result, so a sentence's embedding would depend on which other
    sentences share its batch.

    Args:
        sentences: list of strings.
        tokenizer: callable returning a dict of tensors for a list of strings.
        model: model whose output exposes ``last_hidden_state``; inputs are
            moved to ``model.device``.
        batch_size: number of sentences per forward pass.

    Returns:
        A tensor with one row per sentence, on the model's device.
    """
    embeddings_list = []
    data_loader = DataLoader(sentences, batch_size=batch_size, shuffle=False)
    for batch in tqdm(data_loader, desc="Processing batches"):
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        inputs = {key: val.to(model.device) for key, val in inputs.items()}
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            # No mask available: fall back to the plain mean over all positions.
            embeddings = hidden.mean(dim=1)
        else:
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            embeddings = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        embeddings_list.append(embeddings)
    return torch.cat(embeddings_list)


# Compute the embeddings batch by batch.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32  # adjust to the available memory
comment_embeddings = get_batch_embeddings(comments, tokenizer, model, batch_size=batch_size)

# Save the embeddings (moved to CPU so the pickle does not depend on a GPU).
with open("./comment_embeddings_mac01.pkl", "wb") as f:
    pickle.dump(comment_embeddings.cpu(), f)

In [None]:
# Sentence embeddings for the label-0 / label-2 subset.
# Reuses `get_batch_embeddings`, `model`, `tokenizer` and `batch_size` from the
# cell that embeds `com_df_01`, instead of redefining the function and
# reloading the same checkpoint.
comments = com_df_02["reviews"].tolist()
comment_embeddings = get_batch_embeddings(comments, tokenizer, model, batch_size=batch_size)

# Save the embeddings.
with open("./comment_embeddings_mac02.pkl", "wb") as f:
    pickle.dump(comment_embeddings.cpu(), f)

In [None]:
# Sentence embeddings for the label-0 / label-3 subset.
# Reuses `get_batch_embeddings`, `model`, `tokenizer` and `batch_size` from the
# cell that embeds `com_df_01`, instead of redefining the function and
# reloading the same checkpoint.
comments = com_df_03["reviews"].tolist()
comment_embeddings = get_batch_embeddings(comments, tokenizer, model, batch_size=batch_size)

# Save the embeddings.
with open("./comment_embeddings_mac03.pkl", "wb") as f:
    pickle.dump(comment_embeddings.cpu(), f)

In [None]:
# Sentence embeddings for the label-1 / label-2 subset.
# Reuses `get_batch_embeddings`, `model`, `tokenizer` and `batch_size` from the
# cell that embeds `com_df_01`, instead of redefining the function and
# reloading the same checkpoint.
comments = com_df_12["reviews"].tolist()
comment_embeddings = get_batch_embeddings(comments, tokenizer, model, batch_size=batch_size)

# Save the embeddings.
with open("./comment_embeddings_mac12.pkl", "wb") as f:
    pickle.dump(comment_embeddings.cpu(), f)

In [None]:
# Sentence embeddings for the label-1 / label-3 subset.
# Reuses `get_batch_embeddings`, `model`, `tokenizer` and `batch_size` from the
# cell that embeds `com_df_01`, instead of redefining the function and
# reloading the same checkpoint.
comments = com_df_13["reviews"].tolist()
comment_embeddings = get_batch_embeddings(comments, tokenizer, model, batch_size=batch_size)

# Save the embeddings.
with open("./comment_embeddings_mac13.pkl", "wb") as f:
    pickle.dump(comment_embeddings.cpu(), f)

In [None]:
# Sentence embeddings for the label-2 / label-3 subset.
# Reuses `get_batch_embeddings`, `model`, `tokenizer` and `batch_size` from the
# cell that embeds `com_df_01`, instead of redefining the function and
# reloading the same checkpoint.
comments = com_df_23["reviews"].tolist()
comment_embeddings = get_batch_embeddings(comments, tokenizer, model, batch_size=batch_size)

# Save the embeddings.
with open("./comment_embeddings_mac23.pkl", "wb") as f:
    pickle.dump(comment_embeddings.cpu(), f)

# 情感标签

In [None]:
# Keyword lexicons for rule-based sentiment labelling.
# Fixes: a comma was missing between "不劲" and "小飞虫" (the two were fused into
# one entry that could never match), and repeated entries such as "不好" were
# counted once per repetition; the lists are de-duplicated below.
positive_words = ["干净", "整洁", "清爽", "卫生良好", "没有异味", "没有味道", "表扬", "非常好", "没任何异味", "虽然不大", "虽然小", "虽然没电梯", "没有噪音", 
                 "但也", "不远", "不吵", "不费劲", "没有很重", "不差", "不算差", "没什么缺点", "没啥缺点", "没缺点", "灿烂", "无破损", "不错", "齐全", "出差", 
                  "没有什么异味", "没异味", "不坏", "不会感觉潮湿", "不小", "没有味道", "没味道", "无异味", "挺大的", '够大', "床还是挺舒服的", "感觉很好", 
                 "离南站不太远"]
negative_words = ["脏", "不干净", "不卫生", "恶心", "乱", "不建议", "不喜欢", "讨厌", "不推荐", "不太喜欢", "异味重", "不好", "狭小", 
                  "没希望", "别扭", "瑕疵", "不如", "烟味", "暗", "有味道", "该扔了", "一股味道", "没吃", "不评论", "凑活", "太贵", "尴尬", "不是特别好",
                 "不是很好", "不舒服", "不在", "地面存水", "不好", "没有", "不方便", "不相称", "弱", "不隔音", "郁闷", "生硬", "不明澈", "不是很方便", 
                 "没声", "年头", "远", "吵", "不畅", "陈旧", "不好吃", "不要吃", "太差", "费劲", "太小", "破", "不给力", "感冒","太少", "很贵", 
                 "太大", "摸黑", "只有", "不开", "不亮", "无人", "没法", "陈旧", "缺点", "用过", "坏的", "异味严重", "异味重", "没洗", "特别差", "很差", 
                 "失望", "烂", "不足", "压抑", "糟糕", "遗憾", "屏蔽", "没自助", "种类很少", "挺少", "不得不", "不近", "慢", "不太干净", "才", "污渍", 
                 "花屏", "没反应", "不是", "忽略", "醉了", "非常差", "嘈杂", "才能", "小贵", "老旧", "不是太干净", "隔音差", "鼾", "霉", "潮湿", "不劲",
                 "小飞虫", "不太方便", "不洁净", "还要", "不好", "较差", "坏", "雪花", "不清楚", "赔偿", "血", "才出来", "没早餐", "不能", "烟味", 
                 "简陋", "汗味", "漏水", "窄", "形同虚设", "闷闷", "过期", "味道大", "堪忧", "一般", "隔音不好", "有点差", "该换", "一塌糊涂", "不是太喜欢",
                 "不符合", "不上档次", "没敢", "不太干净", "不太卫生", "很贵", "不太好找", "不是太友好", "不咋样", "比较小", "住不了", "投诉", "才有",
                 "不起作用", "难道", "跟不上", "卡顿", "人多", "有点挤", "太旧", "不敢碰", "难闻", "喉咙", "呛", "很堵", "没睡好", "马虎", "不够好", "不够干净",
                 "一层灰", "一层绣", "木讷", "被解除", "旧漆", "松动", "不提供", "很旧了", "斑点", "种类比较少", "不冷不热", "这么差", "种类较少", "味道很大", 
                 "无法", "噪声", "凑合", "除了床", "推销", "反味", "超冷", "不够热", "没见", "差太多", "不提供", "看看", "品种少", "差评", "恶劣", "恶语",
                 "漫骂", "有些旧"]

# Remove repeated entries (keeping first-seen order) so each cue counts once.
positive_words = list(dict.fromkeys(positive_words))
negative_words = list(dict.fromkeys(negative_words))


def detect_sentiment(review):
    """Label ``review`` by comparing how many lexicon words it contains.

    A lexicon word counts when it occurs as a substring of ``review``;
    overlapping words (e.g. "没有" and "没有异味") are each counted.

    Returns:
        str: "-1" when more negative than positive words match, otherwise "1"
        (ties, including no match at all, are labelled positive).
    """
    negative_count = sum(word in review for word in negative_words)
    positive_count = sum(word in review for word in positive_words)
    return "-1" if negative_count > positive_count else "1"

# Label each Beijing sentence and export the first 1000 rows.
# NOTE(review): `combined_df` is the same object as
# `pd_beijing_reviews_expanded_clean`, so the 'att' column is added to that
# frame too; despite the name, only Beijing data is used -- confirm.
combined_df = pd_beijing_reviews_expanded_clean
combined_df['att'] = combined_df['reviews'].map(detect_sentiment).astype(int)
combined_df_1000 = combined_df.iloc[:1000]
combined_df_1000.to_excel('../dataset/xiecheng/sentiment1000_1214.xlsx')