In [1]:
import pandas as pd 
import numpy as np 
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from textblob import TextBlob
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

In [2]:
# 1️⃣ read data 
df = pd.read_csv(r"D:\li412\Documents\new\Sukhumvit Road\Sukhumvit_cleaned_data.xls")

In [3]:
# 2️⃣ data pre-processing 
def remove_emojis(text):                       # remove emojis 
    emoji_pattern = re.compile("[" 
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F700-\U0001F77F"  
        u"\U0001F780-\U0001F7FF"  
        u"\U0001F800-\U0001F8FF"  
        u"\U0001F900-\U0001F9FF"  
        u"\U0001FA00-\U0001FA6F"  
        u"\U0001FA70-\U0001FAFF"  
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def clean_text(text):                       
    text = re.sub(r"<.*?>", "", str(text))  # remove HTML 
    text = remove_emojis(text)              # remove emojis
    text = re.sub(r"[^a-zA-Z\u4e00-\u9fa5\s]", "", text)  
    return text.strip()



df["用户评论"] = df["用户评论"].apply(clean_text)

In [4]:
# 3️⃣ segmented words into a single string with spaces between them
df["用户评论_分词"] = df["用户评论"].apply(lambda x: " ".join(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\li412\AppData\Local\Temp\jieba.cache
Loading model cost 1.019 seconds.
Prefix dict has been built successfully.


In [5]:
df.head()

Unnamed: 0,酒店名称,用户评分,评分标准,用户评论,用户评论_分词
0,Cross Vibe Bangkok Sukhumvit,4.7,/ 5,每次到曼谷都住这里主打就是一个位置好服务好\n欧美客人会比较多所以前台和服务人员英文都很好\...,每次 到 曼谷 都 住 这里 主打 就是 一个 位置 好 服务 好 \n 欧美 客人 会 比...
1,Cross Vibe Bangkok Sukhumvit,5.0,/ 5,优点是酒店很新地点好因为是公寓改的酒店所以设施齐全连电磁炉排烟机都有WiFi好淋浴水流很大早...,优点 是 酒店 很 新 地点 好 因为 是 公寓 改 的 酒店 所以 设施 齐全 连 电磁炉...
2,Cross Vibe Bangkok Sukhumvit,5.0,/ 5,酒店位置安静走分钟到地铁干净且舒适推荐,酒店 位置 安静 走 分钟 到 地铁 干净 且 舒适 推荐
3,Cross Vibe Bangkok Sukhumvit,5.0,/ 5,設施多有泳池桌球足球机洒店好近Bts和超市有好多食店唯一小按摩店真的很好方便Lau Lai ...,設施 多 有 泳池 桌球 足球 机洒店 好 近 Bts 和 超市 有 好多 食店 唯一 小 ...
4,Cross Vibe Bangkok Sukhumvit,4.7,/ 5,房间不错虽然不大采光差些挺干净但洗澡间地有点滑差一点摔一跤酒店位置也挺好的有莲花超市里面的芒...,房间 不错 虽然 不 大 采光 差些 挺 干净 但 洗澡间 地 有点 滑 差一点 摔 一跤 ...


In [6]:
# read stop words decoment
def load_stopwords(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        stopwords = set(f.read().splitlines())  # 读取文件，并去掉换行符
    return stopwords

# load_stop_words -( stop words list from Harbin Institute of Technology）
stopwords = load_stopwords(r"D:\App download\哈工大停用词表.txt")  

# Need to manually add hotel industry related stop words
custom_stopwords = ["酒店", "一个", "下次", "不错", "非常", "很","分钟","很多", "很棒", "没有", "这家", "这", "有点", "这次", "这里",
                   "喜欢", "地方","先择","on","一家", "nut","不是", "位于","再次", "住宿", "体验","感觉", "提供", "比较","到达","来说", 
                   "晚上","真的", "要求", "这是", "问题","不会","人员","即可", "可能", "唯一","很大", "推荐","整体","楼下","满意",
                   "选择","适合", "需要","一点","东西","优越","使用","出行","地点", "小时","总体","曼谷","特别","选择","on","On", "ON","已经","所有",
                   "办理","周围","声音","每天","距离","NUT", "Nut","NUt","NuT", "不好", "泰国", "免费","时间", "步行", "缺点", "退房", "隔壁",
                   "回来","客人", "泰铢","一定","一次"]
# merge stop word 
stopwords.update(custom_stopwords)

# handle the stop words 
def remove_stopwords(text):
    words = text.split() # Split the text into a list of words
    words = [word for word in words if word not in stopwords]  # emove stopwords
    return " ".join(words)  # Join the words back into a sentence

# 应用停用词处理
df["用户评论_分词_去停用词"] = df["用户评论_分词"].apply(remove_stopwords)

# 预览数据
df[["用户评论_分词", "用户评论_分词_去停用词"]].head()


Unnamed: 0,用户评论_分词,用户评论_分词_去停用词
0,每次 到 曼谷 都 住 这里 主打 就是 一个 位置 好 服务 好 \n 欧美 客人 会 比...,每次 都 住 主打 位置 好 服务 好 欧美 会 前台 服务 英文 都 好 地理位置 门口 ...
1,优点 是 酒店 很 新 地点 好 因为 是 公寓 改 的 酒店 所以 设施 齐全 连 电磁炉...,优点 新 好 公寓 改 设施 齐全 电磁炉 排烟 机都 WiFi 好 淋浴 水流 早餐 丰富...
2,酒店 位置 安静 走 分钟 到 地铁 干净 且 舒适 推荐,位置 安静 走 地铁 干净 舒适
3,設施 多 有 泳池 桌球 足球 机洒店 好 近 Bts 和 超市 有 好多 食店 唯一 小 ...,設施 泳池 桌球 足球 机洒店 好 近 Bts 超市 好多 食店 小 按摩 店 好 方便 L...
4,房间 不错 虽然 不 大 采光 差些 挺 干净 但 洗澡间 地 有点 滑 差一点 摔 一跤 ...,房间 不 大 采光 差些 挺 干净 洗澡间 滑 差一点 摔 一跤 位置 挺 好 莲花 超市 ...


In [7]:
df.shape

(7039, 6)

In [8]:
# 4️⃣ calculate TF-IDF
# Extracting Comments
comments = df["用户评论_分词_去停用词"].dropna().tolist()


# creat Tfidf vectorizer 
tfidf = TfidfVectorizer(
    max_features=60,  # extract the top 60 important words 
    min_df=2,  #  Ignore words that appear in fewer than 2 comments
    max_df=0.6   # Ignore words that appear in more than 60% of comments
)

#Applying TF-IDF to Comments
tfidf_matrix = tfidf.fit_transform(comments) 


feature_names = tfidf.get_feature_names_out() # Extracting Feature Names
tfidf_scores = tfidf.idf_   # Extracting  TF-IDF Scores

# creat dataframe save result 
tfidf_df = pd.DataFrame({"词语": feature_names, "TF-IDF": tfidf_scores})
tfidf_df

Unnamed: 0,词语,TF-IDF
0,bts,2.749484
1,乐于助人,4.551096
2,交通,3.932437
3,价格,3.932437
4,位置,2.476617
5,便利,4.231742
6,便利店,4.329934
7,便宜,4.88955
8,健身房,4.74137
9,入住,3.20007


In [9]:
# train word2Vec 
# we have to translae data fron datafram to list that Word2Vec needed 
tokenized_sentences = df["用户评论_分词_去停用词"].dropna().apply(lambda x: x.split()).tolist()

# *train word2Vec
word2vec_model = Word2Vec(
    sentences=tokenized_sentences,  # 
    vector_size=100,  # 词向量维度
    window=5,  # 上下文窗口大小
    min_count=1,  # 词出现的最小次数
    workers=4  # 线程数
)

# save model
word2vec_model.save("word2vec_hotel_reviews.model")
print("Word2Vec 模型训练完成，并已保存！")


Word2Vec 模型训练完成，并已保存！


In [10]:
# **步骤 5：验证训练好的 Word2Vec**
# 获取某个词的词向量
try:
    print("“便利” 的词向量：", word2vec_model.wv["便利"])
except KeyError:
    print("“便利” 不在词汇表中，请检查数据！")

# 查看最相似的词
try:
    print("与 '便利' 最相似的词：", word2vec_model.wv.most_similar("便利"))
except KeyError:
    print("“便利” 不在词汇表中，请检查数据！")

# **步骤 6：加载模型（如果以后要使用）**
# 你可以在未来加载训练好的模型：
# word2vec_model = Word2Vec.load("word2vec_hotel_reviews.model")

“便利” 的词向量： [-0.41642076  0.3088078   0.5689611  -0.0676811   0.07279406 -1.4474684
  0.3070778   2.1259055  -0.18320823 -1.1506426  -0.3455078  -1.2946401
 -0.5753335   0.41754678  0.3398576  -0.22092918  0.19716395 -1.0780122
  0.24749373 -1.4418643   0.21001472  0.33221987  0.38034624 -0.55673504
  0.44131327  0.14101137 -0.5343605  -0.5717232  -0.6781834   0.14576449
  0.7827899  -0.19478011  0.36760315 -0.91631544 -0.4136805   0.69436485
 -0.1825869  -0.45316708 -0.22158363 -1.598254    0.419015   -0.5173295
 -0.49635306 -0.01996767  0.4123392  -0.66721165 -0.58400345 -0.09426606
  0.2541263   0.5188786   0.1767735  -0.55676335 -0.37355143  0.08846386
 -0.16963321  0.18669441  0.7410413  -0.44867754 -0.5221871   0.49475953
 -0.2373867  -0.24329022  0.5141055   0.04724699 -1.0442342   0.8692941
 -0.00438797  0.12912424 -0.42880407  0.4329978   0.6028354   0.33762762
  0.95926124 -0.36745945  0.6696728   0.17601791  0.05919747 -0.3721061
 -0.714835    0.04274123 -0.45451674 -0.267385

In [11]:

# 1️⃣ Extract 60 High-Frequency Words from TF-IDF
feature_names = tfidf.get_feature_names_out()  


# 2️⃣ Ensure Words Exist in Word2Vec & Extract Vectors
valid_words_with_vectors = [word for word in feature_names if word in word2vec_model.wv]
word_vectors = np.array([word2vec_model.wv[word] for word in valid_words_with_vectors])

pca = PCA(n_components=30)  # Reduce Dimensionality with PCA
word_vectors_reduced = pca.fit_transform(word_vectors)

# 3️⃣  Hierarchical Clustering
n_clusters = 10  # clusters numver 
hierarchical_model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
labels = hierarchical_model.fit_predict(word_vectors_reduced)

# 4️⃣  Set category name 
cluster_names = [
    "Location", " Cost-effectiveness", "Facilities & Environment",
    "Room", "Service & Staff", "Check-in Process", "Dining", "Hygiene","Guest Impressions","Convenience"
]

# 5️⃣  create  DataFrame，mapping category name 
clustered_words = pd.DataFrame({
    "valid_words": valid_words_with_vectors,
    "labels": labels
})
clustered_words["cluster names"] = clustered_words["labels"].map(lambda x: cluster_names[x])


# 7️⃣  **手动修正错误分类**
manual_corrections = {
    "bts": "Location",
    "交通": "Location",
    "价格": "Cost-effectiveness",
    "位置": "Location",
    "便宜": "Cost-effectiveness",
    "健身房": "Facilities & Environment",
    "入住" : "Check-in Process", 
    "卫生" : "Hygiene",
    "友好" :"Guest Impressions",
    "员工" : "Service & Staff",
    "地理位置" :"Location",
    "地铁站":"Location",
    "安静" : "Guest Impressions",
    "宽敞" : "Guest Impressions",
    "工作人员" :  "Service & Staff",
    "市中心" :  "Location",
    "干净":"Guest Impressions",
    "很近":"Location",
    "房间": "Room",
    "按摩": "Service & Staff",
    "整洁": "Guest Impressions",
    "早餐": "Dining",
    "服务":"Service & Staff",
    "浴室":"Room",
    "淋浴":"Facilities & Environment",
    "游泳池": "Facilities & Environment",
    "漂亮":"Guest Impressions",
    "热情":"Guest Impressions",
    "牙刷": "Facilities & Environment",
    "舒服":"Guest Impressions",
    "舒适": "Guest Impressions",
    "行李": "Convenience",
    "设施":"Facilities & Environment",
    "超级":"Guest Impressions",
    "车站": "Location",
    "酒吧": "Facilities & Environment",
    "附近":"Location",
    "靠近":"Location",
    "预订": "Check-in Process",
    "食物": "Dining",
    "餐厅": "Facilities & Environment",
    "饭店":"Facilities & Environment",
    "超市":"Facilities & Environment",
    "便利":"Guest Impressions",
}

clustered_words["cluster names"] = clustered_words["valid_words"].map(lambda x: manual_corrections[x] if x in manual_corrections else clustered_words.loc[clustered_words["valid_words"] == x, "cluster names"].values[0])


clustered_words

Unnamed: 0,valid_words,labels,cluster names
0,bts,1,Location
1,乐于助人,9,Convenience
2,交通,1,Location
3,价格,3,Cost-effectiveness
4,位置,1,Location
5,便利,1,Guest Impressions
6,便利店,1,Cost-effectiveness
7,便宜,5,Cost-effectiveness
8,健身房,2,Facilities & Environment
9,入住,3,Check-in Process


In [12]:
grouped_dict = clustered_words.groupby("cluster names")["valid_words"].apply(list).to_dict()
grouped_dict

{' Cost-effectiveness': ['便利店', '商场'],
 'Check-in Process': ['入住', '购物中心', '预订'],
 'Convenience': ['乐于助人', '行李'],
 'Cost-effectiveness': ['价格', '便宜'],
 'Dining': ['早餐', '食物'],
 'Facilities & Environment': ['健身房',
  '性价比',
  '淋浴',
  '清洁',
  '游泳池',
  '牙刷',
  '设施',
  '超市',
  '酒吧',
  '餐厅',
  '饭店'],
 'Guest Impressions': ['便利',
  '友好',
  '安静',
  '宽敞',
  '干净',
  '整洁',
  '漂亮',
  '热情',
  '舒服',
  '舒适',
  '超级'],
 'Hygiene': ['卫生', '友善', '态度', '愉快'],
 'Location': ['bts',
  '交通',
  '位置',
  '地理位置',
  '地铁站',
  '市中心',
  '很近',
  '方便',
  '旁边',
  '车站',
  '附近',
  '靠近'],
 'Room': ['前台', '房间', '泳池', '浴室', '空调', '隔音'],
 'Service & Staff': ['员工', '工作人员', '按摩', '服务', '环境']}

In [13]:
# 2️⃣ 初始化 DataFrame 用于存放转换后的数据
features = list(grouped_dict.keys())  # 10 个一级影响因素
df_transformed = pd.DataFrame(0, index=df.index, columns=features)

# 3️⃣ 遍历每条评论，统计 60 个二级因素的出现次数
for index, row in df.iterrows():
    review_text = row["用户评论_分词_去停用词"]  # 获取用户评论
    for category, keywords in grouped_dict.items():
        for word in keywords:
            if word in review_text:  # 如果评论中包含这个关键词
                df_transformed.at[index, category] += 1  # 计数 +1

# 4️⃣ 添加满意度评分作为目标变量（Y）
df_transformed["Satisfaction rating"] = df["用户评分"]

# 5️⃣ 输出最终数据
df_transformed

Unnamed: 0,Cost-effectiveness,Check-in Process,Convenience,Cost-effectiveness.1,Dining,Facilities & Environment,Guest Impressions,Hygiene,Location,Room,Service & Staff,Satisfaction rating
0,1,0,0,1,0,1,2,1,3,4,1,4.7
1,0,0,0,0,1,2,1,0,0,2,0,5.0
2,0,0,0,0,0,0,3,0,1,0,0,5.0
3,0,0,0,0,0,1,0,0,1,1,1,5.0
4,0,0,0,0,0,1,1,0,1,1,0,4.7
...,...,...,...,...,...,...,...,...,...,...,...,...
7034,0,0,0,0,0,0,1,0,0,1,1,5.0
7035,0,0,0,0,0,0,0,0,0,0,0,3.0
7036,0,0,0,0,0,0,0,0,1,0,0,5.0
7037,0,0,0,0,0,0,0,0,0,0,0,3.0


In [14]:
df_transformed.to_csv("sukhumvit_hotel_Nclean.csv", index=False, encoding="utf-8-sig")

In [32]:
# 将字典转化为 DataFrame
df = pd.DataFrame([(key, word) for key, words in grouped_dict.items() for word in words], columns=['Category', 'Keyword'])

# 按 Category 分组，并将关键词列表化
grouped_df = df.groupby('Category')['Keyword'].apply(list).reset_index()

# 展示分组后的 DataFrame
grouped_df

Unnamed: 0,Category,Keyword
0,Cost-effectiveness,"[便利店, 商场]"
1,Check-in Process,"[入住, 购物中心, 预订]"
2,Convenience,"[乐于助人, 行李]"
3,Cost-effectiveness,"[价格, 便宜]"
4,Dining,"[早餐, 食物]"
5,Facilities & Environment,"[健身房, 性价比, 淋浴, 清洁, 游泳池, 牙刷, 设施, 超市, 酒吧, 餐厅, 饭店]"
6,Guest Impressions,"[便利, 友好, 安静, 宽敞, 干净, 整洁, 漂亮, 热情, 舒服, 舒适, 超级]"
7,Hygiene,"[卫生, 友善, 态度, 愉快]"
8,Location,"[bts, 交通, 位置, 地理位置, 地铁站, 市中心, 很近, 方便, 旁边, 车站, ..."
9,Room,"[前台, 房间, 泳池, 浴室, 空调, 隔音]"
