In [7]:
import numpy as np
import os
import jieba

In [8]:
# 创建空的句子和标签列表
sentences = []
labels = []

# 读取语料库
file_path = "./user_comments/"
file_list = os.listdir(file_path)
for file in file_list:
    new_path = file_path + file + "/"
    file_names = os.listdir(new_path)
    for file_name in file_names:
        if file_name.endswith(".txt"):
            with open(file=new_path + file_name, mode='r', encoding='gbk', errors='ignore') as f:
                content = f.read().strip().replace('\n', '').replace('\t', '').replace(' ', '')
                sentences.append(content)
                label = 1 if file == "pos" else 0
                labels.append(label)


In [3]:
# 查看数据集的大小
len(sentences)

6000

In [42]:
# 检测标签是否读取正确
print('前10行标签为：', labels[:10])
print('后10行标签为：', labels[-10:])

前10行标签为： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
后10行标签为： [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [54]:
# jieba加载自定义字典
jieba.load_userdict(f="./dicts/dict.txt")

# 停用词 stopwords todo

In [9]:
'''
    构建字典
        - 词
'''
# 生词标记为"<UNK>"，测试集有可能会有生词
words_set = {"<UNK>"}

for sentence in sentences:
    words_set = words_set.union(set(jieba.lcut(sentence)))



In [5]:
' '.join(jieba.lcut(sentences[0]))

'标准间 太 差 房间 还 不如 3 星 的 而且 设施 非常 陈旧 . 建议 酒店 把 老 的 标准间 从 新 改善 .'

In [10]:
# 构建列表
words_list = list(words_set)

# 构建字典
words_dict = {word:idx for idx, word in enumerate(words_list)}

# 字典长度
dict_len = len(words_set)

In [53]:
dict_len

23660

In [69]:
'''
    数据向量化
        - 词袋模型
'''
X = []
for sentence in sentences:
    x = [0] * dict_len
    for word in set(jieba.lcut(sentence)):
        idx = words_dict[word] if word in words_dict else words_dict["<UNK>"]
        x[idx] = 1
    X.append(x)


In [11]:
'''
    数据向量化
        - Count 词频
'''
X2 = []
for sentence in sentences:
    x = [0] * dict_len
    for word in sentence:
        idx = words_dict[word] if word in words_dict else words_dict["<UNK>"]
        x[idx] += 1
    X2.append(x)

In [12]:
'''
    词频 term frequency： 句内重复多比较好
    逆文档频率 idf：所有文档中出现的越少越好
    tf-idf ： 词频 * 逆文档频率，强调句内重要性和句间重要性
'''
word_doc_freq = [0] * dict_len
for word in words_list:
    for sentence in sentences:
        if word in sentence:
            idx = words_dict[word] if word in words_dict else words_dict["<UNK>"]
            word_doc_freq[idx] += 1


In [20]:
word_doc_freq = np.array(word_doc_freq)
# 出现最多的字
words_list[word_doc_freq.argmax()]

'的'

In [27]:
# 逆文档频率
idf1 = (len(sentences) + 1) / (word_doc_freq + 1)
# MinMax标准化
# idf1 = (idf1 - idf1.min()) / (idf1.max() - idf1.min())
# log
idf1 = np.log(idf1)
print('逆文档频率：', idf1)

X2 = X2 * idf1

逆文档频率： [5.92709268 7.31338704 5.56418719 ... 7.09024349 8.00653422 8.00653422]


In [70]:
X = np.array(X)
y = np.array(labels)

In [71]:
X.shape, y.shape

((6000, 23660), (6000,))

In [72]:
# 整个数据集，一共多少非0值元素
X[X>0].size

343899

In [73]:
# 计算稀疏性
X[X>0].size / 6000 / dict_len

0.002422506339814032

In [76]:
# 切分数据集
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,shuffle=True)

In [82]:
"""机器学习分类算法"""

from sklearn.ensemble import RandomForestClassifier

# 构建模型
rfc = RandomForestClassifier()

# 训练模型
rfc.fit(X_train, y_train)

# 预测结果
y_pred = rfc.predict(X_test)

# 模型评估
print(rfc.score(X=X_test, y=y_test))

0.9008333333333334


In [89]:
def predict(s):
    # 向量化
    x = [0] * dict_len
    for word in set(jieba.lcut(s)):
        idx = words_dict[word] if word in words_dict else words_dict["UNK"]
        x[idx] = 1

    X = np.array(x).reshape(1, -1)
    y_pred = rfc.predict(X)
    result = "负面评论" if y_pred[0] == 0 else "正面评论"
    return result

In [91]:
s = '房间明亮，干净整洁'
predict(s)

'正面评论'

In [92]:
s = '房间很暗，没有窗户'
predict(s)

'负面评论'

In [93]:
# 保存模型
import joblib
joblib.dump(value={"words_dict":words_dict, "model":rfc}, filename="./models/MLP.team1")

['./models/MLP.team1']

In [94]:
# 读取模型
result = joblib.load("./models/MLP.team1")
result.keys()

dict_keys(['words_dict', 'model'])

In [95]:
words_dict = result["words_dict"]
model = result["model"]

In [97]:
model

In [4]:
file_path = "./user_comments/"
# 读取file_path路径下的文件和文件夹
import os
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

def vectorize_data(data):
    vectorizer = CountVectorizer()
    vectorizer.fit(data)
    vectorizer_data = vectorizer.transform(data)
    return vectorizer_data
def tfidf_data(vectorizer_data):
    transformer = TfidfTransformer()
    transformer.fit(vectorizer_data)
    transformer_data = transformer.transform(vectorizer_data)
    return transformer_data
def train_model(transformer_data, data):
    model = MultinomialNB()
    model.fit(transformer_data, data)
    return model
def predict(model, transformer_data):
    predict = model.predict(transformer_data)
    return predict
def main():
    file_list = read_file(file_path)
    data = read_data(file_list)
    data = clean_data(data)
    vectorizer_data = vectorize_data(data)
    transformer_data = tfidf_data(vectorizer_data)
    model = train_model(transformer_data, data)
    predict = predict(model, transformer_data)
    print(predict)
    main()
    if __name__ == '__main__':
        main()




TypeError: 'module' object is not callable