In [1]:
import pandas as pd

df = pd.read_excel('data/data.xlsx')

In [2]:
import jieba

df['text'] = df['text'].apply(lambda x: ' '.join(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ikkk\AppData\Local\Temp\jieba.cache
Loading model cost 0.576 seconds.
Prefix dict has been built successfully.


In [3]:
with open('data/stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()

df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])

In [5]:
from sklearn.naive_bayes import MultinomialNB

labels = ['味道好', '味道差', '态度好', '态度差', '卫生状态', '不新鲜']
classifiers = {}
for label in labels:
    y = df[label]
    clf = MultinomialNB()
    clf.fit(X, y)
    classifiers[label] = clf

In [6]:
new_text = '这家店的味道很好，态度也很好，但是卫生有点问题'
new_text = ' '.join(jieba.cut(new_text))
new_text = ' '.join([word for word in new_text.split() if word not in stopwords])

In [7]:
new_X = vectorizer.transform([new_text])

In [8]:
for label, clf in classifiers.items(): 
    proba = clf.predict_proba(new_X)[0, 1] 
    print(f'{label}的概率为{proba:.2f}') 

味道好的概率为0.99
味道差的概率为0.00
态度好的概率为0.20
态度差的概率为0.02
卫生状态的概率为0.96
不新鲜的概率为0.00


In [9]:
#测试 

In [21]:
text = '一般但是食材不新鲜，偶尔吃可以，不建议经常吃,鸡肉就算了，闻起来有点发臭，店内不太卫生' 
text = ' '.join(jieba.cut(text)) 
text = ' '.join([word for word in text.split() if word not in stopwords]) 
new_X = vectorizer.transform([text])  
for label, clf in classifiers.items(): 
    proba = clf.predict_proba(new_X)[0, 1] 
    print(f'{label}的概率为{proba:.2f}') 

味道好的概率为0.01
味道差的概率为0.47
态度好的概率为0.48
态度差的概率为0.00
卫生状态的概率为0.78
不新鲜的概率为0.70


In [23]:
import pickle
with open('classifiers.pickle', 'wb') as f:
    pickle.dump(classifiers, f)
with open('data/vectorizer.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)