In [1]:
import jieba
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import sklearn.feature_extraction.text as ft


# Load the review dataset from htl_all.csv and count the rows per label value
data = pd.read_csv("htl_all.csv")
data["label"].value_counts()

1    5322
0    2443
Name: label, dtype: int64

In [5]:
def func(item):
    """Segment one review with jieba and return its tokens joined by spaces.

    Parameters
    ----------
    item : str
        Raw review text. ``None`` and float NaN (the values pandas uses for
        a missing cell) are treated as an empty review.

    Returns
    -------
    str
        The tokens yielded by ``jieba.cut_for_search`` separated by single
        spaces, or ``''`` when the review is missing.
    """
    # Missing values are not text, so return an empty document instead of
    # handing them to jieba. NaN is the only float that differs from itself.
    if item is None or (isinstance(item, float) and item != item):
        return ''
    # Word segmentation (search-engine mode)
    seg_list = jieba.cut_for_search(item)
    return ' '.join(seg_list)


# Tokenise every review with func and keep the space-joined tokens in a new
# column, so the vectorizer below can split the text on whitespace.
data['new_review'] = data['review'].apply(func)
data.head()

# Build the feature matrix: raw term counts first, then TF-IDF weighting.
cv = ft.CountVectorizer()
tt = ft.TfidfTransformer()
bow = cv.fit_transform(data['new_review'])
tfidf = tt.fit_transform(bow)
print(tfidf.shape)

# Hold out 30% of the samples as a test set; the fixed seed keeps the split
# repeatable between runs.
train_x, test_x, train_y, test_y = ms.train_test_split(
    tfidf, data["label"], test_size=0.3, random_state=7)

# Linear-kernel SVM: 10-fold cross-validated weighted F1 over the full
# feature matrix ...
model = svm.SVC(kernel='linear')
scores = ms.cross_val_score(
    model, tfidf, data["label"], cv=10, scoring='f1_weighted')
print(np.mean(scores))

# ... then fit on the training split and report on the held-out split.
pred_test_y = model.fit(train_x, train_y).predict(test_x)
print(sm.classification_report(test_y, pred_test_y))

(7765, 29936)
0.8764324141561269
              precision    recall  f1-score   support

           0       0.82      0.69      0.75       717
           1       0.87      0.93      0.90      1613

    accuracy                           0.86      2330
   macro avg       0.84      0.81      0.83      2330
weighted avg       0.85      0.86      0.85      2330



# Brute-force drawing of the classification boundary:
# take 100 evenly spaced x coordinates between the min and max of x1,
# and 100 evenly spaced y coordinates between the min and max of x2,
# which gives 10,000 grid points in total.
# NOTE(review): the code above only reads the 'label' and 'review' columns of
# `data`; whether 'x1'/'x2' columns exist is not shown here. This block looks
# carried over from a 2-feature example - confirm before running.
xs = np.linspace(data['x1'].min(), data["x1"].max(), 100)
ys = np.linspace(data['x2'].min(), data["x2"].max(), 100)
points = []
for x in xs:
    for y in ys:
        points.append([x, y])
points = np.array(points)
# Predict a class label for every grid point and draw the points as a scatter.
# NOTE(review): `points` has 2 columns, while the most recent `model.fit` above
# was given `train_x`, a split of the TF-IDF matrix - the feature counts look
# incompatible; verify.
point_labels = model.predict(points)
plt.scatter(points[:, 0], points[:, 1], c=point_labels, cmap="gray")
# NOTE(review): `test_x` is produced by train_test_split on `tfidf`; indexing
# it with 'x1'/'x2' presumably expects a DataFrame instead - confirm.
plt.scatter(test_x['x1'], test_x['x2'], c=test_y, cmap="brg")

# Polynomial kernel (degree 2): fit on the training split and print the
# classification report for the held-out split.
model = svm.SVC(kernel='poly', degree=2)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))

# Predict a class label for every grid point and draw the points as a scatter.
# NOTE(review): `points` is the 2-column grid built earlier in this file and
# `test_x` is indexed by 'x1'/'x2'; both presumably assume a 2-feature dataset
# rather than the features `model` was just fitted on - confirm.
point_labels = model.predict(points)
plt.scatter(points[:, 0], points[:, 1], c=point_labels, cmap="gray")
plt.scatter(test_x['x1'], test_x['x2'], c=test_y, cmap="brg")


# Radial basis function (RBF) kernel with C=1 and gamma=50: fit on the
# training split and predict the held-out split (the report is disabled).
model = svm.SVC(kernel='rbf', C=1, gamma=50)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
# print(sm.classification_report(test_y, pred_test_y))

# Predict a class label for every grid point and draw the points as a scatter.
# NOTE(review): `points` is the 2-column grid built earlier in this file and
# `test_x` is indexed by 'x1'/'x2'; both presumably assume a 2-feature dataset
# rather than the features `model` was just fitted on - confirm.
point_labels = model.predict(points)
plt.scatter(points[:, 0], points[:, 1], c=point_labels, cmap="gray")
plt.scatter(test_x['x1'], test_x['x2'], c=test_y, cmap="brg")

## 通过网格搜索寻求最优超参数组合

In [None]:
# One grid per kernel family; the search tries every combination inside each
# dict.
params = [{'kernel': ['linear'], 'C': [1, 10, 100]},
          {'kernel': ['poly'], 'degree': [2, 3]},
          {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [1, 0.01, 0.1]}]

# Rank candidates by weighted F1 - the metric the cross_val_score calls in
# this notebook use - instead of the estimator's default score, and spread
# the candidate fits over all available cores.
model = ms.GridSearchCV(svm.SVC(), params, cv=10,
                        scoring='f1_weighted', n_jobs=-1)

# Nested cross-validation: each of the 10 outer folds runs the whole 10-fold
# grid search again, so this is by far the most expensive step of the cell.
scores = ms.cross_val_score(model, tfidf, data["label"],
                            cv=10, scoring='f1_weighted')
print(scores.mean())

# Run the search on the training split and report the winning combination.
model.fit(train_x, train_y)
print(model.best_params_)
print(model.best_score_)
print(model.best_estimator_)

# predict() delegates to the best estimator refitted on the training split;
# evaluate it on the held-out split.
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))