In [1]:
# 引用需要的library
import numpy as np
import helper
import joblib
import os
import jieba
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import plot_tree

In [2]:
# Read the labelled comments from text files, once per vectorizer.
# Raw content: X = comment (string), y = sentiment class (0/1).
# After the loop, X[k] / y[k] hold the result produced with vectorizers[k].
X = []
y = []
folder_path = './user_comments'  # replace with the path to your comments folder
stop_words_path = './stop_words.txt'
# Text vectorization schemes to compare.
vectorizers = [CountVectorizer(), TfidfVectorizer(), helper.OneHotVectorizer()]
# For a quick single-vectorizer run use: vectorizers = [CountVectorizer()]
for vectorizer in vectorizers:
    features, labels = helper.Read_comments_from_file(
        folder_path=folder_path,
        vectorizer=vectorizer,
        stop_words_path=stop_words_path,
    )
    X.append(features)
    y.append(labels)
# X = sparse matrix (int), y = sentiment class (0/1).
# Print one shape line per vectorizer, in the order of `vectorizers`
# (CountVectorizer, TfidfVectorizer, OneHotVectorizer by default). Looping over
# what was actually loaded keeps this working when the list above is shortened.
for features, labels in zip(X, y):
    print(features.shape, labels.shape)



Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/d6/mm5k9h3n5_924kj876nnys4w0000gn/T/jieba.cache
Loading model cost 0.343 seconds.
Prefix dict has been built successfully.


(5999, 21410) (5999,)
(5999, 21410) (5999,)
(5999, 23431) (5999,)


In [4]:
# Save every vectorizer object under ./models, one file per vectorizer,
# named after the vectorizer's class (e.g. ./models/CountVectorizer).
models_dir = "./models/"
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; exist_ok keeps the cell safe to re-run.
os.makedirs(models_dir, exist_ok=True)
for vectorizer in vectorizers:
    joblib.dump(value=vectorizer, filename=models_dir + type(vectorizer).__name__)

In [3]:
# Build the list of DataSet objects: three data sets, one per vectorization scheme.
# The label at position k describes the features stored in X[k] / y[k].
dataset_labels = ("Counter", "Tfidf", "OneHot")
data_sets = [
    helper.DataSet(X=X[position], y=y[position], vectorizer=label)
    for position, label in enumerate(dataset_labels)
]

all_classification_models = helper.Get_test_model()

# Wrap every (data set, model) combination; data sets vary slowest.
my_classification_models = []
for data_set in data_sets:
    my_classification_models.extend(
        helper.PredictModel(model, data_set) for model in all_classification_models
    )

# Train & predict
for model in my_classification_models:
    model.fit()
    model.predict()
    # Print the prediction results:
    # (model, vectorizer)(training rows), train duration, predict duration, evaluation score
    print(f"({model.model_name},{model.classification_data.vectorizer})({model.X_train_pre.shape[0]}), {model.train_duration}, {model.pred_duration}, {model.get_eval()}")
    model.save()

print("执行结束!!!")

(RandomForestClassifier,Counter)(4799), 10.67, 0.1, 0.8941666666666667
(RandomForestClassifier,Tfidf)(4799), 9.9, 0.1, 0.8841666666666667
(RandomForestClassifier,OneHot)(4799), 8.51, 0.1, 0.8983333333333333
执行结束!!!


In [4]:
# Build the list of DataSet objects: three data sets, one per vectorization scheme.
# The label at position k describes the features stored in X[k] / y[k].
dataset_labels = ("Counter", "Tfidf", "OneHot")
data_sets = [
    helper.DataSet(X=X[position], y=y[position], vectorizer=label)
    for position, label in enumerate(dataset_labels)
]

# The 9 candidate models
all_classification_models = helper.Make_model_classifier()
# Build the list of prediction models: every (data set, model) combination,
# with data sets varying slowest.
my_classification_models = []
for data_set in data_sets:
    my_classification_models.extend(
        helper.PredictModel(model, data_set) for model in all_classification_models
    )


# Train & predict
for model in my_classification_models:
    model.fit()
    model.predict()
    # Print the prediction results:
    # (model, vectorizer)(training rows), train duration, predict duration, evaluation score
    print(f"({model.model_name},{model.classification_data.vectorizer})({model.X_train_pre.shape[0]}), {model.train_duration}, {model.pred_duration}, {model.get_eval()}")
    model.save()

print("执行结束!!!")

(KNeighborsClassifier,Counter)(4799), 1.85, 1.5, 0.6783333333333333
(LogisticRegression,Counter)(4799), 4.68, 0.53, 0.8441666666666666
(DecisionTreeClassifier,Counter)(4799), 7.03, 0.37, 0.8125
(SVC,Counter)(4799), 122.86, 28.37, 0.8133333333333334
(RandomForestClassifier,Counter)(4799), 14.04, 0.54, 0.88
(AdaBoostClassifier,Counter)(4799), 139.94, 4.02, 0.8366666666666667
(GradientBoostingClassifier,Counter)(4799), 300.47, 0.49, 0.8516666666666667
(XGBClassifier,Counter)(4799), 68.24, 0.11, 0.8616666666666667
[LightGBM] [Info] Number of positive: 2384, number of negative: 2415
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4292
[LightGBM] [Info] Number of data points in the train set: 4799, number of used features: 994
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496770 -> in

In [None]:
# Bar charts: show the magnitude of each value at a glance and make the
# differences between runs easy to compare.
# Plot separate charts for train_duration, pred_duration and acc.
# Requires `my_classification_models` from the training cell above.
result_data = helper.Result_analysis(my_classification_models)
helper.Plot_analysis(result_data)


In [6]:
# TODO
# 1. Implement TF-IDF - done
# 2. Implement one-hot - done
# 3. Preprocess the sparse-matrix data: standardization/normalization? PCA dimensionality reduction?
# 4. Improve chart display: render Chinese text?
# 5. Improve how the text-vectorization algorithms are invoked - done
# 6. Filter out stop words (pass parameters to jieba); need to find a suitable stop-word list - stop-word logic has been added
# 7. Can jieba accept a list of domain-specific words?
# 8. English string issue: change how whitespace is removed? - done
# 9. How to tell whether a model is overfitting?
# 10. How to optimize the training and inference strategies