In [1]:
# 引用需要的library
import numpy as np
import helper
import joblib
import os
import jieba
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import plot_tree
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Load the comments from text files and turn them into feature matrices.
# After this cell (per the original notes, labels are sentiment classes 0/1):
#   X = list of dense feature matrices, one per entry in `vectorizers`
#   y = one-element list holding the labels as a numpy array
folder_path = './user_comments'  # replace with the path to your comments folder
stop_words_path = './stop_words.txt'

# Text vectorizers to compare. Alternative configurations kept for reference:
# vectorizers = [CountVectorizer(ngram_range=(1,1)), TfidfVectorizer(ngram_range=(1,1)), helper.OneHotVectorizer()]
# vectorizers = [CountVectorizer(), TfidfVectorizer()]
vectorizers = [
    CountVectorizer(ngram_range=(1, 2)),
    TfidfVectorizer(ngram_range=(1, 2)),
    helper.OneHotVectorizer(),
]

# Read the comments and their labels from disk
comments, labels = helper.Read_comments_from_file(
    folder_path=folder_path,
    stop_words_path=stop_words_path,
)

print(len(comments), len(labels))

# Vectorize the text once per vectorizer
X = []
for vectorizer in vectorizers:
    # The original note says the result is a scipy.sparse csr_matrix (a sparse matrix);
    # it is converted to a dense array before being stored.
    # NOTE(review): the vectorizers are fitted on the full corpus, before any
    # train/test split that may happen downstream - confirm this is intended.
    comments_vector = vectorizer.fit_transform(comments)
    X.append(comments_vector.toarray())

labels = np.array(labels)
y = [labels]
# To print a full matrix without truncation, set:
# np.set_printoptions(threshold=np.inf)

# Shapes of (feature matrix, labels), in the order
# CountVectorizer, TfidfVectorizer, OneHotVectorizer
for vectorizer_index in range(3):
    print(X[vectorizer_index].shape, y[0].shape)

# Print the vocabulary:
# voca = vectorizers[0].vocabulary_
# print(len(voca))
# for i in voca:
#     print(i, voca[i])
# vocabulary dict

# Re-sort the vocabulary dict and print it ordered by value:
# voca=sorted(voca.items(), key=lambda x: x[1])
# for item in voca:
#     print(item)



Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\gh\AppData\Local\Temp\jieba.cache
Loading model cost 0.332 seconds.
Prefix dict has been built successfully.


5999 5999
(5999, 150149) (5999,)
(5999, 150149) (5999,)
(5999, 23226) (5999,)


In [3]:
# Save every fitted vectorizer to disk as ./models/<VectorizerClassName>
models_dir = "./models/"
# joblib.dump does not create missing parent directories; without this the cell
# fails with FileNotFoundError on a fresh checkout that has no ./models folder.
os.makedirs(models_dir, exist_ok=True)
for vectorizer in vectorizers:
    joblib.dump(value=vectorizer, filename=models_dir + type(vectorizer).__name__)

In [4]:
# Run this cell when tuning / training one particular kind of model.
# Build the list of DataSet objects: three data sets, one per vectorizer.
data_sets = [
    helper.DataSet(X=X[0], y=y[0], vectorizer="Counter"),
    helper.DataSet(X=X[1], y=y[0], vectorizer="Tfidf"),
    helper.DataSet(X=X[2], y=y[0], vectorizer="OneHot"),
]

all_classification_models = helper.Get_test_model()
# all_classification_models = [LGBMClassifier(n_estimators=200, learning_rate=0.1)]
print(all_classification_models)

# Pair every model with every data set (data sets vary in the outer loop)
my_classification_models = []
for data_set in data_sets:
    my_classification_models.extend(
        helper.PredictModel(model, data_set) for model in all_classification_models
    )

# Train & predict
for model in my_classification_models:
    model.fit()
    model.predict()
    # Report line: (model name, vectorizer)(first dimension of X_train_pre),
    # train duration, prediction duration, result of get_eval()
    print(f"({model.model_name},{model.classification_data.vectorizer})({model.X_train_pre.shape[0]}), {model.train_duration}, {model.pred_duration}, {model.get_eval()}")
    model.save()

print("执行结束!!!")

[LogisticRegression(C=10)]
(LogisticRegression,Counter)(4799), 43.87, 0.33, 0.9083333333333333
(LogisticRegression,Tfidf)(4799), 40.08, 0.26, 0.9116666666666666
(LogisticRegression,OneHot)(4799), 6.87, 0.04, 0.9025
执行结束!!!


In [5]:
# Run this cell to compare the metrics of all models.
# Build the list of DataSet objects: three data sets, one per vectorizer.
vectorizer_names = ("Counter", "Tfidf", "OneHot")
data_sets = [
    helper.DataSet(X=X[position], y=y[0], vectorizer=vectorizer_name)
    for position, vectorizer_name in enumerate(vectorizer_names)
]

# The candidate models (9 of them, per the original note)
all_classification_models = helper.Make_model_classifier()
# Build the list of prediction models: every model paired with every data set
my_classification_models = []
for data_set in data_sets:
    my_classification_models += [
        helper.PredictModel(model, data_set) for model in all_classification_models
    ]


# Train & predict
for model in my_classification_models:
    model.fit()
    model.predict()
    # Report line: (model name, vectorizer)(first dimension of X_train_pre),
    # train duration, prediction duration, result of get_eval()
    print(f"({model.model_name},{model.classification_data.vectorizer})({model.X_train_pre.shape[0]}), {model.train_duration}, {model.pred_duration}, {model.get_eval()}")
    model.save()

print("执行结束!!!")

(KNeighborsClassifier,Counter)(4799), 0.0, 6.55, 0.7725
(LogisticRegression,Counter)(4799), 43.6, 0.33, 0.9083333333333333
(DecisionTreeClassifier,Counter)(4799), 40.21, 0.38, 0.83
(SVC,Counter)(4799), 323.86, 78.09, 0.8833333333333333
(RandomForestClassifier,Counter)(4799), 93.99, 0.65, 0.9058333333333334
(AdaBoostClassifier,Counter)(4799), 1871.06, 64.52, 0.8516666666666667
(GradientBoostingClassifier,Counter)(4799), 3560.96, 0.56, 0.8675
(XGBClassifier,Counter)(4799), 133.02, 0.38, 0.8841666666666667
(LGBMClassifier,Counter)(4799), 3.86, 0.32, 0.9016666666666666
(KNeighborsClassifier,Tfidf)(4799), 0.85, 3.11, 0.8566666666666667
(LogisticRegression,Tfidf)(4799), 40.93, 0.27, 0.9116666666666666
(DecisionTreeClassifier,Tfidf)(4799), 41.59, 0.4, 0.8391666666666666
(SVC,Tfidf)(4799), 569.67, 117.55, 0.9058333333333334
(RandomForestClassifier,Tfidf)(4799), 88.05, 0.5, 0.9058333333333334
(AdaBoostClassifier,Tfidf)(4799), 1877.0, 65.32, 0.8516666666666667
(GradientBoostingClassifier,Tfidf)(

In [None]:
# Plotting cell.
# Bar charts make the magnitude of each value visible at a glance and make it
# easy to compare the differences between them.
# Per the original note, this plots train_duration, pred_duration and acc charts
# for the models collected in my_classification_models (populated by an earlier cell).
result_data = helper.Result_analysis(my_classification_models)
helper.Plot_analysis(result_data)


In [None]:
# TODO
# Use PCA for dimensionality reduction?
# How to tell whether a model is overfitting?
# How to optimize the training and inference strategies?

# 1. Implement TF-IDF - done
# 2. Implement one-hot - done
# 3. Preprocess the sparse-matrix data: standardization / normalization? - done
# 4. Improve chart rendering: display Chinese text? - no need
# 5. Improve the way the text-vectorization algorithms are invoked - done
# 6. Filter out stop words (pass them to jieba); need to find a suitable stop-word list - done
# 7. Can jieba accept a list of domain-specific terms?
# 8. English-string issue: change the way spaces are removed? - done
# 9. Optimize file reading - done