In [None]:
import numpy as np
import matplotlib.pyplot as mp
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.tree as st
import sklearn.datasets as sd  # sklearn提供的数据集
import sklearn.utils as su  # 可以把数据集按照行进行打乱
import sklearn.metrics as sm
import sklearn.ensemble as se
import joblib
import os
import random
from dataReadFeature import *
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the data.
# NOTE(review): FlexSensorDataRead and TorchDataset are presumably provided by
# the star-import from dataReadFeature -- confirm.
databasefoler = r"../../data/temp/picFlex"
classLabel = ["word", "word_testData", "digit", "digit_testData"]

# Test split: features and labels for classLabel[1].
test_reader = FlexSensorDataRead(basefolder=databasefoler, classtype=classLabel[1])
test_x, test_y = test_reader.getDataLabel()
# Training split: features and labels for classLabel[0].
train_data = TorchDataset(basefolder=databasefoler, classtype=classLabel[0])
train_x, train_y = train_data.getAllData()

# Show the first sample of each split as a sanity check.
print("test_x", test_x[0], "test_y", test_y[0])
print("train_x", train_x[0], "label_y", train_y[0])

In [None]:
def printPredictionResult(test_y,pred_test_y):
    '''
    Print the accuracy score followed by the text classification report.

    Parameters:
        test_y: ground-truth labels.
        pred_test_y: labels predicted by the model.

    Returns:
        None; both results are written to stdout.
    '''
    # Accuracy: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
    accuracy = sm.accuracy_score(test_y, pred_test_y)
    print(accuracy)
    # classification_report takes (y_true, y_pred) and builds a text report
    # of the main classification metrics.
    report = sm.classification_report(test_y, pred_test_y)
    print('分类报告：', report, sep='\n')

In [None]:
# ------------------ SVM classifier: hyper-parameter search --------------
from sklearn.model_selection import GridSearchCV

# One sub-grid per kernel so that kernel-specific parameters (degree, gamma)
# are only combined with the kernel they apply to.
linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
poly_grid = {'kernel': ['poly'], 'C': [1, 10], 'degree': [2, 3]}
rbf_grid = {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001]}
param_grid = [linear_grid, poly_grid, rbf_grid]

# Exhaustive search with 5-fold cross-validation.
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, cv=5)
grid.fit(train_x, train_y)
best_summary = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.2f" % best_summary)


In [None]:
# ------------------ SVM classifier: fit and evaluate --------------
# gamma must be a float (or 'scale' / 'auto'); the string '0.01' is rejected
# by SVC, so the numeric value is passed instead.
svm_model=svm.SVC(gamma=0.01,C=1)   # TODO: set the parameters from the grid-search result
svm_model.fit(train_x, train_y)
pred_test_y = svm_model.predict(test_x)
# Report accuracy and the per-class metrics.
printPredictionResult(test_y,pred_test_y)

In [None]:
# ------------------ k-nearest-neighbours classifier: hyper-parameter search --------------

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()
# GridSearchCV expects a mapping of parameter name -> list of candidate
# values. The original literal used a comma instead of a colon after
# 'algorithm' (a SyntaxError) and gave n_neighbors as a bare scalar.
# Add more n_neighbors candidates to the list to widen the search.
grid_param={'n_neighbors':[10],'algorithm':['auto','ball_tree','brute']}
grid=GridSearchCV(knn,grid_param,cv=5)
grid.fit(train_x, train_y)
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))

In [None]:
# ------------------ k-nearest-neighbours classifier: fit and evaluate --------------
knn = KNeighborsClassifier()  # TODO: set the model parameters from the search step
knn.fit(X=train_x, y=train_y)
pred_test_knn = knn.predict(test_x)

# Headline metrics, each formatted to four decimal places.
knn_accuracy = sm.accuracy_score(test_y, pred_test_knn)
print('knn:', format(knn_accuracy, '.4f'))
knn_recall = sm.recall_score(test_y, pred_test_knn, average='macro')
print('recall:', format(knn_recall, '.4f'))
knn_f1 = sm.f1_score(test_y, pred_test_knn, average='macro')
print('F1:', format(knn_f1, '.4f'))

# Accuracy plus the full classification report.
printPredictionResult(test_y=test_y, pred_test_y=pred_test_knn)

In [None]:
# ------------------ Logistic Regression classifier: hyper-parameter search --------------
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

lg = LogisticRegression()
# Parameter name -> candidate values. The original literal used a comma
# instead of a colon after 'solver', which is a SyntaxError.
# NOTE(review): not every penalty/solver pair is supported by scikit-learn;
# unsupported pairs fail to fit and should be scored as NaN under
# GridSearchCV's default error_score -- confirm for the installed version.
grid_param={'penalty':['l1', 'l2', 'elasticnet', 'none'],
            'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'C':np.logspace(1, 4, 20)}  # 20 log-spaced values from 1e1 to 1e4
grid=GridSearchCV(lg,grid_param,cv=5)
grid.fit(train_x, train_y)
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))

In [None]:
# ------------------ Logistic Regression classifier: fit and evaluate --------------
lg = LogisticRegression(penalty='l2')  # TODO: set the model parameters from the search step
lg.fit(X=train_x, y=train_y)
pred_test_lg = lg.predict(test_x)

# Accuracy to four decimal places, then the full report.
lg_accuracy = sm.accuracy_score(test_y, pred_test_lg)
print('accuracylg:', format(lg_accuracy, '.4f'))
printPredictionResult(test_y=test_y, pred_test_y=pred_test_lg)

In [None]:
# ------------------ Random Forest classifier: hyper-parameter search --------------
from scipy.stats import randint
# HalvingRandomSearchCV is experimental: the enabling import must run before
# it can be imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier()

# Lists are sampled uniformly; randint(a, b) draws integers from a..b-1.
# NOTE(review): max_features candidates go up to 10 -- confirm the data has
# at least that many features.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# Fixed seed so that the sampled candidates are reproducible.
rng = np.random.RandomState(0)
grid = HalvingRandomSearchCV(estimator=RFC, param_distributions=param_dist,
                            factor=2, random_state=rng)
grid.fit(train_x, train_y)
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))

In [None]:
# ------------------ Random Forest classifier: fit and evaluate --------------
# TODO: set the model parameters from the search step
RFC = RandomForestClassifier(n_estimators=8)
RFC.fit(X=train_x, y=train_y)
pred_test_rfc = RFC.predict(test_x)
printPredictionResult(test_y=test_y, pred_test_y=pred_test_rfc)


In [None]:
# ------------------ Decision tree classifier: hyper-parameter search --------------
from sklearn import tree

# Three independent sub-grids; GridSearchCV evaluates each one separately.
gini_grid = {'criterion': ['gini'],
             'max_depth': [30, 50, 60, 100],
             'min_samples_leaf': [2, 3, 5, 10],
             'min_impurity_decrease': [0.1, 0.2, 0.5]}
criterion_grid = {'criterion': ['gini', 'entropy']}
depth_grid = {'max_depth': [30, 60, 100],
              'min_impurity_decrease': [0.1, 0.2, 0.5]}
param = [gini_grid, criterion_grid, depth_grid]

# Exhaustive search with 6-fold cross-validation.
grid = GridSearchCV(estimator=tree.DecisionTreeClassifier(), param_grid=param, cv=6)
grid.fit(train_x, train_y)
best_summary = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.2f" % best_summary)

In [None]:
# ------------------ Decision tree classifier: fit and evaluate --------------
# TODO: set the model parameters from the search step
tre = tree.DecisionTreeClassifier()
tre.fit(X=train_x, y=train_y)
pred_test_tre = tre.predict(test_x)
printPredictionResult(test_y=test_y, pred_test_y=pred_test_tre)

In [None]:
# ------------------ MLP classifier: hyper-parameter search --------------
from sklearn import neural_network
mlp=neural_network.MLPClassifier(max_iter=1000)

param_grid = {
    'hidden_layer_sizes':[(10, ), (20, ), (5, 5)],
    'activation':['logistic', 'tanh', 'relu'],
    'alpha':[0.001, 0.01, 0.1, 0.4, 1]
}

# sklearn.model_selection is imported as `ms` in this notebook; the original
# call referenced an undefined `model_selection` name and a group-split CV
# built from undefined `gkf`, `X`, `y` and `groups`. Plain 5-fold CV is used
# instead, matching the other searches in this notebook.
grid = ms.GridSearchCV(estimator=mlp,
                       param_grid=param_grid,
                       scoring='accuracy',       # metric used to rank candidates
                       cv=5,                     # 5-fold cross-validation
                       return_train_score=True,  # train scores are not returned by default
                       refit=True,               # refit the best candidate on all the data; see grid.best_estimator_
                       n_jobs=-1)                # use all available CPU cores
grid.fit(train_x, train_y)
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))


In [None]:
# ------------------ MLP classifier: fit and evaluate --------------
# TODO: set the model parameters from the search step
mlp = neural_network.MLPClassifier(max_iter=1000)
mlp.fit(X=train_x, y=train_y)
pred_y = mlp.predict(test_x)
printPredictionResult(test_y=test_y, pred_test_y=pred_y)

In [None]:
# ------------------ XGBoost classifier: hyper-parameter search --------------
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Only the class is imported (there is no `xgb` module alias), so it is
# instantiated directly.
clf1 = XGBClassifier()

# Search ranges for six main XGBoost parameters.
param_dist = {
        'n_estimators':range(80,200,4),
        'max_depth':range(2,15,1),
        'learning_rate':np.linspace(0.01,2,20),
        'subsample':np.linspace(0.7,0.9,20),
        'colsample_bytree':np.linspace(0.5,0.98,10),
        'min_child_weight':range(1,9,1)
        }

# RandomizedSearchCV arguments:
#   clf1                     -- the estimator to tune
#   param_dist               -- dict of parameter search ranges
#   scoring='neg_log_loss'   -- candidates are ranked by negative log-loss
#   n_iter=300               -- number of parameter settings sampled; larger
#                               values search more thoroughly but take longer
#   n_jobs=-1                -- use all CPU cores (the default is a single one)
# GridSearchCV does not accept n_iter, so the randomized search that the
# n_iter setting calls for is used here.
grid = RandomizedSearchCV(clf1,param_dist,cv = 3,scoring = 'neg_log_loss',n_iter=300,n_jobs = -1)
grid.fit(train_x, train_y)
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))

In [None]:
# ------------------ XGBoost classifier: fit and evaluate --------------
# TODO: set the model parameters from the search step
mo = XGBClassifier()
mo.fit(train_x, train_y)
pred_y = mo.predict(test_x)
printPredictionResult(test_y=test_y, pred_test_y=pred_y)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# RandomForestRegressor was never imported, so the original cell raised a
# NameError.
from sklearn.ensemble import RandomForestRegressor

RF = RandomForestRegressor()
# Initial (coarse) parameter space.
n_estimators = [int(x) for x in np.linspace(start = 200,stop = 2000,num = 10)]  # 200, 400, ..., 2000
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,4]
max_depth = [5,8,10]
# 'auto' has been removed from recent scikit-learn releases; for a regressor
# it meant "use all features", which is what 1.0 expresses.
max_features = [1.0,'sqrt']
bootstrap = [True,False]
# Collect the candidates into the dict expected by RandomizedSearchCV.
random_params_group = {'n_estimators':n_estimators,
                      'min_samples_split':min_samples_split,
                      'min_samples_leaf':min_samples_leaf,
                      'max_depth':max_depth,
                      'max_features':max_features,
                      'bootstrap':bootstrap}
# Sample 100 settings, 3-fold CV, ranked by negative mean squared error.
grid =RandomizedSearchCV(RF,param_distributions = random_params_group,n_iter = 100,scoring = 'neg_mean_squared_error',verbose = 2,n_jobs = -1,cv = 3,random_state = 0)
# Run the search on the training data.
grid.fit(train_x, train_y)
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))

In [None]:
# Refine the search with an exhaustive grid around the promising region.
from sklearn.model_selection import GridSearchCV
# RandomForestRegressor was never imported, so the original cell raised a
# NameError.
from sklearn.ensemble import RandomForestRegressor
import time

param_grid = {'n_estimators':[1100,1200,1300],
             'min_samples_split':[4,5,6,7],
             'min_samples_leaf':[3,4,5],
             'max_depth':[4,5,6,7]}
randomForest = RandomForestRegressor()
# 3-fold CV, ranked by negative mean squared error, using all CPU cores.
grid = GridSearchCV(randomForest,param_grid = param_grid,scoring = 'neg_mean_squared_error',cv = 3,n_jobs = -1)
# Time the search so its cost is visible in the output.
start_time = time.time()
grid.fit(train_x, train_y)
end_time = time.time()
print('模型训练用时:{}'.format(end_time - start_time))
print("The best parameters are %s with a score of %0.2f" %(grid.best_params_, grid.best_score_))

In [None]:
# ------------------ Random Forest regressor: fit and evaluate --------------
from sklearn.ensemble import RandomForestRegressor

randomForest = RandomForestRegressor() # TODO: set the model parameters from the search step
randomForest.fit(train_x, train_y)
# Predict with the forest fitted above; the original line called `mo`, which
# is the XGBoost model from another cell.
# The regressor returns continuous values while printPredictionResult
# computes classification metrics, so predictions are rounded to the nearest
# integer label. NOTE(review): assumes integer-coded class labels -- confirm.
pred_y = np.rint(randomForest.predict(test_x)).astype(int)
printPredictionResult(test_y,pred_y)

In [None]:
# Train a linear SVM, persist it with joblib, reload it, and evaluate the
# reloaded copy.
model = svm.SVC(kernel='linear', C=0.58)
model.fit(train_x, train_y)

# Save the model. The target directory is created first, because joblib.dump
# fails when it does not exist.
file = r'../../modefiles/svm_no_yes.joblib'
os.makedirs(os.path.dirname(file), exist_ok=True)
joblib.dump(model,file)
# Load the model back. joblib uses pickle underneath: only load files from a
# trusted source (here, the file written just above).
svm_model = joblib.load(file)
pred_test_y = svm_model.predict(test_x)
# Report accuracy and the classification report via the shared helper.
printPredictionResult(test_y, pred_test_y)
# In the report, the left-hand column holds the class labels and the
# `support` column is the number of occurrences of each label.
# The avg / total row is the mean of each column (the sum for `support`).
# Explanation of macro vs. weighted averages: https://www.cnblogs.com/laozhanghahaha/p/12499979.html
# Save the confusion matrix
#def confusion_matrix(pred_test_y):
#     coonfusion_plot = [[0]*10 for i in range(10)]
#     for i in range(10):
#         for j in range(30):
#             coonfusion_plot[i][pred_test_y[i*30+j]] += 1
#     return coonfusion_plot

# pred_test = confusion_matrix(pred_test_y)
# f_confusion = r'../../modefiles/confusion_matrix_no_yes.txt'
# with open(f_confusion, 'w') as fl3:
#     fl3.write(str(pred_test))

In [None]:
# [6] GBDT (Gradient Boosting Decision Tree) classifier
from sklearn.ensemble import GradientBoostingClassifier

GBDT = GradientBoostingClassifier()
GBDT.fit(X=train_x, y=train_y)
pred_test_GBDT = GBDT.predict(test_x)

# Accuracy, macro-averaged recall and macro-averaged F1 to four decimals.
gbdt_accuracy = sm.accuracy_score(test_y, pred_test_GBDT)
print('accuracyGBDT:', format(gbdt_accuracy, '.4f'))
gbdt_recall = sm.recall_score(test_y, pred_test_GBDT, average='macro')
print('recall:', format(gbdt_recall, '.4f'))
gbdt_f1 = sm.f1_score(test_y, pred_test_GBDT, average='macro')
print('F1:', format(gbdt_f1, '.4f'))

# Bernoulli naive Bayes classifier
from sklearn.naive_bayes import BernoulliNB

Bernoulli = BernoulliNB()
Bernoulli.fit(train_x, train_y)
# Predict with the Bernoulli model fitted above. The original line called
# `Gaussian`, a different model that is not defined until later in this cell.
pred_test_BernoulliNB = Bernoulli.predict(test_x)
print('BernoulliNB:',format(sm.accuracy_score(test_y, pred_test_BernoulliNB),'.4f'))

# Multinomial naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

Multinomial = MultinomialNB()
Multinomial.fit(X=train_x, y=train_y)
pred_test_Multinomial = Multinomial.predict(test_x)
# Accuracy to four decimal places.
multinomial_accuracy = sm.accuracy_score(test_y, pred_test_Multinomial)
print('MultinomialNB:', format(multinomial_accuracy, '.4f'))

# [7] GaussianNB
# Gaussian naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

Gaussian = GaussianNB()
Gaussian.fit(X=train_x, y=train_y)
pred_test_Gaussian = Gaussian.predict(test_x)

# Accuracy, macro-averaged recall and macro-averaged F1 to four decimals.
gaussian_accuracy = sm.accuracy_score(test_y, pred_test_Gaussian)
print('accuracyGaussian:', format(gaussian_accuracy, '.4f'))
gaussian_recall = sm.recall_score(test_y, pred_test_Gaussian, average='macro')
print('recall:', format(gaussian_recall, '.4f'))
gaussian_f1 = sm.f1_score(test_y, pred_test_Gaussian, average='macro')
print('F1:', format(gaussian_f1, '.4f'))