In [152]:
import os
import numpy as np
import pandas as pd
import csv
import random
import time
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
from pandas import Series, DataFrame
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import savgol_filter
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing

In [153]:
#可视化函数，返回一个plt对象
def PlotSpectrum(spec):
    plt.figure(figsize=(5.2, 3.1), dpi=600)
    col = spec.columns.values.tolist()
    x = np.linspace(float(col[0]), float(col[-1]),len(col))
    for i in range(spec.shape[0]):
        plt.plot(x, spec.iloc[i, :], linewidth=1)
    fonts = 10
    plt.xlim(float(col[0]), float(col[-1]))
    plt.xlabel('Wavelength (nm)', fontsize=fonts)
    plt.ylabel('absorbance (AU)', fontsize=fonts)
    plt.yticks(fontsize=fonts)
    plt.xticks(fontsize=fonts)
    plt.tight_layout(pad=0.3)
    plt.grid(True)
    return plt

In [154]:
#定义SG-filter预处理函数
def smooth_acf_savgol_(sdata, windowsize=9, polyorder=3):#s_filter
    col = sdata.columns.values.tolist()
    smoothed = savgol_filter(sdata, windowsize, polyorder)
    return pd.DataFrame(smoothed, columns=col)

In [155]:
#定义minmaxscaler预处理函数
def minmaxScaler(sdata):
    col = sdata.columns.values.tolist()#获取行索引
    scaler = MinMaxScaler()
    scaler.fit(sdata)
    newsdata=scaler.transform(sdata)#得到array
    new_sdata=pd.DataFrame(newsdata, columns=col)#转为dataframe
    return new_sdata
def scale(X_train, X_test):
    col=X_train.columns.tolist()
    scaler = preprocessing.MinMaxScaler().fit(X_train) 
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return [pd.DataFrame(X_train_scaled,columns=col), pd.DataFrame(X_test_scaled,columns=col)] 

In [156]:
def set_random():
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)
    random.seed(12345)

In [157]:
#数据应当按照类别 均匀 划分
def data_split(df,test_percent):
    col = df.columns.values.tolist()
    # label_num=list(set(df['year'].tolist()))
    label_num=[1,3,6,9]
    df_train=pd.DataFrame(columns=col)
    df_test=pd.DataFrame(columns=col)
    for i in label_num:
        # if i==5:continue
        # print(i)
        _train,_test=train_test_split(df[df["year"]==i],test_size = test_percent,random_state=42)
        df_train=df_train.append(_train)#注意覆盖
        df_test=df_test.append(_test)
    return df_train, df_test

In [158]:
#特征与label划分
def feature_label_split(sdata):
    feature=sdata.iloc[:,0:sdata.shape[1]-1]
    label=sdata.iloc[:,sdata.shape[1]-1]
    return feature,label

In [159]:
#数据读取与可视化
data=pd.read_csv('yangben.csv',header=0,index_col=0)
data=data.dropna(axis=0,how='any')
data=data.drop(index=(data[data["462"]>0.15].index))
# PlotSpectrum(data.iloc[:,0:data.shape[1]-1]).show()
# data[data["year"]==3].shape

In [160]:
#数据划分
df_train,df_test=data_split(data,test_percent=0.2)
train_x,train_y=feature_label_split(df_train)
test_x,test_y=feature_label_split(df_test)

In [161]:
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((767, 130), (767,), (193, 130), (193,))

In [162]:
#5折交叉验证,寻找最优参数
def find_k(train_x,train_y):
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=4, random_state=42, shuffle=True)
    kf.get_n_splits(train_x)
    ACCU={}
    for k in range(2,42):
        accu=0
        for train_index, validation_index in kf.split(train_x):
            X_train, X_valid = train_x.iloc[train_index], train_x.iloc[validation_index]
            y_train, y_valid = train_y.iloc[train_index], train_y.iloc[validation_index]

            #使用pls得到降维之后的特征
            pls = PLSRegression(n_components=k)
            pls.fit(X_train,y_train)
            new_X_train=pls.transform(X_train, Y=None, copy=True)
            new_X_valid=pls.transform(X_valid, Y=None, copy=True)
            y_valid = y_valid.astype('int')#注意 object->object
            y_train = y_train.astype('int')
            #使用LDA分类器(无超参数)
            lda=LinearDiscriminantAnalysis()
            lda.fit(new_X_train,y_train)
            y_train_pred=lda.predict(new_X_train)
            y_valid_pred=lda.predict(new_X_valid)
            acc_train=accuracy_score(y_train_pred,y_train)
            acc_valid=accuracy_score(y_valid_pred,y_valid)
            accu+=acc_valid
        ACCU[k]=accu
    #根据验证集选取最优超参数
    k=max(ACCU,key=lambda x:ACCU[x])
    return k

In [163]:
#PLS-LDA
def PLS_LDA(train_x,train_y,test_x,test_y):
    k=find_k(train_x,train_y)
    print("best_k={:d}".format(k))
    #得到最优的PLS-LDA并测试
    PLS_best=PLSRegression(n_components=k)
    PLS_best.fit(train_x,train_y)
    new_train_x=PLS_best.transform(train_x,Y=None, copy=True)#降维
    new_test_x=PLS_best.transform(test_x,Y=None, copy=True)

    train_y = train_y.astype('int')
    test_y = test_y.astype('int')
    lda_best=LinearDiscriminantAnalysis().fit(new_train_x,train_y)
    train_y_pred=lda_best.predict(new_train_x)
    test_y_pred=lda_best.predict(new_test_x)
    
    acc_train = accuracy_score(train_y_pred,train_y)
    acc_test = accuracy_score(test_y_pred,test_y)
    print('\n\ntraining_accuracy:{:.4f}\n test_accuracy:{:.4f}\n'.format(acc_train, acc_test))
    print('test_confusion_matrix:\n',confusion_matrix(test_y_pred,test_y))
    return acc_train,acc_test

In [164]:
def SV_classfic(train_x,train_y,test_x,test_y):
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV
    param_dist = {
    'svc': {
        #'C': [10**k for k in range(-5,10)],
        'C': np.arange(100,900,100).tolist(),
        #'C': np.arange(10,500,20).tolist(),
        #'C': np.arange(30,300,10).tolist(),
        #'C': np.arange(50,265,5).tolist(),
        #'C': [10**k for k in range(-5,10)],
        # 'gamma': [1e-1]
       'gamma':[10**j for j in range(-5,0)]
            }
    }
    svc =svm.SVC(kernel='linear', gamma='auto')
    #svr =svm.SVR(gamma='auto')
    gsearch = GridSearchCV(estimator=svc,param_grid =param_dist['svc'],scoring='neg_mean_absolute_error',n_jobs=-1, 
                           refit=True, cv=5, verbose=0)
    gsearch.fit(train_x,train_y)
    train_y_pred=gsearch.predict(train_x)
    test_y_pred=gsearch.predict(test_x)
    acc_train = accuracy_score(train_y_pred,train_y)
    acc_test = accuracy_score(test_y_pred,test_y)
    print('\n\ntraining_accuracy:{:.4f}\n test_accuracy:{:.4f}\n'.format(acc_train, acc_test))
    print('test_confusion_matrix:\n',confusion_matrix(test_y_pred,test_y))
    print(gsearch.best_params_)
    return acc_train,acc_test


In [165]:
#不采用预处理方法
acc_train,acc_test=PLS_LDA(train_x,train_y,test_x,test_y)
result_table = pd.DataFrame(columns=['algorithm','train_accu', 'test_accu'])
result_PLSLDA = {'algorithm': 'PLS-LDA','train_accu': acc_train,'test_accu': acc_test}
result_table = result_table.append(result_PLSLDA, sort=False, ignore_index=True)
result_table

best_k=40


training_accuracy:0.7718
 test_accuracy:0.6891

test_confusion_matrix:
 [[ 9  0  0  0]
 [ 3 59 16  2]
 [ 4 26 49  5]
 [ 0  2  2 16]]


Unnamed: 0,algorithm,train_accu,test_accu
0,PLS-LDA,0.771838,0.689119


In [166]:
#minmax数据预处理与可视化
df_train,df_test=data_split(data,test_percent=0.2)
train_x,train_y=feature_label_split(df_train)
test_x,test_y=feature_label_split(df_test)
train_x,test_x=scale(train_x,test_x)
# PlotSpectrum(train_x).show()
# PlotSpectrum(test_x).show()
#minmaxscaling+PLS-LDA
acc_train,acc_test=PLS_LDA(train_x,train_y,test_x,test_y)
result_minmax_PLSLDA = {'algorithm': 'minmax+PLS-LDA','train_accu': acc_train,'test_accu': acc_test}
result_table = result_table.append(result_minmax_PLSLDA, sort=False, ignore_index=True)
result_table

best_k=40


training_accuracy:0.7718
 test_accuracy:0.6891

test_confusion_matrix:
 [[ 9  0  0  0]
 [ 3 59 16  2]
 [ 4 26 49  5]
 [ 0  2  2 16]]


Unnamed: 0,algorithm,train_accu,test_accu
0,PLS-LDA,0.771838,0.689119
1,minmax+PLS-LDA,0.771838,0.689119


In [167]:
#SGFILTER数据预处理与可视化
df_train,df_test=data_split(data,test_percent=0.2)
train_x,train_y=feature_label_split(df_train)
test_x,test_y=feature_label_split(df_test)

train_x=smooth_acf_savgol_(train_x)
test_x=smooth_acf_savgol_(test_x)

# PlotSpectrum(train_x).show()
# PlotSpectrum(test_x).show()
#SG_filter+PLS-LDA
acc_train,acc_test=PLS_LDA(train_x,train_y,test_x,test_y)
result_SGF_PLSLDA = {'algorithm': 'SGF+PLS-LDA','train_accu': acc_train,'test_accu': acc_test}
result_table = result_table.append(result_SGF_PLSLDA, sort=False, ignore_index=True)
result_table

best_k=35


training_accuracy:0.7432
 test_accuracy:0.7047

test_confusion_matrix:
 [[10  0  0  0]
 [ 2 62 17  1]
 [ 4 23 48  6]
 [ 0  2  2 16]]


Unnamed: 0,algorithm,train_accu,test_accu
0,PLS-LDA,0.771838,0.689119
1,minmax+PLS-LDA,0.771838,0.689119
2,SGF+PLS-LDA,0.743155,0.704663


In [168]:
#不采用预处理方法
acc_train,acc_test=SV_classfic(train_x,train_y,test_x,test_y)
result_table = pd.DataFrame(columns=['algorithm','train_accu', 'test_accu'])
result_SVC = {'algorithm': 'SVC','train_accu': acc_train,'test_accu': acc_test}
result_table = result_table.append(result_SVC, sort=False, ignore_index=True)
result_table



training_accuracy:0.7419
 test_accuracy:0.7150

test_confusion_matrix:
 [[14  0  0  0]
 [ 2 60 17  2]
 [ 0 21 46  3]
 [ 0  6  4 18]]
{'C': 800, 'gamma': 1e-05}


Unnamed: 0,algorithm,train_accu,test_accu
0,SVC,0.741851,0.715026


In [169]:
#minmax数据预处理与可视化
df_train,df_test=data_split(data,test_percent=0.2)
train_x,train_y=feature_label_split(df_train)
test_x,test_y=feature_label_split(df_test)
train_x,test_x=scale(train_x,test_x)
# PlotSpectrum(train_x).show()
# PlotSpectrum(test_x).show()
#minmaxscaling+SVC
acc_train,acc_test=SV_classfic(train_x,train_y,test_x,test_y)
result_SVC = {'algorithm': 'minmaxscaling+SVC','train_accu': acc_train,'test_accu': acc_test}
result_table = result_table.append(result_SVC, sort=False, ignore_index=True)
result_table



training_accuracy:0.8096
 test_accuracy:0.7409

test_confusion_matrix:
 [[13  1  0  0]
 [ 3 63 15  1]
 [ 0 20 47  2]
 [ 0  3  5 20]]
{'C': 200, 'gamma': 1e-05}


Unnamed: 0,algorithm,train_accu,test_accu
0,SVC,0.741851,0.715026
1,minmaxscaling+SVC,0.809648,0.740933


In [170]:
#SGFILTER数据预处理与可视化
df_train,df_test=data_split(data,test_percent=0.2)
train_x,train_y=feature_label_split(df_train)
test_x,test_y=feature_label_split(df_test)

train_x=smooth_acf_savgol_(train_x)
test_x=smooth_acf_savgol_(test_x)

# PlotSpectrum(train_x).show()
# PlotSpectrum(test_x).show()
#SG_filter+SVC
acc_train,acc_test=SV_classfic(train_x,train_y,test_x,test_y)
result_SVC = {'algorithm': 'SG_filter+SVC','train_accu': acc_train,'test_accu': acc_test}
result_table = result_table.append(result_SVC, sort=False, ignore_index=True)
result_table



training_accuracy:0.7419
 test_accuracy:0.7150

test_confusion_matrix:
 [[14  0  0  0]
 [ 2 60 17  2]
 [ 0 21 46  3]
 [ 0  6  4 18]]
{'C': 800, 'gamma': 1e-05}


Unnamed: 0,algorithm,train_accu,test_accu
0,SVC,0.741851,0.715026
1,minmaxscaling+SVC,0.809648,0.740933
2,SG_filter+SVC,0.741851,0.715026
