> This notebook is modified from https://github.com/andy6804tw/crazyai-ml

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import signal_processing

## 1) 載入資料集
### 1.1) psd spectrum (averaged over windows)

In [None]:
# Read the window-averaged PSD workbook and transpose it; the labels below are
# parsed from the resulting row index.
X = signal_processing.read_sheets('psd.xlsx', combine=True, usecols=[0, 1, 2, 3]).transpose()

# For every index entry: take the text after the last space, keep the part
# before the first '_', and map it through class_label.
y = np.array([
    signal_processing.class_label(entry.rsplit(' ', 1)[-1].partition('_')[0])
    for entry in X.index
])

### 1.2) psd spectrum (unaveraged)

In [3]:
# Load the unaveraged (per-window) PSD workbook into a single frame.
df = signal_processing.read_sheets('psd_window.xlsx', combine=True, axis=0)
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

There are 126 sheets in this workbook ( psd_window.xlsx )
          0         1         2         3         4         5             6  \
0  0.000394  0.000931  0.000232  0.000036  0.000125  0.000029  2.462081e-08   
1  0.000388  0.000923  0.000232  0.000036  0.000126  0.000029  4.138839e-08   
2  0.000398  0.000929  0.000230  0.000035  0.000122  0.000029  6.400088e-08   
3  0.000396  0.000925  0.000227  0.000036  0.000124  0.000029  8.550375e-08   
4  0.000251  0.000593  0.000147  0.000035  0.000123  0.000029  1.333229e-07   

          7         8         9  ...           504           505  \
0  0.000004  0.000017  0.000005  ...  1.072361e-08  4.413828e-09   
1  0.000004  0.000017  0.000005  ...  9.712649e-09  6.340550e-09   
2  0.000004  0.000017  0.000005  ...  6.594086e-09  7.198611e-09   
3  0.000004  0.000018  0.000005  ...  4.066844e-09  4.107293e-09   
4  0.000005  0.000018  0.000005  ...  1.051115e-08  2.345412e-08   

            506           507           508           509 

In [4]:
# Derive two descriptive columns from df['name'] by fixed-position slicing:
# everything from character index 7 onward is stored as the sensor channel,
# and the first 6 characters as the sample number.
name_column = df['name']
df['channel'] = [entry[7:] for entry in name_column]
df['sample_num'] = [entry[:6] for entry in name_column]
print(df.head())

          0         1         2         3         4         5             6  \
0  0.000394  0.000931  0.000232  0.000036  0.000125  0.000029  2.462081e-08   
1  0.000388  0.000923  0.000232  0.000036  0.000126  0.000029  4.138839e-08   
2  0.000398  0.000929  0.000230  0.000035  0.000122  0.000029  6.400088e-08   
3  0.000396  0.000925  0.000227  0.000036  0.000124  0.000029  8.550375e-08   
4  0.000251  0.000593  0.000147  0.000035  0.000123  0.000029  1.333229e-07   

          7         8         9  ...           506           507  \
0  0.000004  0.000017  0.000005  ...  9.671872e-09  3.596593e-09   
1  0.000004  0.000017  0.000005  ...  1.997388e-08  2.283715e-09   
2  0.000004  0.000017  0.000005  ...  2.412018e-08  4.293159e-09   
3  0.000004  0.000018  0.000005  ...  1.866381e-08  4.129371e-09   
4  0.000005  0.000018  0.000005  ...  1.782503e-08  2.957801e-09   

            508           509           510           511           512  \
0  4.412334e-08  2.614256e-08  2.964701e-

## 2) 切割訓練集與測試集
### 2.1) option 1: use all sample data for train/test

In [None]:
# Restrict the data to a single sensor channel; each row is labelled from its
# sample number.
X = df.loc[df['channel'] == 'ud_axial']
y = np.array(list(map(signal_processing.class_label, X['sample_num'])))

# Keep the first 513 columns (by position) as features and renumber rows from 0.
X = X.iloc[:, :513].reset_index(drop=True)
print(X)
print(y.shape)

from sklearn.model_selection import train_test_split

# Stratified 70/30 split of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print('train shape:', X_train.shape)
print('test shape:', X_test.shape)

### 2.2) option 2: split by sample number — train on one subset of samples and test on a disjoint subset

In [5]:
def label_transfer(sample_num: str):
    '''
    Collapse the class label of a sample number into a binary label.

    Returns 0 when signal_processing.class_label(sample_num) equals 0,
    and 1 for any other class label.
    '''
    is_nonzero_class = signal_processing.class_label(sample_num) != 0
    return 1 if is_nonzero_class else 0

In [None]:
# Select one sensor channel and shuffle its rows. The shuffle is seeded so
# that re-running the cell reproduces the same split.
X = df.loc[df['channel'] == 'lr_right'].sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out a proportion of the distinct sample numbers for testing, so the
# train and test sets never share a part number.
all_sample = X.value_counts(subset='sample_num')
test_size = 0.3
test_sample = all_sample.sample(n=int(all_sample.shape[0] * test_size), random_state=42)

print('test sample number:', test_sample.index.to_list())
print('test_sample type:', [label_transfer(sample_num) for sample_num in test_sample.index])

# Separate train and test rows. test_sample is indexed by sample number, so
# membership is checked against that index explicitly, in one vectorized pass.
in_test = X['sample_num'].isin(test_sample.index)
X_train = X.loc[~in_test]
X_test = X.loc[in_test]

y_train = np.array([label_transfer(sample_num) for sample_num in X_train['sample_num']])
y_test = np.array([label_transfer(sample_num) for sample_num in X_test['sample_num']])

# Keep only the first 200 columns (by position) as features and renumber rows from 0.
# NOTE(review): the option-1 cell keeps 513 columns; confirm 200 is intentional.
X_train = X_train.iloc[:, :200].reset_index(drop=True)
X_test = X_test.iloc[:, :200].reset_index(drop=True)
print('train shape:', X_train.shape)
print('test shape:', X_test.shape)

test sample number: ['001833', '000053', '003861', '000048', '000051', '000052']
test_sample type: [1, 0, 1, 0, 0, 0]
train shape: (5881, 200)
test shape: (2398, 200)


## Auto-sklearn
首先我們來測試第一版的 Auto-sklearn。以下是模型常用的超參數以及方法，詳細內容可以參考官方 API [文件](https://automl.github.io/auto-sklearn/master/api.html)。

Parameters:
- time_left_for_this_task: 搜尋時間(秒)，預設3600秒(60分鐘)。
- per_run_time_limit: 每個模型訓練的上限時間，預設為time_left_for_this_task的1/10。
- ensemble_size: 模型輸出數量，預設50。
- resampling_strategy: 資料採樣方式。為了避免過擬合，可以採用交叉驗證機制。預設方法為最基本的 holdout。

Attributes:
- cv_results_: 查詢模型搜尋結果以及每個最佳模型的超參數。

Methods:
- fit: 放入X、y進行模型擬合。
- refit: 使用 fit 尋找好的參數後，再使用所有的資料進行最後微調。
- predict: 預測並回傳預測類別。
- score: 預測成功的比例。
- predict_proba: 預測每個類別的機率值。
- leaderboard: 顯示 k 個 ensemble 模型並排名。

In [None]:
import autosklearn.classification

# Search budget: 600 s in total and at most 200 s per candidate model.
# Candidates are evaluated with 5-fold cross-validation.
automlclassifierV1 = autosklearn.classification.AutoSklearnClassifier(
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    time_left_for_this_task=600,
    per_run_time_limit=200,
)
automlclassifierV1.fit(X_train, y_train)

In [None]:
# Proportion of correct predictions on the training set, then on the test set.
train_score = automlclassifierV1.score(X_train, y_train)
print('automlclassifierV1 訓練集: ', train_score)
test_score = automlclassifierV1.score(X_test, y_test)
print('automlclassifierV1 測試集: ', test_score)

In [None]:
# Inspect the search results as a table, highest mean test score first.
df_cv_results = pd.DataFrame(automlclassifierV1.cv_results_)
df_cv_results = df_cv_results.sort_values('mean_test_score', ascending=False)
df_cv_results

In [None]:
# Ensemble result: list only the models that are part of the final ensemble,
# with the detailed columns included.
automlclassifierV1.leaderboard(ensemble_only=True, detailed=True)

### 使用 Auto-sklearn 2.0

In [7]:
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

# Same budget as the first search: 600 s in total, at most 200 s per model.
automlclassifierV2 = AutoSklearn2Classifier(
    per_run_time_limit=200,
    time_left_for_this_task=600,
)
automlclassifierV2.fit(X_train, y_train)

AutoSklearn2Classifier(metric=accuracy, per_run_time_limit=200,
                       time_left_for_this_task=600)

In [8]:
# Proportion of correct predictions on the training set, then on the test set.
train_score = automlclassifierV2.score(X_train, y_train)
print('automlclassifierV2 訓練集: ', train_score)
test_score = automlclassifierV2.score(X_test, y_test)
print('automlclassifierV2 測試集: ', test_score)

automlclassifierV2 訓練集:  1.0
automlclassifierV2 測試集:  0.682652210175146


## 真實分類

In [None]:
# Build a DataFrame for the test set: same values and index as X_test, with
# the column labels converted to strings.
columns = [str(label) for label in X_test.columns]
df_test = pd.DataFrame(X_test.to_numpy(), index=X_test.index, columns=columns)

# Attach the true label and the model's prediction side by side.
df_test['Type'] = y_test
pred = automlclassifierV2.predict(X_test)
df_test['Predict'] = pred
#df_test.to_excel('prediction_window.xlsx') # for further analysis

In [None]:
# Scatter the test rows on columns '52' and '53', coloured by the true label.
sns.lmplot(data=df_test, x='52', y='53', hue='Type', fit_reg=False, legend=False)
# NOTE(review): the legend entries are written by hand — confirm they match the
# classes actually present in 'Type' and the order in which they are drawn.
plt.legend(labels=['normal', 'bearing noise', 'unknown noise'], title='target', loc='upper left')
plt.show()

## Auto-sklearn (測試集)預測結果

In [None]:
# Scatter the test rows on columns '52' and '53', coloured by the predicted label.
sns.lmplot(data=df_test, x='52', y='53', hue='Predict', fit_reg=False, legend=False)
# NOTE(review): the legend entries are written by hand — confirm they match the
# classes actually present in 'Predict' and the order in which they are drawn.
plt.legend(labels=['normal', 'bearing noise', 'unknown noise'], title='target', loc='upper left')
plt.show()

## 查看每個模型的權重
我們可以使用模型提供的方法查看最終訓練結果，並查看 k 個 Ensemble 模型的訓練結果以及每個模型的權重。

In [None]:
# List only the models that are part of the final ensemble, with the detailed
# columns included.
automlclassifierV2.leaderboard(ensemble_only=True, detailed=True)

## 輸出模型
如果想將 AutoML 的模型儲存起來，可以透過 `joblib` 將模型打包匯出。

In [None]:
from joblib import dump, load

In [None]:
# Export the fitted model.
# The file is named after the 'lr_right' channel selected in the option-2 split
# cell (the last cell to assign X_train on a top-to-bottom run), which is also
# the file name the import cell loads.
dump(automlclassifierV2, 'model_right.joblib')

In [None]:
# Import the model from disk.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
clf = load(filename='model_right.joblib')

In [None]:
# Check that the reloaded model can predict on the test features.
clf.predict(X=X_test)

## 視覺化 AutoML 模型

In [None]:
#pip install pipelineprofiler

In [None]:
import PipelineProfiler

# Convert the fitted auto-sklearn search into PipelineProfiler's input format,
# then render its pipeline matrix.
profiler_data = PipelineProfiler.import_autosklearn(automlclassifierV2)
PipelineProfiler.plot_pipeline_matrix(profiler_data)