>This notebook is modified from https://github.com/andy6804tw/crazyai-ml

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import ml

## 1) 載入資料集
### 1.1) loading PSD spectrum from excel or parquet files
PSD spectrum is calculated from accelerometer data recordings.
Each sample is recorded twice, for 10 seconds each time, with 3 accelerometers attached at different positions:
* 1st run: left, right, and axial (channel labels are: lr_left, lr_right, and lr_axial).
* 2nd run: up, down, and axial; the axial position is the same as in the 1st run (channel labels are: ud_up, ud_down, and ud_axial).

The samples are run at 20 and 100 duty, and each is analyzed at both normal resolution (1 spike per order) and high resolution (10 spikes per order).
Therefore there are 4 different types of PSD spectrum.

|  duty | resolution | order number* | feature labels |
|-------|------------|---------------|----------------|
| 20    | normal     | 512  | 0, 1, 2, ... 512 |
| 20    | high       | 5120 | 0.0, 0.1, 0.2, ... 512.0 |
| 100   | normal     | 128  | 0, 1, 2, ... 128 |
| 100   | high       | 1280 | 0.0, 0.1, 0.2, ... 128.0 |

* order number: according to data points per round. The features may not all be used for training to improve the model performance.

When loading data, the sample_num is added as the last column; this sample_num is used for labeling the target as normal or abnormal.
#### * option1: psd spectrum (averaged over windows)
>> The averaged PSD yields only one spectrum per sample, so the number of samples is not enough.

In [None]:
import numpy as np  # required by np.array below; the notebook's import cell does not provide it
import signal_processing

# Load the window-averaged PSD sheets and transpose the combined table, so that the
# index labels (parsed below) identify each spectrum.
df = signal_processing.read_sheets('../../test_data//psd_100%//psd_100%.xlsx', usecols=[0,1,2,3], combine=True).transpose()
# add columns to describe the sensor channel and the sample_num
# presumably each index label looks like '<6-char sample_num>_<channel>' — TODO confirm
df['channel'] = [name[7:] for name in df.index]
df['sample_num'] = [name[:6] for name in df.index]
# Keep only the 'lr_left' channel; sample(frac=1) shuffles the rows before the index is rebuilt.
X = df.loc[df['channel'] == 'lr_left'].sample(frac = 1).reset_index(drop=True)
# One class label per row, derived from its sample_num by the project helper.
y = np.array([signal_processing.class_label(sample_num) for sample_num in X['sample_num']])

#### * option2: psd spectrum (unaveraged)
Each round creates a spectrum sample to enlarge the data set.

In [None]:
# Sensor channel to load; it is also reused later when naming the exported model.
keyword = 'lr_left'
# A positional argument may not follow keyword arguments in a call (SyntaxError),
# so the channel filter is passed by keyword as well.
# NOTE(review): assumes the parameter of ml.load_data is named `keyword` — confirm.
df = ml.load_data(format='parquet', dir='../../test_data//psd_100%//psd_window_100%//', keyword=keyword)
print(df.columns)

read psd_window_000039_lr_left.parquet.gzip
read psd_window_000027_lr_left.parquet.gzip
read psd_window_000051_lr_left.parquet.gzip
read psd_window_000050_lr_left.parquet.gzip
read psd_window_000030_lr_left.parquet.gzip
read psd_window_004073_lr_left.parquet.gzip
read psd_window_004802_lr_left.parquet.gzip
read psd_window_000037_lr_left.parquet.gzip
read psd_window_000045_lr_left.parquet.gzip
read psd_window_004124_lr_left.parquet.gzip
read psd_window_003861_lr_left.parquet.gzip
read psd_window_003720_lr_left.parquet.gzip
read psd_window_000785_lr_left.parquet.gzip
read psd_window_000048_lr_left.parquet.gzip
read psd_window_000053_lr_left.parquet.gzip
read psd_window_000052_lr_left.parquet.gzip
read psd_window_000022_lr_left.parquet.gzip
read psd_window_003735_lr_left.parquet.gzip
read psd_window_004072_lr_left.parquet.gzip
read psd_window_001833_lr_left.parquet.gzip
read psd_window_002577_lr_left.parquet.gzip
Index([         0.0,          1.0,          2.0,          3.0,          4.0,

### 1.2) preprocessing
#### 1.2.1. Add additional information to calculate the stats like mean and std of specified order range
this is experimental for improving test accuracy
#### 1.2.2. Drop high order features
according to [include enough data](https://cloud.google.com/vertex-ai/docs/tabular-data/tabular101#include-enough-data), for Classification problem: 50 rows x the number features
#### 1.2.3. Convert feature label type from numerical to string
`autosklearn` takes `int` or `string` for feature label type, and the labels of model training data need to be identical with testing data 

In [3]:
# Setting forwarded to the preprocessing helper (presumably the order cutoff for the
# kept features — confirm in ml.preprocess_features); also reused when naming the exported model.
col = 80
# Run the project preprocessing on the loaded spectra to obtain the feature table X.
X = ml.preprocess_features(df, col=col)

calculate between 30 and 80


## 2) 切割訓練集與測試集
### 2.1) option 1: use all sample data for train/test (not recommended, causing data leakage)

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the first 513 columns (dropping the unused ones) and renumber the rows from 0.
X = X.iloc[:, :513].reset_index(drop=True)
print(X)
print(y.shape)

# Stratified 70/30 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print('train shape:', X_train.shape)
print('test shape:', X_test.shape)

### 2.2) option 2: use partial sample data for train and another partial sample for test

#### 2.2.1) Simplify 3 categories from (0,1,2) to (0,1) for better result

#### 2.2.2) Separate train/test samples to avoid data leakage
* option 1: select a proportion of sample_num to be test samples, so that the train and test sets contain different part numbers

In [None]:
# Row counts per sample_num; the index of the result holds the distinct sample numbers.
all_sample = df.value_counts(subset='sample_num')
test_size = 0.3
# Number of distinct samples to hold out, rounded down.
n_test_samples = int(all_sample.shape[0] * test_size)
# Randomly draw that many sample numbers (no seed is set, so the draw differs per run).
test_sample = all_sample.sample(n=n_test_samples)
print('test sample number:', test_sample.index.to_list())
print('test_sample type:', list(map(ml.label_transfer, test_sample.index)))

* option 2: specify 'test_sample' as control variables

In [5]:
# Index of the predefined test-sample set to hold out; reused when naming the exported model.
test_set_no = 1
# Look up the predefined list of test samples for that set in the project module.
selected_test_samples = ml.test_sample['set%i' % test_set_no]
# Split by sample so that the held-out samples never appear in the training data.
X_train, X_test, y_train, y_test = ml.train_test_split(
    df=X,
    test_samples=selected_test_samples,
)
print(X_train.columns)

train shape: (43301, 82)
test shape: (17364, 82)
Index(['0.0', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0',
       '10.0', '11.0', '12.0', '13.0', '14.0', '15.0', '16.0', '17.0', '18.0',
       '19.0', '20.0', '21.0', '22.0', '23.0', '24.0', '25.0', '26.0', '27.0',
       '28.0', '29.0', '30.0', '31.0', '32.0', '33.0', '34.0', '35.0', '36.0',
       '37.0', '38.0', '39.0', '40.0', '41.0', '42.0', '43.0', '44.0', '45.0',
       '46.0', '47.0', '48.0', '49.0', '50.0', '51.0', '52.0', '53.0', '54.0',
       '55.0', '56.0', '57.0', '58.0', '59.0', '60.0', '61.0', '62.0', '63.0',
       '64.0', '65.0', '66.0', '67.0', '68.0', '69.0', '70.0', '71.0', '72.0',
       '73.0', '74.0', '75.0', '76.0', '77.0', '78.0', '79.0',
       'mean_energy_30_80', 'std_energy_30_80'],
      dtype='object')


## Auto-sklearn
首先我們來測試第一版的 Auto-sklearn。以下是模型常用的超參數以及方法，詳細內容可以參考官方 API [文件](https://automl.github.io/auto-sklearn/master/api.html)。

Parameters:
- time_left_for_this_task: 搜尋時間(秒)，預設3600秒(60分鐘)。
- per_run_time_limit: 每個模型訓練的上限時間，預設為time_left_for_this_task的1/10。
- ensemble_size: 模型輸出數量，預設50。
- resampling_strategy: 資料採樣方式。為了避免過擬合，可以採用交叉驗證機制。預設方法為最基本的 holdout。

Attributes:
- cv_results_: 查詢模型搜尋結果以及每個最佳模型的超參數。

Methods:
- fit: 放入X、y進行模型擬合。
- refit: 使用 fit 尋找好的參數後，再使用所有的資料進行最後微調。
- predict: 預測並回傳預測類別。
- score: 預測成功的比例。
- predict_proba: 預測每個類別的機率值。
- leaderboard: 顯示 k 個 ensemble 模型並排名。

In [None]:
# Train the Auto-sklearn (v1) classifier through the project helper. The test split is
# passed in too (presumably for evaluation inside the helper — confirm in ml).
automlclassifierV1 = ml.train_autosklearn_v1_model(X_train, X_test, y_train, y_test)

In [None]:
# Inspect the search results of the v1 classifier, best mean test score first
import pandas as pd

cv_results_table = pd.DataFrame(automlclassifierV1.cv_results_)
df_cv_results = cv_results_table.sort_values(by='mean_test_score', ascending=False)
# Bare expression so the notebook renders the table as the cell output.
df_cv_results

In [None]:
# Ensemble result of the v1 classifier: detailed leaderboard restricted to the
# models that are part of the final ensemble.
automlclassifierV1.leaderboard(detailed = True, ensemble_only=True)

### 使用 Auto-sklearn 2.0

In [None]:
# Train the Auto-sklearn 2.0 classifier through the project helper, using the same
# train/test split as the v1 run.
automlclassifierV2 = ml.train_autosklearn_v2_model(X_train, X_test, y_train, y_test)

## Performance metrics
### Performance-over-time plot
Use the plot to check whether the time limit is sufficient (adapted from this [example](https://automl.github.io/auto-sklearn/master/examples/40_advanced/example_pandas_train_test.html#sphx-glr-examples-40-advanced-example-pandas-train-test-py)).

In [None]:
# Performance history recorded by the v1 classifier during the search.
poT = automlclassifierV1.performance_over_time_
# Line plot of the recorded scores against the 'Timestamp' column.
plot_options = dict(
    x='Timestamp',
    kind='line',
    legend=True,
    title='Auto-sklearn accuracy over time',
    grid=True,
)
poT.plot(**plot_options)
plt.show()

### Confusion Matrix
>`ConfusionMatrixDisplay.from_predictions()` is not supported in this version of scikit-learn 

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predictions of the Auto-sklearn 2.0 classifier on the held-out test set.
y_pred = automlclassifierV2.predict(X_test)

# Two panels side by side: raw counts, and counts normalized over the true labels.
fig, axes = plt.subplots(1, 2, layout='constrained', figsize=(12, 5), sharey='row')
titles_options = [('Confusion Matrix', None), ('Normalized Confusion Matrix', 'true')]

for ax, (title, normalize) in zip(axes, titles_options):
    # labels=[1, 0] fixes the row/column order so it lines up with display_labels.
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0], normalize=normalize)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['abnormal', 'normal'])
    disp.plot(cmap=plt.cm.Blues, ax=ax)
    ax.set_title(title, fontsize=14)

### Classification Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the v1 classifier on the held-out test set.
y_pred = automlclassifierV1.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

## 真實分類

In [None]:
# Build a DataFrame for the test set, with every column label converted to a string
columns = [str(label) for label in X_test.columns]
df_test = pd.DataFrame(X_test.to_numpy(), columns=columns, index=X_test.index)

# Attach the true class and the Auto-sklearn 2.0 prediction next to the features.
pred = automlclassifierV2.predict(X_test)
df_test['Type'] = y_test
df_test['Predict'] = pred
#df_test.to_excel('prediction_window.xlsx') # for further analysis

In [None]:
# Scatter the test samples on the features for orders 52 and 53, coloured by true class.
# The column labels are strings, and order 52 may be spelled '52' or '52.0' depending on
# how the features were loaded, so use whichever spelling is present. If neither exists,
# fall back to the plain spelling so seaborn reports the missing column.
x_col, y_col = [
    next(
        (label for label in (str(order), str(float(order))) if label in df_test.columns),
        str(order),
    )
    for order in (52, 53)
]
sns.lmplot(x=x_col, y=y_col, hue='Type', data=df_test, fit_reg=False, legend=False)
# NOTE(review): the legend lists three classes — confirm it matches the values in 'Type'.
plt.legend(title='target', loc='upper left', labels=['normal', 'bearing noise', 'unknown noise'])
plt.show()

## Auto-sklearn (測試集)預測結果

In [None]:
# Scatter the test samples on the features for orders 52 and 53, coloured by predicted class.
# The column labels are strings, and order 52 may be spelled '52' or '52.0' depending on
# how the features were loaded, so use whichever spelling is present. If neither exists,
# fall back to the plain spelling so seaborn reports the missing column.
x_col, y_col = [
    next(
        (label for label in (str(order), str(float(order))) if label in df_test.columns),
        str(order),
    )
    for order in (52, 53)
]
sns.lmplot(x=x_col, y=y_col, data=df_test, hue="Predict", fit_reg=False, legend=False)
# NOTE(review): the legend lists three classes — confirm it matches the values in 'Predict'.
plt.legend(title='target', loc='upper left', labels=['normal', 'bearing noise', 'unknown noise'])
plt.show()

## 查看每個模型的權重
我們可以使用模型提供的方法查看最終訓練結果，並查看 k 個 Ensemble 模型的訓練結果以及每個模型的權重。

In [None]:
# Detailed leaderboard of the Auto-sklearn 2.0 run, restricted to the models that
# are part of the final ensemble.
automlclassifierV2.leaderboard(detailed = True, ensemble_only=True)

## 輸出模型
如果想將 AutoML 的模型儲存起來，可以透過 `joblib` 將模型打包匯出。

In [None]:
# Export the models
# The base name combines the channel keyword, the test-set number and the `col` setting.
name = '../../model//%s_set%i_%i'%(keyword, test_set_no, col)
# Hand both classifiers and the shared base name to the project helper, which writes them out.
ml.save_model(automlclassifierV1, automlclassifierV2, name)

In [None]:
# 匯入模型
from joblib import load
clf = load('../../model//100duty_stats//%s_v1.joblib'%name)

In [None]:
# detail of the model
best_model_info = clf.show_models()
print(best_model_info)

In [None]:
# Prediction smoke test: run the reloaded model on the test features.
# The bare expression lets the notebook display the predicted classes.
clf.predict(X_test)

## 視覺化 AutoML 模型

In [None]:
#pip install pipelineprofiler

In [None]:
import PipelineProfiler

# Convert the fitted Auto-sklearn 2.0 run into PipelineProfiler's format,
# then render the interactive pipeline matrix in the notebook.
profiler_data= PipelineProfiler.import_autosklearn(automlclassifierV2)
PipelineProfiler.plot_pipeline_matrix(profiler_data)