In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from transformers import DataLoader, CustomPreprocessor, FeatureTargetSplitter, FeatureDataTypeExtractor
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from models import baseline_model, custom_model, ensemble_model, autokeras_model
import tensorflow as tf
from performance import calculate_roc_auc

In [2]:
# you should specify necessary informations about data
SEED = 42
DEBUG_PATH = "udaq_tsfresh/Data/D1_turbidity_data_2022_08_16/processed/sivi/sivi_combined_tests_1sec_pickle"
DATA_PATH = "../../Data/D1_turbidity_data_2022_08_16/processed/sivi/sivi_combined_tests_1sec_pickle"
CUSTOM_PREPROCESS = True
TARGET = 'Dirt'

In [3]:
# load data and then custom preprocessing if necessary
preprocessor_pipeline = Pipeline([
    ('load_data', DataLoader(DATA_PATH)),
    ('custom_preprocess', CustomPreprocessor(CUSTOM_PREPROCESS)),
    ('feauture_target_split', FeatureTargetSplitter(TARGET)),
    ('feature_type_extract', FeatureDataTypeExtractor(TARGET))

])

In [4]:
X, y, FEATURES = preprocessor_pipeline.fit_transform(None) 

NUMERICAL = FEATURES[0]
CATEGORICAL = FEATURES[1]

y = LabelEncoder().fit_transform(y)


In [5]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Define numerical pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, CATEGORICAL),
    ('num', num_pipe, NUMERICAL)
])

In [6]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cvscores = []

for train, test in kfold.split(X, y):

    model = baseline_model() # in order to surpass tf warning message, move this line outside of the loop
 # create model
    pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
    ])


    x_train, x_test = pd.DataFrame(X.iloc[train]), pd.DataFrame(X.iloc[test])
    y_train, y_test = np.asarray(pd.DataFrame(y).iloc[train]), np.asarray(pd.DataFrame(y).iloc[test])


    pipe.fit(x_train, y_train)

    #  evaluate the model
    y_pred = pipe.predict(x_test)


    score = accuracy_score(y_test, y_pred.round())

    print('Test Accuracy: %.3f' % score ) # format float output

    cvscores.append(score * 100)
 
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

Test Accuracy: 0.625
Test Accuracy: 0.750
Test Accuracy: 0.500
Test Accuracy: 0.500
Test Accuracy: 0.571
58.93% (+/- 9.31%)


In [21]:
pipe2 = Pipeline([
('preprocessor', preprocessor)
])

pipe2.fit(X)
df = pd.DataFrame(pipe2.transform(X))

y

array([1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [22]:
x_tr, x_ts, y_tr, y_ts = train_test_split(df, y, test_size=0.3)

In [23]:
x_tr

Unnamed: 0,0,1,2,3,4,5,6
6,1.0,0.0,0.0,-0.907558,-1.076414,-1.40125,-1.795352
23,0.0,1.0,0.0,-1.035733,-1.20495,-1.138871,0.336146
18,0.0,0.0,1.0,-0.092728,-0.248233,-0.408956,0.648131
31,0.0,1.0,0.0,-0.251421,-0.039362,0.586129,0.540167
15,0.0,1.0,0.0,0.325369,0.132993,-0.083774,0.567158
1,0.0,1.0,0.0,1.924511,1.326332,1.049479,-2.090666
16,0.0,1.0,0.0,-0.133927,0.369616,0.728484,0.422677
2,0.0,1.0,0.0,0.201771,0.559498,0.43261,-1.760423
32,0.0,1.0,0.0,-0.603904,-0.037902,-0.47455,0.572715
33,1.0,0.0,0.0,-0.15529,-0.116776,-0.315448,0.64575


In [25]:
for i in range(10):
    x_tr, x_ts, y_tr, y_ts = train_test_split(df, y, test_size=0.3,random_state=42)
    m = baseline_model()
    model.fit(x_tr, y_tr, epochs=50, batch_size=1)
    test_loss, test_acc = model.evaluate(x_ts, y_ts)
    print('Test accuracy:', test_acc)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 1.0
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epo