In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from transformers import DataLoader, CustomPreprocessor, FeatureTargetSplitter, FeatureDataTypeExtractor
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from models import baseline_model, custom_model, ensemble_model, autokeras_model
import tensorflow as tf
from performance import calculate_roc_auc
import matplotlib.pyplot as plt


In [2]:
SEED = 42
DEBUG_PATH = ""
DATA_PATH = "fabric_type_estimation.csv"
CUSTOM_PREPROCESS = True
TARGET = 'Fabric_type'
LOAD_TYPE = "csv"


In [3]:
# load data and then custom preprocessing if necessary
preprocessor_pipeline = Pipeline([
    ('load_data', DataLoader(DATA_PATH, LOAD_TYPE)),
    ('custom_preprocess', CustomPreprocessor(CUSTOM_PREPROCESS)),
    ('feauture_target_split', FeatureTargetSplitter(TARGET)),
    ('feature_type_extract', FeatureDataTypeExtractor(TARGET))

])

In [4]:
data = pd.read_csv("fabric_type_estimation.csv")


In [5]:
X, y, FEATURES = preprocessor_pipeline.fit_transform(None) 

KeyError: 'Fabric_type'

In [None]:
NUMERICAL = FEATURES[0]
CATEGORICAL = FEATURES[1]

y = LabelEncoder().fit_transform(y)

In [None]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Define numerical pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, CATEGORICAL),
    ('num', num_pipe, NUMERICAL)
])

In [None]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

cvscores_train = []
cvscores_test = []


In [5]:
for train, test in kfold.split(X, y):

    #model = ensemble_model(number_of_models=5) # in order to surpass tf warning message, move this line outside of the loop
    model = baseline_model()
 # create model
    pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KerasClassifier(model, epochs=50, batch_size=5, verbose=1)),
    ])

    x_train, x_test = pd.DataFrame(X.iloc[train]), pd.DataFrame(X.iloc[test])
    y_train, y_test = np.asarray(pd.DataFrame(y).iloc[train]), np.asarray(pd.DataFrame(y).iloc[test])

    pipe.fit(x_train, y_train)

    #  evaluate the model
    y_train_pred = pipe.predict(x_train)
    y_test_pred = pipe.predict(x_test)


    score_train = accuracy_score(y_train, y_train_pred.round())
    score_test = accuracy_score(y_test, y_test_pred.round())

    print('Train Accuracy: %.3f' % score_train ) # format float output
    print('Test Accuracy: %.3f' % score_test ) # format float output

    cvscores_train.append(score_train * 100)
    cvscores_test.append(score_test * 100)

NameError: name 'kfold' is not defined

In [6]:
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores_train), np.std(cvscores_train)))
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores_test), np.std(cvscores_test)))


NameError: name 'cvscores_train' is not defined

In [7]:
plt_idxs = np.arange(len(cvscores_train))
plt.plot(plt_idxs, cvscores_train, '-o', label='Train')
plt.plot(plt_idxs, cvscores_test, '-o', label='Test')
plt.legend()
plt.show()

NameError: name 'cvscores_train' is not defined