In [11]:
import os
from os.path import join
# From https://stackoverflow.com/questions/51424312/how-to-save-gridsearchcv-object
import joblib
import pandas as pd

from sklearn.metrics import \
    balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline

# pre-processing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import \
    LearningCurveDisplay, validation_curve, learning_curve, train_test_split, ShuffleSplit,\
          GridSearchCV, cross_validate

# Initials

## Raw Data Load

In [16]:
red_wine = os.path.join('data','wine', 'winequality-red.csv')
white_wine = os.path.join('data','wine', 'winequality-white.csv')
turbine = os.path.join('data','turbine','gt_2011.csv')
mushrooms = os.path.join('data','mushroom','secondary_data.csv')

# encoders to use
scale = StandardScaler()
ohe = OneHotEncoder(sparse_output=False)

In [17]:
transformer = make_column_transformer(
    (
        ohe, 
        [
        'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment','gill-spacing', 'gill-color', 
       'stem-root', 'stem-surface', 'stem-color','veil-type', 'veil-color',
        'has-ring', 'ring-type', 'spore-print-color','habitat', 'season'
        ]
        ),
    remainder='passthrough'
    )

# pre-process shroom data

shroom_df = pd.read_csv(mushrooms,sep=';').sample(frac=1).reset_index(drop=True)
x = shroom_df.iloc[:,1:].copy()
x_shroom = pd.DataFrame(transformer.fit_transform(x), 
                columns=transformer.get_feature_names_out())
y = shroom_df.iloc[:,0].copy()
y_shroom = (y == 'p')


# reduce the number of training examples
x_shroom = x_shroom[:7000]
y_shroom =  y_shroom[:7000]

# Scale numerical attributes to be b/w 1 and -1
lst_of_num_cols = [
    'remainder__cap-diameter', 'remainder__stem-height', 'remainder__stem-width'
    ]
x_shroom[lst_of_num_cols] = scale.fit_transform(x_shroom[lst_of_num_cols])

In [18]:
white_df = pd.read_csv(white_wine, sep=';')
red_df = pd.read_csv(red_wine, sep=';')
white_df['type'] = 0
red_df['type'] = 1
wine_df = pd.concat([white_df,red_df])
wine_df = wine_df.sample(frac=1).reset_index(drop=True)

# set x and y values
# remove 'quality' and 'type' column from x array
x_wine = wine_df.iloc[:,:-2].copy()
# scale x vals
x_wine.values[:,:] = scale.fit_transform(x_wine)
# set y array equal to 'type' column 
y_wine = wine_df.iloc[:,-1].copy()

In [19]:
DATA_FOLDER = join('algo_data') 

# Helper Functions

In [65]:
def generate_learning_curve(model, x_data, y_data):
    
      train_sizes, train_scores, test_scores, fit_times, score_times =\
            learning_curve(model, x_data, y_data,cv=ShuffleSplit(), return_times=True, shuffle=True, scoring='f1_weighted')

      train_mean = train_scores.mean(1)
      train_sd = train_scores.std(1)
      test_mean = test_scores.mean(1)
      test_sd = test_scores.std(1)
      fit_mean = fit_times.mean(1)
      fit_sd = fit_times.std(1)
      score_mean = score_times.mean(1)
      score_sd = score_times.std(1)

      train_mean = pd.DataFrame({'train_mean':train_mean})
      train_sd = pd.DataFrame({'train_sd':train_sd})
      test_mean = pd.DataFrame({'test_mean':test_mean})
      test_sd = pd.DataFrame({'test_sd':test_sd})
      fit_mean = pd.DataFrame({'fit_mean':fit_mean})
      fit_sd = pd.DataFrame({'fit_sd':fit_sd})
      score_mean = pd.DataFrame({'score_mean':score_mean})
      score_sd = pd.DataFrame({'score_sd':score_sd})
      train_sizes = pd.DataFrame({'train_sizes': train_sizes})

      return train_sizes, train_mean, train_sd, test_mean, test_sd, fit_mean, fit_sd, score_mean, score_sd

## Clustering Results

In [25]:
## read wine results
df_km_wine = pd.read_csv(join(DATA_FOLDER,'df_km_wine.csv'))
s_km_cluster_wine= pd.read_csv(join(DATA_FOLDER,'s_km_cluster_wine.csv'))
x_km_wine= pd.read_csv(join(DATA_FOLDER,'x_km_wine.csv'))

df_em_wine= pd.read_csv(join(DATA_FOLDER,'df_em_wine.csv'))
df_em_score_wine= pd.read_csv(join(DATA_FOLDER,'df_em_score_wine.csv'))
s_em_cluster_wine= pd.read_csv(join(DATA_FOLDER,'s_em_cluster_wine.csv'))
x_em_wine= pd.read_csv(join(DATA_FOLDER,'x_em_wine.csv'))

In [26]:
## read shroom results
df_km_shroom = pd.read_csv(join(DATA_FOLDER,'df_km_shroom.csv'))
s_km_cluster_shroom= pd.read_csv(join(DATA_FOLDER,'s_km_cluster_shroom.csv'))
x_km_shroom= pd.read_csv(join(DATA_FOLDER,'x_km_shroom.csv'))

df_em_shroom= pd.read_csv(join(DATA_FOLDER,'df_em_shroom.csv'))
df_em_score_shroom= pd.read_csv(join(DATA_FOLDER,'df_em_score_shroom.csv'))
s_em_cluster_shroom= pd.read_csv(join(DATA_FOLDER,'s_em_cluster_shroom.csv'))
x_em_shroom= pd.read_csv(join(DATA_FOLDER,'x_em_shroom.csv'))

## Dimension Reduced Dataset

In [23]:
# dimension reduced wine dataset
df_pca_wine= pd.read_csv(join(DATA_FOLDER,'df_pca_wine.csv'), index_col=0)
s_pca_cluster_wine= pd.read_csv(join(DATA_FOLDER,'s_pca_cluster_wine.csv'), index_col=0)
x_pca_wine= pd.read_csv(join(DATA_FOLDER,'x_pca_wine.csv'), index_col=0)

df_rca_wine= pd.read_csv(join(DATA_FOLDER,'df_rca_wine.csv'), index_col=0)
s_rca_cluster_wine= pd.read_csv(join(DATA_FOLDER,'s_rca_cluster_wine.csv'), index_col=0)
x_rca_wine= pd.read_csv(join(DATA_FOLDER,'x_rca_wine.csv'), index_col=0)

df_man_wine= pd.read_csv(join(DATA_FOLDER,'df_man_wine.csv'), index_col=0)
s_man_cluster_wine= pd.read_csv(join(DATA_FOLDER,'s_man_cluster_wine.csv'), index_col=0)
x_man_wine= pd.read_csv(join(DATA_FOLDER,'x_man_wine.csv'), index_col=0)

df_ica_wine= pd.read_csv(join(DATA_FOLDER,'df_ica_wine.csv'))
s_ica_cluster_wine= pd.read_csv(join(DATA_FOLDER,'s_ica_cluster_wine.csv'))
x_ica_wine= pd.read_csv(join(DATA_FOLDER,'x_ica_wine.csv'))

In [24]:
# dimension reduced shroom dataset
df_pca_shroom= pd.read_csv(join(DATA_FOLDER,'df_pca_shroom.csv'), index_col=0)
s_pca_cluster_shroom= pd.read_csv(join(DATA_FOLDER,'s_pca_cluster_shroom.csv'), index_col=0)
x_pca_shroom= pd.read_csv(join(DATA_FOLDER,'x_pca_shroom.csv'), index_col=0)

df_rca_shroom= pd.read_csv(join(DATA_FOLDER,'df_rca_shroom.csv'), index_col=0)
s_rca_cluster_shroom= pd.read_csv(join(DATA_FOLDER,'s_rca_cluster_shroom.csv'), index_col=0)
x_rca_shroom= pd.read_csv(join(DATA_FOLDER,'x_rca_shroom.csv'), index_col=0)

df_man_shroom= pd.read_csv(join(DATA_FOLDER,'df_man_shroom.csv'), index_col=0)
s_man_cluster_shroom= pd.read_csv(join(DATA_FOLDER,'s_man_cluster_shroom.csv'), index_col=0)
x_man_shroom= pd.read_csv(join(DATA_FOLDER,'x_man_shroom.csv'), index_col=0)

df_ica_shroom= pd.read_csv(join(DATA_FOLDER,'df_ica_shroom.csv'))
s_ica_cluster_shroom= pd.read_csv(join(DATA_FOLDER,'s_ica_cluster_shroom.csv'))
x_ica_shroom= pd.read_csv(join(DATA_FOLDER,'x_ica_shroom.csv'))

# Neural nets

In [36]:
# save wine grids
grid_km_wine=joblib.load('grid_km_wine.pkl')
grid_em_wine=joblib.load('grid_em_wine.pkl')
grid_pca_wine=joblib.load('grid_pca_wine.pkl')
grid_rca_wine=joblib.load('grid_rca_wine.pkl')
grid_man_wine=joblib.load('grid_man_wine.pkl')
grid_ica_wine=joblib.load('grid_ica_wine.pkl')

# save shroom grids
grid_km_shroom=joblib.load('grid_km_shroom.pkl')
grid_em_shroom=joblib.load('grid_em_shroom.pkl')
grid_pca_shroom=joblib.load('grid_pca_shroom.pkl')
grid_rca_shroom=joblib.load('grid_rca_shroom.pkl')
grid_man_shroom=joblib.load('grid_man_shroom.pkl')
grid_ica_shroom=joblib.load('grid_ica_shroom.pkl')

In [48]:
train_sizes,train_mean_km_wine, train_sd_km_wine, test_mean_km_wine, test_sd_km_wine,\
      fit_mean_km_wine, fit_sd_km_wine, score_mean_km_wine, score_sd_km_wine =\
      generate_learning_curve(grid_km_wine.best_estimator_,x_km_wine, y_wine)


_,train_mean_em_wine, train_sd_em_wine, test_mean_em_wine, test_sd_em_wine,\
      fit_mean_em_wine, fit_sd_em_wine, score_mean_em_wine, score_sd_em_wine =\
      generate_learning_curve(grid_em_wine.best_estimator_,x_em_wine.iloc[:, :-1], x_em_wine.iloc[:,-1])

_,train_mean_pca_wine, train_sd_pca_wine, test_mean_pca_wine, test_sd_pca_wine,\
      fit_mean_pca_wine, fit_sd_pca_wine, score_mean_pca_wine, score_sd_pca_wine =\
      generate_learning_curve(grid_pca_wine.best_estimator_,x_pca_wine, y_wine)


_,train_mean_rca_wine, train_sd_rca_wine, test_mean_rca_wine, test_sd_rca_wine,\
      fit_mean_rca_wine, fit_sd_rca_wine, score_mean_rca_wine, score_sd_rca_wine =\
      generate_learning_curve(grid_rca_wine.best_estimator_,x_rca_wine, y_wine)

_,train_mean_ica_wine, train_sd_ica_wine, test_mean_ica_wine, test_sd_ica_wine,\
      fit_mean_ica_wine, fit_sd_ica_wine, score_mean_ica_wine, score_sd_ica_wine =\
      generate_learning_curve(grid_ica_wine.best_estimator_,x_ica_wine, y_wine)


_,train_mean_man_wine, train_sd_man_wine, test_mean_man_wine, test_sd_man_wine,\
      fit_mean_man_wine, fit_sd_man_wine, score_mean_man_wine, score_sd_man_wine =\
      generate_learning_curve(grid_man_wine.best_estimator_,x_man_wine, y_wine)



In [49]:
train_sizes_shroom,train_mean_km_shroom, train_sd_km_shroom, test_mean_km_shroom, test_sd_km_shroom,\
      fit_mean_km_shroom, fit_sd_km_shroom, score_mean_km_shroom, score_sd_km_shroom =\
      generate_learning_curve(grid_km_shroom.best_estimator_,x_km_shroom, y_shroom)

_,train_mean_em_shroom, train_sd_em_shroom, test_mean_em_shroom, test_sd_em_shroom,\
      fit_mean_em_shroom, fit_sd_em_shroom, score_mean_em_shroom, score_sd_em_shroom =\
      generate_learning_curve(grid_em_shroom.best_estimator_,x_em_shroom.iloc[:, :-1], x_em_shroom.iloc[:,-1])

_,train_mean_pca_shroom, train_sd_pca_shroom, test_mean_pca_shroom, test_sd_pca_shroom,\
      fit_mean_pca_shroom, fit_sd_pca_shroom, score_mean_pca_shroom, score_sd_pca_shroom =\
      generate_learning_curve(grid_pca_shroom.best_estimator_,x_pca_shroom, y_shroom)

_,train_mean_rca_shroom, train_sd_rca_shroom, test_mean_rca_shroom, test_sd_rca_shroom,\
      fit_mean_rca_shroom, fit_sd_rca_shroom, score_mean_rca_shroom, score_sd_rca_shroom =\
      generate_learning_curve(grid_rca_shroom.best_estimator_,x_rca_shroom, y_shroom)

_,train_mean_ica_shroom, train_sd_ica_shroom, test_mean_ica_shroom, test_sd_ica_shroom,\
      fit_mean_ica_shroom, fit_sd_ica_shroom, score_mean_ica_shroom, score_sd_ica_shroom =\
      generate_learning_curve(grid_ica_shroom.best_estimator_,x_ica_shroom, y_shroom)

_,train_mean_man_shroom, train_sd_man_shroom, test_mean_man_shroom, test_sd_man_shroom,\
      fit_mean_man_shroom, fit_sd_man_shroom, score_mean_man_shroom, score_sd_man_shroom =\
      generate_learning_curve(grid_man_shroom.best_estimator_,x_man_shroom, y_shroom)




## Save Learning Curve Results

In [53]:
neural_folder = join('neural_net_curves')

In [58]:
train_sizes.to_csv(join(neural_folder,'train_sizes_wine.csv'))
train_mean_km_wine.to_csv(join(neural_folder,'train_mean_km_wine.csv'))
train_sd_km_wine.to_csv(join(neural_folder,'train_sd_km_wine.csv'))
test_mean_km_wine.to_csv(join(neural_folder,'test_mean_km_wine.csv'))
test_sd_km_wine.to_csv(join(neural_folder,'test_sd_km_wine.csv'))
fit_mean_km_wine.to_csv(join(neural_folder,'fit_mean_km_wine.csv'))
fit_sd_km_wine.to_csv(join(neural_folder,'fit_sd_km_wine.csv'))
score_mean_km_wine.to_csv(join(neural_folder,'score_mean_km_wine.csv'))
score_sd_km_wine.to_csv(join(neural_folder,'score_sd_km_wine.csv'))


train_mean_em_wine.to_csv(join(neural_folder,'train_mean_em_wine.csv'))
train_sd_em_wine.to_csv(join(neural_folder,'train_sd_em_wine.csv'))
test_mean_em_wine.to_csv(join(neural_folder,'test_mean_em_wine.csv'))
test_sd_em_wine.to_csv(join(neural_folder,'test_sd_em_wine.csv'))
fit_mean_em_wine.to_csv(join(neural_folder,'fit_mean_em_wine.csv'))
fit_sd_em_wine.to_csv(join(neural_folder,'fit_sd_em_wine.csv'))
score_mean_em_wine.to_csv(join(neural_folder,'score_mean_em_wine.csv'))
score_sd_em_wine.to_csv(join(neural_folder,'score_sd_em_wine.csv'))

train_mean_pca_wine.to_csv(join(neural_folder,'train_mean_pca_wine.csv'))
train_sd_pca_wine.to_csv(join(neural_folder,'train_sd_pca_wine.csv'))
test_mean_pca_wine.to_csv(join(neural_folder,'test_mean_pca_wine.csv'))
test_sd_pca_wine.to_csv(join(neural_folder,'test_sd_pca_wine.csv'))
fit_mean_pca_wine.to_csv(join(neural_folder,'fit_mean_pca_wine.csv'))
fit_sd_pca_wine.to_csv(join(neural_folder,'fit_sd_pca_wine.csv'))
score_mean_pca_wine.to_csv(join(neural_folder,'score_mean_pca_wine.csv'))
score_sd_pca_wine.to_csv(join(neural_folder,'score_sd_pca_wine.csv'))

train_mean_rca_wine.to_csv(join(neural_folder,'train_mean_rca_wine.csv'))
train_sd_rca_wine.to_csv(join(neural_folder,'train_sd_rca_wine.csv'))
test_mean_rca_wine.to_csv(join(neural_folder,'test_mean_rca_wine.csv'))
test_sd_rca_wine.to_csv(join(neural_folder,'test_sd_rca_wine.csv'))
fit_mean_rca_wine.to_csv(join(neural_folder,'fit_mean_rca_wine.csv'))
fit_sd_rca_wine.to_csv(join(neural_folder,'fit_sd_rca_wine.csv'))
score_mean_rca_wine.to_csv(join(neural_folder,'score_mean_rca_wine.csv'))
score_sd_rca_wine.to_csv(join(neural_folder,'score_sd_rca_wine.csv'))

train_mean_ica_wine.to_csv(join(neural_folder,'train_mean_ica_wine.csv'))
train_sd_ica_wine.to_csv(join(neural_folder,'train_sd_ica_wine.csv'))
test_mean_ica_wine.to_csv(join(neural_folder,'test_mean_ica_wine.csv'))
test_sd_ica_wine.to_csv(join(neural_folder,'test_sd_ica_wine.csv'))
fit_mean_ica_wine.to_csv(join(neural_folder,'fit_mean_ica_wine.csv'))
fit_sd_ica_wine.to_csv(join(neural_folder,'fit_sd_ica_wine.csv'))
score_mean_ica_wine.to_csv(join(neural_folder,'score_mean_ica_wine.csv'))
score_sd_ica_wine.to_csv(join(neural_folder,'score_sd_ica_wine.csv'))

train_mean_man_wine.to_csv(join(neural_folder,'train_mean_man_wine.csv')) 
train_sd_man_wine.to_csv(join(neural_folder,'train_sd_man_wine.csv'))
test_mean_man_wine.to_csv(join(neural_folder,'test_mean_man_wine.csv')) 
test_sd_man_wine.to_csv(join(neural_folder,'test_sd_man_wine.csv'))
fit_mean_man_wine.to_csv(join(neural_folder,'fit_mean_man_wine.csv'))
fit_sd_man_wine.to_csv(join(neural_folder,'fit_sd_man_wine.csv'))
score_mean_man_wine.to_csv(join(neural_folder,'score_mean_man_wine.csv'))
score_sd_man_wine.to_csv(join(neural_folder,'score_sd_man_wine.csv'))

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'