## Module Imports

In [1]:
import os

# From https://stackoverflow.com/questions/51424312/how-to-save-gridsearchcv-object
import joblib
#save your model or results
# joblib.dump(gs, 'model_file_name.pkl')

#load your model for further usage
# joblib.load("model_file_name.pkl")

import pandas as pd
import numpy as np
import timeit

import matplotlib.pyplot as plt
import seaborn as sns

# Linear model for PCA
from sklearn.linear_model import LogisticRegression

# Expectation maximization
from sklearn.mixture import GaussianMixture

# Neural Network
from sklearn.neural_network import MLPClassifier

# Clustering Algo
from sklearn.cluster import KMeans

# Principal and independent component analysis
from sklearn.decomposition import FastICA, PCA

# Randomized projections
from sklearn.random_projection import GaussianRandomProjection,\
      SparseRandomProjection as RCA

# Manifolds
from sklearn.manifold import LocallyLinearEmbedding

# Metrics
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score as s_score,\
      calinski_harabasz_score as ch_score,\
        homogeneity_completeness_v_measure as hcv_score, adjusted_rand_score as ar_score,\
        pairwise_distances
from sklearn.metrics import \
    balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline

# pre-processing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import \
    LearningCurveDisplay, validation_curve, learning_curve, train_test_split, ShuffleSplit,\
          GridSearchCV, cross_validate

## Data Processing

In [2]:
# Locations of the raw CSV files, relative to the notebook's working directory.
mushrooms = os.path.join('data', 'mushroom', 'secondary_data.csv')
turbine = os.path.join('data', 'turbine', 'gt_2011.csv')
white_wine = os.path.join('data', 'wine', 'winequality-white.csv')
red_wine = os.path.join('data', 'wine', 'winequality-red.csv')

# Shared pre-processing objects reused by the cells below.
ohe = OneHotEncoder(sparse_output=False)
scale = StandardScaler()
s_split = ShuffleSplit()

## SHROOM DATA

In [3]:
# One-hot encode every categorical mushroom attribute. Columns that are not
# listed are passed through unchanged and show up in the output with a
# ``remainder__`` prefix.
transformer = make_column_transformer(
    (
        ohe,
        [
            'cap-shape', 'cap-surface', 'cap-color',
            'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
            'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color',
            'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season',
        ],
    ),
    remainder='passthrough',
)

# pre-process shroom data: load, shuffle the rows and rebuild a clean index
shroom_df = pd.read_csv(mushrooms, sep=';').sample(frac=1).reset_index(drop=True)

# every column except the first is a feature
x = shroom_df.iloc[:, 1:].copy()
x_shroom = pd.DataFrame(transformer.fit_transform(x),
                        columns=transformer.get_feature_names_out())

# the first column is the target; encode it as a boolean (True where it is 'p')
y = shroom_df.iloc[:, 0].copy()
y_shroom = (y == 'p')

# reduce the number of training examples.
# Explicit copies are taken so that the column assignment below writes into an
# independent frame instead of a slice of the full one (avoids pandas'
# chained-assignment / SettingWithCopy problems).
x_shroom = x_shroom.iloc[:7000].copy()
y_shroom = y_shroom.iloc[:7000].copy()

# Standardise the numerical attributes (zero mean, unit variance).
# Note: StandardScaler does not bound the values to a fixed range.
lst_of_num_cols = [
    'remainder__cap-diameter', 'remainder__stem-height', 'remainder__stem-width'
    ]
x_shroom[lst_of_num_cols] = scale.fit_transform(x_shroom[lst_of_num_cols])

## WINE DATA

In [4]:
# Load both wine files and tag each row with its origin:
# 'type' is 0 for white wine and 1 for red wine.
white_df = pd.read_csv(white_wine, sep=';')
red_df = pd.read_csv(red_wine, sep=';')
white_df['type'] = 0
red_df['type'] = 1

# Stack the two sets, shuffle the rows and rebuild a clean 0..n-1 index.
wine_df = pd.concat([white_df, red_df])
wine_df = wine_df.sample(frac=1).reset_index(drop=True)

# set x and y values
# remove the last two columns ('quality' and 'type') from the feature frame
x_wine_raw = wine_df.iloc[:, :-2]

# scale x vals.
# Build a new frame from the scaled array rather than writing through
# ``.values``: ``.values`` is only a writable view of the frame in some cases
# (single dtype block, no copy-on-write), so assigning into it can silently
# do nothing or raise.
x_wine = pd.DataFrame(
    scale.fit_transform(x_wine_raw),
    columns=x_wine_raw.columns,
    index=x_wine_raw.index,
)

# set y array equal to 'type' column
y_wine = wine_df.iloc[:, -1].copy()

## Shroom and Wine Train/Test Split

In [5]:
# Hold out 20% of each dataset as a test partition.
(x_shroom_train, x_shroom_test, y_shroom_train, y_shroom_test) = train_test_split(
    x_shroom, y_shroom, test_size=0.2
)

(x_wine_train, x_wine_test, y_wine_train, y_wine_test) = train_test_split(
    x_wine, y_wine, test_size=0.2
)


## Helper Functions

In [6]:
def generate_validation_curve(model, param_grid, x_data, y_data,
                          x_range, param_name, model_name, axs,
                          plot_train=True, plot_test=True):
    """Plot a cross-validated F1 validation curve for one hyper-parameter.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``sklearn.model_selection.validation_curve``.
    param_grid : any
        Kept for backward compatibility only and not used. It was previously
        forwarded as ``fit_params``, which are arguments of ``fit`` and not
        hyper-parameters.
    x_data, y_data : array-like
        Features and targets.
    x_range : sequence
        Values of ``param_name`` to evaluate; also used as the x axis.
    param_name : str
        Name of the estimator parameter that is varied.
    model_name : str
        Used in the plot title.
    axs : matplotlib Axes
        Axes to draw on.
    plot_train, plot_test : bool
        Whether to draw the training / cross-validation curve.

    Returns
    -------
    The ``Line2D`` of the training curve, or ``None`` when ``plot_train`` is
    false.
    """
    # 'f1' is the scorer name scikit-learn registers for the binary F1 score;
    # param_name / param_range are required by validation_curve.
    train_score, test_score = validation_curve(
        model, x_data, y_data,
        param_name=param_name, param_range=x_range, scoring='f1',
    )

    # rows are parameter values, columns are CV folds
    train_mean = train_score.mean(1)
    train_sd = train_score.std(1)
    test_mean = test_score.mean(1)
    test_sd = test_score.std(1)

    x = x_range
    line = None  # stays None when the training curve is not requested
    if plot_train:
        line, = axs.plot(x, train_mean, 'o-', label='Training score')
        axs.fill_between(x, train_mean + train_sd, train_mean - train_sd, alpha=0.3)
    if plot_test:
        axs.plot(x, test_mean, 'o-', label='Cross-validation score')
        axs.fill_between(x, test_mean + test_sd, test_mean - test_sd, alpha=0.3)

    axs.set_title(f'{model_name} validation curve', fontsize = 15)
    axs.set_xlabel(f'{param_name}', fontsize = 15)
    axs.set_ylabel(f'f1 score', fontsize = 15)
    # only build a legend when at least one labelled curve was drawn
    if plot_train or plot_test:
        axs.legend()
    axs.tick_params(axis='both', which='major', labelsize=15)
    axs.tick_params(axis='both', which='minor', labelsize=15)

    return line

In [7]:
def generate_learning_curve(model, param_grid, x_data, y_data,
                          x_range, param_name, model_name, axes,
                          plot_train=True, plot_test=True):
    """Plot training and cross-validation score against training-set size.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``sklearn.model_selection.learning_curve`` (default
        scoring and cross-validation settings are used).
    param_grid, x_range, param_name, model_name : any
        Accepted for signature compatibility with
        ``generate_validation_curve``; not used here.
    x_data, y_data : array-like
        Features and targets.
    axes : matplotlib Axes
        Axes to draw on.
    plot_train, plot_test : bool
        Whether to draw the training / cross-validation curve.

    Returns
    -------
    The list returned by ``axes.plot`` for the training curve, or ``None``
    when ``plot_train`` is false.
    """
    # Without return_times=True learning_curve returns exactly three arrays.
    train_sizes, train_scores, test_scores = learning_curve(model, x_data, y_data)

    line = None
    handles, labels = [], []

    # Plot n_samples vs training score (mean +/- one std over the CV folds)
    if plot_train:
        train_mean = train_scores.mean(1)
        train_sd = train_scores.std(1)
        line = axes.plot(train_sizes, train_mean, "o-")
        axes.fill_between(
            train_sizes,
            train_mean - train_sd,
            train_mean + train_sd,
            alpha=0.1,
        )
        handles.append(line[0])
        labels.append("Training Score")

    # Plot n_samples vs cross-validation score
    if plot_test:
        test_mean = test_scores.mean(1)
        test_sd = test_scores.std(1)
        test_line = axes.plot(train_sizes, test_mean, "o-")
        axes.fill_between(
            train_sizes,
            test_mean - test_sd,
            test_mean + test_sd,
            alpha=0.1,
        )
        handles.append(test_line[0])
        labels.append("Cross-validation Score")

    axes.set_xlabel("Training Examples", fontsize = 15)
    axes.set_ylabel("Score", fontsize = 15)
    axes.set_title(f"Learning Curve of {model.__class__.__name__}", fontsize = 15)
    if handles:
        axes.legend(handles, labels, loc='best')
    axes.tick_params(axis='both', which='major', labelsize=15)
    axes.tick_params(axis='both', which='minor', labelsize=15)
    return line

In [8]:
def plot_curve(x_data, y_data, axs=None, take_mean=False):
    """Draw ``y_data`` against ``x_data`` on ``axs``.

    Parameters
    ----------
    x_data : sequence
        X coordinates.
    y_data : array-like
        Y values. When ``take_mean`` is true it must support ``.mean(1)`` and
        ``.std(1)`` (e.g. a 2-D array with one row per x value).
    axs : matplotlib Axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.
    take_mean : bool
        When true, plot the row-wise mean with a +/- one standard deviation
        band instead of the raw values.

    Returns
    -------
    With ``take_mean`` the object returned by ``fill_between`` (the band);
    otherwise the list returned by ``axs.plot``.
    """
    # Identity check: '== None' would invoke the argument's own __eq__, which
    # is unreliable for array-like or custom objects.
    if axs is None:
        _, axs = plt.subplots()

    if take_mean:
        y_mean = y_data.mean(1)
        y_std = y_data.std(1)
        axs.plot(x_data, y_mean, 'o-')
        line = axs.fill_between(x_data, y_mean + y_std, y_mean - y_std, alpha=0.3)
    else:
        line = axs.plot(x_data, y_data)

    return line

In [20]:
def perform_kmeans(x_data, y_data, num_runs=5):
    """Sweep K-Means over 2..49 clusters and pick a cluster count.

    The data is split 80/20 once. For each of ``num_runs`` repetitions and
    each cluster count a model is fitted on the training part; fit time,
    silhouette and Calinski-Harabasz scores (training data, fitted labels) and
    V-measure / adjusted Rand index (test predictions vs. ``y_test``) are
    recorded.

    Returns
    -------
    df_results : DataFrame
        One row per cluster count with mean and std of every metric over runs.
    selected_n_clusters : Series
        ``num_clusters`` value(s) with the highest mean Calinski-Harabasz score.
    x_transformed : ndarray
        ``fit_transform`` of a fresh K-Means (first selected count) on all of
        ``x_data``.
    """
    import warnings
    from sklearn.exceptions import DataConversionWarning
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)

    num_clusters = np.arange(2,50)
    x_train, x_test, _, y_test = train_test_split(x_data, y_data, test_size=0.2)

    # One dict per metric: a column per run, a row per cluster count.
    train_time_dict, s_score_dict, ch_score_dict = {}, {}, {}
    v_score_dict, ar_score_dict = {}, {}

    for run in range(num_runs):
        fit_times, silhouettes, ch_indices = [], [], []
        v_measures, rand_indices = [], []

        for k in num_clusters:
            started = timeit.default_timer()
            k_model = KMeans(n_init=10, n_clusters=k).fit(x_train)
            fit_times.append(timeit.default_timer() - started)

            # external metrics: predicted test clusters vs. true labels
            test_labels = k_model.predict(x_test)
            _, _, v_measure = hcv_score(y_test, test_labels)
            v_measures.append(v_measure)
            rand_indices.append(ar_score(y_test, test_labels))

            # internal metrics: training data with the fitted labels
            ch_indices.append(ch_score(x_train, k_model.labels_))
            silhouettes.append(s_score(x_train, k_model.labels_))

        train_time_dict[f'train_iter_{run}'] = fit_times
        s_score_dict[f's_score_iter_{run}'] = silhouettes
        v_score_dict[f'v_score_iter_{run}'] = v_measures
        ch_score_dict[f'ch_score_iter_{run}'] = ch_indices
        ar_score_dict[f'ar_score_iter_{run}'] = rand_indices

    per_metric = (
        ('fit_time', pd.DataFrame(train_time_dict)),
        ('s_score', pd.DataFrame(s_score_dict)),
        ('v_score', pd.DataFrame(v_score_dict)),
        ('ch_score', pd.DataFrame(ch_score_dict)),
        ('ar_score', pd.DataFrame(ar_score_dict)),
    )

    # Aggregate across runs (axis 1) into mean_* / std_* columns.
    summary = {'num_clusters': num_clusters}
    for metric, runs in per_metric:
        summary[f'mean_{metric}'] = runs.mean(1)
        summary[f'std_{metric}'] = runs.std(1)
    df_results = pd.DataFrame(summary)

    # select the number of clusters based on ch score
    best_ch = df_results['mean_ch_score'].max()
    selected_n_clusters = df_results.loc[
        df_results['mean_ch_score'] == best_ch, 'num_clusters'
    ]
    final_model = KMeans(n_init=10, n_clusters=selected_n_clusters.iloc[0])
    x_transformed = final_model.fit_transform(x_data)

    return df_results, selected_n_clusters, x_transformed

In [21]:
def perform_em(x_data, y_data, num_runs=5, n_samples=7000):
    """Sweep Gaussian mixtures over 2..49 components and pick a count by BIC.

    Two things are computed:

    * A clustering-metric sweep. The data is split 80/20 once; for each of
      ``num_runs`` repetitions and each component count a mixture is fitted on
      the training part, and fit time, silhouette, Calinski-Harabasz,
      V-measure and adjusted Rand index are recorded on the test predictions.
    * A ``GridSearchCV`` over the same component counts (full covariance)
      scored with the negated BIC, run on all of ``x_data``.

    Parameters
    ----------
    x_data : array-like
        Feature matrix.
    y_data : array-like
        Labels, used only for the external clustering metrics.
    num_runs : int
        Repetitions of the metric sweep.
    n_samples : int
        Number of points drawn from the final fitted mixture (default 7000,
        the value that used to be hard-coded).

    Returns
    -------
    df : DataFrame
        Grid-search results, sorted by ascending BIC.
    df_scores : DataFrame
        Mean and std of every clustering metric per component count.
    selected_n_clusters : Series
        ``n_components`` value(s) with the minimum BIC.
    x_final : DataFrame
        ``n_samples`` points sampled from a mixture fitted on ``x_data`` with
        the first selected component count; the last column holds the
        component label of each sampled point.
    """
    import warnings
    from sklearn.exceptions import DataConversionWarning

    def gmm_bic_score(estimator, x_data):
        """Callable to pass to GridSearchCV that will use the BIC score."""
        # Make it negative since GridSearchCV expects a score to maximize
        return -estimator.bic(x_data)

    warnings.filterwarnings(action='ignore', category=DataConversionWarning)

    # the same candidate counts drive the metric sweep and the grid search
    num_clusters = np.arange(2,50)

    train_time_dict = {}
    s_score_dict = {}
    ch_score_dict = {}
    v_score_dict = {}
    ar_score_dict = {}
    x_train, x_test, _, y_test =\
      train_test_split(x_data, y_data, test_size=0.2)

    for i in range(num_runs):

        train_time_lst = []
        s_score_lst = []
        ch_score_lst = []
        v_score_lst = []
        ar_score_lst = []

        for num in num_clusters:

            st = timeit.default_timer()
            k_model = GaussianMixture(n_init=10, n_components=num).fit(x_train)
            et = timeit.default_timer()
            train_time_lst.append(et - st)

            y_pred = k_model.predict(x_test)

            # external metrics: predicted components vs. true labels
            h,c,v = hcv_score(y_test, y_pred)
            v_score_lst.append(v)
            ar_score_lst.append(ar_score(y_test, y_pred))

            # internal metrics, evaluated on the held-out predictions
            ch_score_lst.append(ch_score(x_test, y_pred))
            s_score_lst.append(s_score(x_test, y_pred))

        train_time_dict[f'train_iter_{i}'] = train_time_lst
        s_score_dict[f's_score_iter_{i}'] = s_score_lst
        v_score_dict[f'v_score_iter_{i}'] = v_score_lst
        ch_score_dict[f'ch_score_iter_{i}'] = ch_score_lst
        ar_score_dict[f'ar_score_iter_{i}'] = ar_score_lst

    df_time = pd.DataFrame(train_time_dict)
    df_s_score = pd.DataFrame(s_score_dict)
    df_v_score = pd.DataFrame(v_score_dict)
    df_ch_score = pd.DataFrame(ch_score_dict)
    df_ar_score = pd.DataFrame(ar_score_dict)

    # aggregate across runs (axis 1)
    df_scores = pd.DataFrame({
        'num_clusters': num_clusters,
        'mean_fit_time': df_time.mean(1),
        'std_fit_time': df_time.std(1),
        'mean_s_score': df_s_score.mean(1),
        'std_s_score': df_s_score.std(1),
        'mean_v_score': df_v_score.mean(1),
        'std_v_score': df_v_score.std(1),
        'mean_ch_score': df_ch_score.mean(1),
        'std_ch_score': df_ch_score.std(1),
        'mean_ar_score': df_ar_score.mean(1),
        'std_ar_score': df_ar_score.std(1)
        })

    param_grid = {
        "n_components": num_clusters,
        "covariance_type": ["full"],
    }

    grid_search = GridSearchCV(
        GaussianMixture(), param_grid=param_grid, scoring=gmm_bic_score
    )

    grid_search.fit(x_data)

    df = pd.DataFrame(grid_search.cv_results_)[
        ["param_n_components", "param_covariance_type", "mean_test_score","std_test_score",
          "mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time"]
    ].copy()

    # undo the sign flip applied by gmm_bic_score so the column holds the BIC
    df["mean_test_score"] = -df["mean_test_score"]

    df = df.rename(
        columns={
            "param_n_components": "n_components",
            "param_covariance_type": "covariance_type",
            "mean_test_score": "bic_score",
            "std_test_score": "std_bic_score",
        }
    )

    df = df.sort_values(by="bic_score")

    # select n clusters based on minimum bic score
    mask = df['bic_score'] == df['bic_score'].min()
    selected_n_clusters = df[mask]['n_components']

    # Fit the final mixture on the data passed in, with the selected component
    # count, and draw synthetic points from it. (This used to fit the global
    # wine frame with five components regardless of the arguments.)
    best_n_components = int(selected_n_clusters.iloc[0])
    x_new, y_new = GaussianMixture(
        n_init=10, n_components=best_n_components
    ).fit(x_data).sample(n_samples=n_samples)

    x_transformed = pd.DataFrame(x_new)
    y_transformed = pd.DataFrame(y_new)

    # sampled features followed by the component label as the last column
    x_final = pd.concat([x_transformed,y_transformed], axis=1)

    return df, df_scores, selected_n_clusters, x_final

In [11]:
def perform_pca(x_data, variance_threshold=0.95):
    """Reduce ``x_data`` with whitened PCA, keeping components by explained variance.

    A full PCA is fitted first to obtain the cumulative explained-variance
    ratio. The number of retained components is the count of leading
    components whose cumulative ratio is still ``<= variance_threshold``, with
    a floor of one component.

    Parameters
    ----------
    x_data : array-like
        Feature matrix.
    variance_threshold : float
        Cumulative explained-variance ratio cut-off (default 0.95).

    Returns
    -------
    cum_explained_variance : ndarray
        Cumulative explained-variance ratio of the full decomposition.
    selected_n_components : int
        Number of components kept.
    x_transformed : ndarray
        ``x_data`` projected onto the kept components.
    """
    pca = PCA(whiten=True, svd_solver='full')
    pca.fit(x_data)
    cum_explained_variance = np.cumsum(pca.explained_variance_ratio_)

    n_within_threshold = int(np.sum(cum_explained_variance <= variance_threshold))
    # When the first component alone exceeds the threshold the count is 0;
    # keep at least one component so the projection is not empty.
    selected_n_components = max(1, n_within_threshold)

    x_transformed = PCA(n_components=selected_n_components,
                        whiten=True, svd_solver='full').fit_transform(x_data)

    return cum_explained_variance, selected_n_components, x_transformed

In [12]:
def perform_ica(x_data):
    """Sweep FastICA component counts and keep the one with the highest kurtosis.

    Candidate counts run from 2 to the number of features, in steps of 5 when
    there are more than 20 features and steps of 1 otherwise. For each count
    the mean and std of the absolute kurtosis of the extracted components are
    recorded.

    Returns
    -------
    df_result : DataFrame
        ``num_components``, ``mean_kurtosis`` and ``std_kurtosis`` per count.
    selected_n_clusters : Series
        ``num_components`` value(s) with the largest mean absolute kurtosis.
    x_transformed : ndarray
        ``fit_transform`` of a fresh FastICA (first selected count) on
        ``x_data``.
    """
    n_features = x_data.shape[1]
    stride = 5 if n_features > 20 else 1
    n_components = np.arange(2, n_features + 1, stride)

    average_kurtosis_lst, std_kurtosis_lst = [], []
    for n in n_components:
        sources = pd.DataFrame(FastICA(n_components=n).fit_transform(x_data))
        abs_kurtosis = sources.kurtosis(axis=0).abs()
        average_kurtosis_lst.append(abs_kurtosis.mean())
        std_kurtosis_lst.append(abs_kurtosis.std())

    df_result = pd.DataFrame({
        'num_components': n_components,
        'mean_kurtosis': average_kurtosis_lst,
        'std_kurtosis': std_kurtosis_lst
    })

    # keep the count whose components are, on average, the least Gaussian
    top_kurtosis = df_result['mean_kurtosis'].max()
    selected_n_clusters = df_result.loc[
        df_result['mean_kurtosis'] == top_kurtosis, 'num_components'
    ]
    final_ica = FastICA(n_components=selected_n_clusters.iloc[0])
    x_transformed = final_ica.fit_transform(x_data)

    return df_result, selected_n_clusters, x_transformed

In [13]:
def pdist_correllation(x_data_1,x_data_2):
    """Return the mean squared element-wise difference of two equally shaped inputs.

    Despite its name, this computes a mean squared error (no square root and
    no correlation is involved).

    Parameters
    ----------
    x_data_1 : ndarray
        First matrix (e.g. a reconstruction).
    x_data_2 : array-like
        Second matrix; a DataFrame or any array-like is accepted and converted
        with ``np.asarray``.

    Returns
    -------
    The mean of the squared differences (a scalar when ``x_data_1`` is an
    ndarray).
    """
    # np.asarray accepts DataFrames as well as plain arrays, so callers are no
    # longer required to pass an object with a ``to_numpy`` method.
    mse = ((x_data_1 - np.asarray(x_data_2)) ** 2).mean()

    return mse

In [14]:
def perform_rca(x_data, num_runs=5):
    """Sweep sparse random projections and keep the lowest reconstruction error.

    ``RCA`` is the notebook's alias for ``SparseRandomProjection``. Candidate
    component counts run from 2 to the number of features (step 5 when there
    are more than 20 features, step 1 otherwise). Each count is projected and
    inverse-projected ``num_runs`` times, and the error between reconstruction
    and input is measured with ``pdist_correllation``.

    Returns
    -------
    df_result : DataFrame
        ``num_components`` with mean and std reconstruction error over runs.
    selected_n_components : Series
        ``num_components`` value(s) with the smallest absolute mean error.
    x_transformed : ndarray
        ``fit_transform`` of a fresh projection (first selected count) on
        ``x_data``.
    """
    n_features = x_data.shape[1]
    stride = 5 if n_features > 20 else 1
    n_components = np.arange(2, n_features + 1, stride)

    # one column per run, one row per component count
    average_recon_error_dict = {}
    for num in range(num_runs):
        run_errors = []
        for n in n_components:
            projector = RCA(n_components=n)
            projected = projector.fit_transform(x_data)
            reconstructed = projector.inverse_transform(projected)
            run_errors.append(pdist_correllation(reconstructed, x_data))
        average_recon_error_dict[f'r_error_{num}'] = run_errors

    df_recon = pd.DataFrame(average_recon_error_dict)
    df_result = pd.DataFrame({
        'num_components': n_components,
        'mean_recon_error': df_recon.mean(1),
        'std_recon_error': df_recon.std(1)
    })

    abs_error = df_result['mean_recon_error'].abs()
    selected_n_components = df_result.loc[
        abs_error == abs_error.min(), 'num_components'
    ]
    final_projector = RCA(n_components=selected_n_components.iloc[0])
    x_transformed = final_projector.fit_transform(x_data)

    return df_result, selected_n_components, x_transformed

In [15]:
def perform_manifold(x_data, num_runs=5):
    """Sweep modified LLE over component counts and keep the lowest error.

    Candidate component counts run from 2 to the number of features (step 5
    when there are more than 20 features, step 1 otherwise). Every embedding
    uses the dense eigen-solver, the 'modified' method and
    ``n_neighbors = n_components + 5``. The sweep is repeated ``num_runs``
    times and ``reconstruction_error_`` is recorded each time.

    Returns
    -------
    df_result : DataFrame
        ``num_components`` with mean and std reconstruction error over runs.
    selected_n_components : Series
        ``num_components`` value(s) with the smallest absolute mean error.
    x_transformed : ndarray
        Embedding of ``x_data`` with the first selected count.
    """
    n_features = x_data.shape[1]
    stride = 5 if n_features > 20 else 1
    n_components = np.arange(2, n_features + 1, stride)
    es='dense'

    def build_lle(n):
        # single place that ties the neighbour count to the component count
        return LocallyLinearEmbedding(
            n_components=n, eigen_solver=es,
            method='modified', n_neighbors=5 + n
        )

    # one column per run, one row per component count
    average_recon_error_dict = {}
    for num in range(num_runs):
        run_errors = []
        for n in n_components:
            lle = build_lle(n)
            lle.fit_transform(x_data)
            run_errors.append(lle.reconstruction_error_)
        average_recon_error_dict[f'r_error_{num}'] = run_errors

    df_recon = pd.DataFrame(average_recon_error_dict)
    df_result = pd.DataFrame({
        'num_components': n_components,
        'mean_recon_error': df_recon.mean(1),
        'std_recon_error': df_recon.std(1)
    })

    abs_error = df_result['mean_recon_error'].abs()
    selected_n_components = df_result.loc[
        abs_error == abs_error.min(), 'num_components'
    ]
    x_transformed = build_lle(selected_n_components.iloc[0]).fit_transform(x_data)

    return df_result, selected_n_components, x_transformed

In [16]:
def perform_nn(x_data, y_data):
    """Grid-search an ``MLPClassifier`` and return the fitted search object.

    The search covers five hidden-layer widths and five initial learning
    rates (SGD solver, ReLU activation, constant learning rate), scored with
    weighted F1 over five shuffle-split folds and run on all available cores.

    Returns
    -------
    The fitted ``GridSearchCV`` instance (refitted on the best parameters).
    """
    # Neural Network hyper tuning: parameter ranges to explore
    search_space = {
        'learning_rate': ["constant"],
        'hidden_layer_sizes': [(10,),(50,),(100,),(150,), (200,)],
        'learning_rate_init': np.linspace(0.01, 1, 5),
        'activation': ['relu'],
        'solver': ['sgd']
    }

    splitter = ShuffleSplit(n_splits=5)
    grid = GridSearchCV(
        MLPClassifier(),
        search_space,
        scoring='f1_weighted',
        refit=True,
        verbose=3,
        n_jobs=-1,
        cv=splitter,
    )

    # fit() returns the search object itself
    grid.fit(x_data, y_data)
    return grid

## DataSet Generation

In [None]:
## Generate transformed datasets for wine
# K-Means sweep
(df_km_wine, s_km_cluster_wine, x_km_wine) = perform_kmeans(x_wine, y_wine)
# Gaussian-mixture (EM) sweep
(df_em_wine, df_em_score_wine, s_em_cluster_wine, x_em_wine) = perform_em(
    x_wine, y_wine
)
# PCA reduction
(df_pca_wine, s_pca_cluster_wine, x_pca_wine) = perform_pca(x_wine)

In [17]:
# Random-projection reduction of the wine features.
(df_rca_wine, s_rca_cluster_wine, x_rca_wine) = perform_rca(x_wine)

In [None]:
# Locally-linear-embedding reduction of the wine features.
(df_man_wine, s_man_cluster_wine, x_man_wine) = perform_manifold(x_wine)

In [167]:
# Independent-component reduction of the wine features.
(df_ica_wine, s_ica_cluster_wine, x_ica_wine) = perform_ica(x_wine)

In [None]:
## Generate transformed datasets for shroom
# K-Means sweep
(df_km_shroom, s_km_cluster_shroom, x_km_shroom) = perform_kmeans(
    x_shroom, y_shroom
)
# Gaussian-mixture (EM) sweep
(df_em_shroom, df_em_score_shroom, s_em_cluster_shroom, x_em_shroom) = perform_em(
    x_shroom, y_shroom
)
# PCA reduction
(df_pca_shroom, s_pca_cluster_shroom, x_pca_shroom) = perform_pca(x_shroom)
# Locally-linear-embedding reduction
(df_man_shroom, s_man_cluster_shroom, x_man_shroom) = perform_manifold(x_shroom)

In [18]:
# Random-projection reduction of the mushroom features.
(df_rca_shroom, s_rca_cluster_shroom, x_rca_shroom) = perform_rca(x_shroom)

In [168]:
# Independent-component reduction of the mushroom features.
(df_ica_shroom, s_ica_cluster_shroom, x_ica_shroom) = perform_ica(x_shroom)



In [None]:
# Re-derive the LLE component count for each dataset from the stored sweep
# results: the count whose absolute mean reconstruction error is smallest.
wine_abs_error = df_man_wine['mean_recon_error'].abs()
mask = wine_abs_error == wine_abs_error.min()
s_man_cluster_wine = df_man_wine.loc[mask, 'num_components']

shroom_abs_error = df_man_shroom['mean_recon_error'].abs()
mask = shroom_abs_error == shroom_abs_error.min()
s_man_cluster_shroom = df_man_shroom.loc[mask, 'num_components']

In [None]:
# Rebuild both LLE embeddings with the selected component count; the
# neighbourhood size is always five more than the number of components.
n_man_wine = s_man_cluster_wine.iloc[0]
x_man_wine = LocallyLinearEmbedding(
    n_components=n_man_wine,
    n_neighbors=n_man_wine + 5,
    method='modified',
    eigen_solver='dense',
).fit_transform(x_wine)

n_man_shroom = s_man_cluster_shroom.iloc[0]
x_man_shroom = LocallyLinearEmbedding(
    n_components=n_man_shroom,
    n_neighbors=n_man_shroom + 5,
    method='modified',
    eigen_solver='dense',
).fit_transform(x_shroom)

In [None]:
## save wine results

# (object, file name) pairs, written in order; raw arrays and the scalar PCA
# component count are wrapped in a DataFrame so they can be written as CSV.
_wine_csv_outputs = [
    # K-Means
    (df_km_wine, 'df_km_wine.csv'),
    (s_km_cluster_wine, 's_km_cluster_wine.csv'),
    (pd.DataFrame(x_km_wine), 'x_km_wine.csv'),
    # Gaussian mixture (EM)
    (df_em_wine, 'df_em_wine.csv'),
    (df_em_score_wine, 'df_em_score_wine.csv'),
    (s_em_cluster_wine, 's_em_cluster_wine.csv'),
    (x_em_wine, 'x_em_wine.csv'),
    # PCA
    (pd.DataFrame(df_pca_wine), 'df_pca_wine.csv'),
    (pd.DataFrame({'s_pca_cluster_wine':[s_pca_cluster_wine]}), 's_pca_cluster_wine.csv'),
    (pd.DataFrame(x_pca_wine), 'x_pca_wine.csv'),
    # random projection
    (df_rca_wine, 'df_rca_wine.csv'),
    (s_rca_cluster_wine, 's_rca_cluster_wine.csv'),
    (pd.DataFrame(x_rca_wine), 'x_rca_wine.csv'),
    # locally linear embedding
    (df_man_wine, 'df_man_wine.csv'),
    (s_man_cluster_wine, 's_man_cluster_wine.csv'),
    (pd.DataFrame(x_man_wine), 'x_man_wine.csv'),
]
for _result, _csv_name in _wine_csv_outputs:
    _result.to_csv(_csv_name)

In [178]:
# Persist the ICA results for the wine data.
for _result, _csv_name in (
    (df_ica_wine, 'df_ica_wine.csv'),
    (s_ica_cluster_wine, 's_ica_cluster_wine.csv'),
    (pd.DataFrame(x_ica_wine), 'x_ica_wine.csv'),
):
    _result.to_csv(_csv_name)

In [None]:
## save shroom results

# (object, file name) pairs, written in order; raw arrays and the scalar PCA
# component count are wrapped in a DataFrame so they can be written as CSV.
_shroom_csv_outputs = [
    # K-Means
    (df_km_shroom, 'df_km_shroom.csv'),
    (s_km_cluster_shroom, 's_km_cluster_shroom.csv'),
    (pd.DataFrame(x_km_shroom), 'x_km_shroom.csv'),
    # Gaussian mixture (EM)
    (df_em_shroom, 'df_em_shroom.csv'),
    (df_em_score_shroom, 'df_em_score_shroom.csv'),
    (s_em_cluster_shroom, 's_em_cluster_shroom.csv'),
    (x_em_shroom, 'x_em_shroom.csv'),
    # PCA
    (pd.DataFrame(df_pca_shroom), 'df_pca_shroom.csv'),
    (pd.DataFrame({'s_pca_cluster_shroom':[s_pca_cluster_shroom]}), 's_pca_cluster_shroom.csv'),
    (pd.DataFrame(x_pca_shroom), 'x_pca_shroom.csv'),
    # random projection
    (df_rca_shroom, 'df_rca_shroom.csv'),
    (s_rca_cluster_shroom, 's_rca_cluster_shroom.csv'),
    (pd.DataFrame(x_rca_shroom), 'x_rca_shroom.csv'),
    # locally linear embedding
    (df_man_shroom, 'df_man_shroom.csv'),
    (s_man_cluster_shroom, 's_man_cluster_shroom.csv'),
    (pd.DataFrame(x_man_shroom), 'x_man_shroom.csv'),
]
for _result, _csv_name in _shroom_csv_outputs:
    _result.to_csv(_csv_name)

In [177]:
# Persist the ICA results for the mushroom data.
for _result, _csv_name in (
    (df_ica_shroom, 'df_ica_shroom.csv'),
    (s_ica_cluster_shroom, 's_ica_cluster_shroom.csv'),
    (pd.DataFrame(x_ica_shroom), 'x_ica_shroom.csv'),
):
    _result.to_csv(_csv_name)

## Clustering on Dimension Reduced Wine Datasets

In [None]:
## Generate transformed datasets for dimension reduced wine set
# PCA-reduced features
(df_km_pca_wine, s_km_pca_cluster_wine, x_km_pca_wine) = perform_kmeans(
    x_pca_wine, y_wine
)
(df_em_pca_wine, df_em_score_pca_wine,
 s_em_cluster_pca_wine, x_em_pca_wine) = perform_em(x_pca_wine, y_wine)

# random-projection-reduced features
(df_km_rca_wine, s_km_rca_cluster_wine, x_km_rca_wine) = perform_kmeans(
    x_rca_wine, y_wine
)
(df_em_rca_wine, df_em_score_rca_wine,
 s_em_cluster_rca_wine, x_em_rca_wine) = perform_em(x_rca_wine, y_wine)

# LLE-reduced features (K-Means only in this cell)
(df_km_man_wine, s_km_man_cluster_wine, x_km_man_wine) = perform_kmeans(
    x_man_wine, y_wine
)

In [101]:
# Gaussian-mixture (EM) sweep on the LLE-reduced wine features.
(df_em_man_wine, df_em_score_man_wine,
 s_em_cluster_man_wine, x_em_man_wine) = perform_em(x_man_wine, y_wine)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [179]:
# Cluster the ICA-reduced wine features with both K-Means and EM.
(df_km_ica_wine, s_km_ica_cluster_wine, x_km_ica_wine) = perform_kmeans(
    x_ica_wine, y_wine
)

(df_em_ica_wine, df_em_score_ica_wine,
 s_em_cluster_ica_wine, x_em_ica_wine) = perform_em(x_ica_wine, y_wine)

In [102]:
## Generate transformed datasets for dimension reduced shroom set
# PCA-reduced features
(df_km_pca_shroom, s_km_pca_cluster_shroom, x_km_pca_shroom) = perform_kmeans(
    x_pca_shroom, y_shroom
)
(df_em_pca_shroom, df_em_score_pca_shroom,
 s_em_cluster_pca_shroom, x_em_pca_shroom) = perform_em(x_pca_shroom, y_shroom)

# random-projection-reduced features
(df_km_rca_shroom, s_km_rca_cluster_shroom, x_km_rca_shroom) = perform_kmeans(
    x_rca_shroom, y_shroom
)
(df_em_rca_shroom, df_em_score_rca_shroom,
 s_em_cluster_rca_shroom, x_em_rca_shroom) = perform_em(x_rca_shroom, y_shroom)

# LLE-reduced features
(df_km_man_shroom, s_km_man_cluster_shroom, x_km_man_shroom) = perform_kmeans(
    x_man_shroom, y_shroom
)
(df_em_man_shroom, df_em_score_man_shroom,
 s_em_cluster_man_shroom, x_em_man_shroom) = perform_em(x_man_shroom, y_shroom)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [None]:
# Cluster the ICA-reduced mushroom features with both K-Means and EM.
(df_km_ica_shroom, s_km_ica_cluster_shroom, x_km_ica_shroom) = perform_kmeans(
    x_ica_shroom, y_shroom
)

(df_em_ica_shroom, df_em_score_ica_shroom,
 s_em_cluster_ica_shroom, x_em_ica_shroom) = perform_em(x_ica_shroom, y_shroom)

In [106]:
## save transformed datasets for dimension reduced wine set
# Every object is written to a file named after its own variable, so no result
# overwrites another.

# K-Means on PCA-reduced features
df_km_pca_wine.to_csv('df_km_pca_wine.csv')
s_km_pca_cluster_wine.to_csv('s_km_pca_cluster_wine.csv')
pd.DataFrame(x_km_pca_wine).to_csv('x_km_pca_wine.csv')

# EM on PCA-reduced features
df_em_pca_wine.to_csv('df_em_pca_wine.csv')
df_em_score_pca_wine.to_csv('df_em_score_pca_wine.csv')
s_em_cluster_pca_wine.to_csv('s_em_cluster_pca_wine.csv')
x_em_pca_wine.to_csv('x_em_pca_wine.csv')

# K-Means on random-projection-reduced features
df_km_rca_wine.to_csv('df_km_rca_wine.csv')
s_km_rca_cluster_wine.to_csv('s_km_rca_cluster_wine.csv')
pd.DataFrame(x_km_rca_wine).to_csv('x_km_rca_wine.csv')

# EM on random-projection-reduced features
df_em_rca_wine.to_csv('df_em_rca_wine.csv')
df_em_score_rca_wine.to_csv('df_em_score_rca_wine.csv')
s_em_cluster_rca_wine.to_csv('s_em_cluster_rca_wine.csv')
x_em_rca_wine.to_csv('x_em_rca_wine.csv')

# K-Means on LLE-reduced features
df_km_man_wine.to_csv('df_km_man_wine.csv')
s_km_man_cluster_wine.to_csv('s_km_man_cluster_wine.csv')
# was written to 'df_km_man_wine.csv', clobbering the results table above
pd.DataFrame(x_km_man_wine).to_csv('x_km_man_wine.csv')

# EM on LLE-reduced features
# was written to 'df_km_man_wine.csv', clobbering the K-Means results
df_em_man_wine.to_csv('df_em_man_wine.csv')
df_em_score_man_wine.to_csv('df_em_score_man_wine.csv')
s_em_cluster_man_wine.to_csv('s_em_cluster_man_wine.csv')
x_em_man_wine.to_csv('x_em_man_wine.csv')

In [None]:
## save transformed datasets for dimension reduced wine set
# Clustering results obtained on the ICA-reduced wine features.
for _result, _csv_name in (
    # K-Means
    (df_km_ica_wine, 'df_km_ica_wine.csv'),
    (s_km_ica_cluster_wine, 's_km_ica_cluster_wine.csv'),
    (pd.DataFrame(x_km_ica_wine), 'x_km_ica_wine.csv'),
    # EM
    (df_em_ica_wine, 'df_em_ica_wine.csv'),
    (df_em_score_ica_wine, 'df_em_score_ica_wine.csv'),
    (s_em_cluster_ica_wine, 's_em_cluster_ica_wine.csv'),
    (x_em_ica_wine, 'x_em_ica_wine.csv'),
):
    _result.to_csv(_csv_name)

In [107]:
## save transformed datasets for dimension reduced shroom set
# Every object is written to a file named after its own variable, so no result
# overwrites another.

# K-Means on PCA-reduced features
df_km_pca_shroom.to_csv('df_km_pca_shroom.csv')
s_km_pca_cluster_shroom.to_csv('s_km_pca_cluster_shroom.csv')
pd.DataFrame(x_km_pca_shroom).to_csv('x_km_pca_shroom.csv')

# EM on PCA-reduced features
df_em_pca_shroom.to_csv('df_em_pca_shroom.csv')
df_em_score_pca_shroom.to_csv('df_em_score_pca_shroom.csv')
s_em_cluster_pca_shroom.to_csv('s_em_cluster_pca_shroom.csv')
x_em_pca_shroom.to_csv('x_em_pca_shroom.csv')

# K-Means on random-projection-reduced features
df_km_rca_shroom.to_csv('df_km_rca_shroom.csv')
s_km_rca_cluster_shroom.to_csv('s_km_rca_cluster_shroom.csv')
pd.DataFrame(x_km_rca_shroom).to_csv('x_km_rca_shroom.csv')

# EM on random-projection-reduced features
df_em_rca_shroom.to_csv('df_em_rca_shroom.csv')
df_em_score_rca_shroom.to_csv('df_em_score_rca_shroom.csv')
s_em_cluster_rca_shroom.to_csv('s_em_cluster_rca_shroom.csv')
x_em_rca_shroom.to_csv('x_em_rca_shroom.csv')

# K-Means on LLE-reduced features
df_km_man_shroom.to_csv('df_km_man_shroom.csv')
s_km_man_cluster_shroom.to_csv('s_km_man_cluster_shroom.csv')
pd.DataFrame(x_km_man_shroom).to_csv('x_km_man_shroom.csv')

# EM on LLE-reduced features
# was written to 'df_km_man_shroom.csv', clobbering the K-Means results
df_em_man_shroom.to_csv('df_em_man_shroom.csv')
df_em_score_man_shroom.to_csv('df_em_score_man_shroom.csv')
s_em_cluster_man_shroom.to_csv('s_em_cluster_man_shroom.csv')
x_em_man_shroom.to_csv('x_em_man_shroom.csv')

In [None]:
## save transformed datasets for dimension reduced shroom set
# Clustering results obtained on the ICA-reduced mushroom features.
for _result, _csv_name in (
    # K-Means
    (df_km_ica_shroom, 'df_km_ica_shroom.csv'),
    (s_km_ica_cluster_shroom, 's_km_ica_cluster_shroom.csv'),
    (pd.DataFrame(x_km_ica_shroom), 'x_km_ica_shroom.csv'),
    # EM
    (df_em_ica_shroom, 'df_em_ica_shroom.csv'),
    (df_em_score_ica_shroom, 'df_em_score_ica_shroom.csv'),
    (s_em_cluster_ica_shroom, 's_em_cluster_ica_shroom.csv'),
    (x_em_ica_shroom, 'x_em_ica_shroom.csv'),
):
    _result.to_csv(_csv_name)

## Neural Network Training

In [144]:
# wine training
# Grid-search a neural network on each transformed wine dataset.
grid_km_wine = perform_nn(x_km_wine, y_wine)

# The EM frame carries its own target: every column but the last is a sampled
# feature, the last column is the mixture-component label.
em_wine_features = x_em_wine.iloc[:, :-1]
em_wine_labels = x_em_wine.iloc[:, -1]
grid_em_wine = perform_nn(em_wine_features, em_wine_labels)

grid_pca_wine = perform_nn(x_pca_wine, y_wine)
grid_rca_wine = perform_nn(x_rca_wine, y_wine)
grid_man_wine = perform_nn(x_man_wine, y_wine)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits




Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [145]:
# shroom training
# Grid-search a neural network on each transformed mushroom dataset.
grid_km_shroom = perform_nn(x_km_shroom, y_shroom)

# The EM frame carries its own target: every column but the last is a sampled
# feature, the last column is the mixture-component label.
em_shroom_features = x_em_shroom.iloc[:, :-1]
em_shroom_labels = x_em_shroom.iloc[:, -1]
grid_em_shroom = perform_nn(em_shroom_features, em_shroom_labels)

grid_pca_shroom = perform_nn(x_pca_shroom, y_shroom)
grid_rca_shroom = perform_nn(x_rca_shroom, y_shroom)
grid_man_shroom = perform_nn(x_man_shroom, y_shroom)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


Fitting 5 folds for each of 25 candidates, totalling 125 fits




Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [165]:
# save wine grids
# Each fitted GridSearchCV object is pickled with joblib under a file name
# matching its variable. No grids for the ICA-reduced data are saved here.
joblib.dump(grid_km_wine, 'grid_km_wine.pkl')
joblib.dump(grid_em_wine, 'grid_em_wine.pkl')
joblib.dump(grid_pca_wine, 'grid_pca_wine.pkl')
joblib.dump(grid_rca_wine, 'grid_rca_wine.pkl')
joblib.dump(grid_man_wine, 'grid_man_wine.pkl')

# save shroom grids
joblib.dump(grid_km_shroom, 'grid_km_shroom.pkl')
joblib.dump(grid_em_shroom, 'grid_em_shroom.pkl')
joblib.dump(grid_pca_shroom, 'grid_pca_shroom.pkl')
joblib.dump(grid_rca_shroom, 'grid_rca_shroom.pkl')
# last expression of the cell: its return value (the list of written file
# names) is what the notebook displays as the cell output
joblib.dump(grid_man_shroom, 'grid_man_shroom.pkl')

['grid_man_shroom.pkl']