In [None]:
import os
from os.path import join
# From https://stackoverflow.com/questions/51424312/how-to-save-gridsearchcv-object
import joblib
#save your model or results
# joblib.dump(gs, 'model_file_name.pkl')

#load your model for further usage
# joblib.load("model_file_name.pkl")

import pandas as pd
import numpy as np
import timeit

import matplotlib.pyplot as plt
import seaborn as sns

# Linear model for PCA
from sklearn.linear_model import LogisticRegression

# Expectation maximization
from sklearn.mixture import GaussianMixture

# Neural Network
from sklearn.neural_network import MLPClassifier

# Clustering Algo
from sklearn.cluster import KMeans

# Principal and independent component analysis
from sklearn.decomposition import FastICA, PCA

# Randomized projections
from sklearn.random_projection import GaussianRandomProjection,\
      SparseRandomProjection as RCA

# Manifolds
from sklearn.manifold import LocallyLinearEmbedding

# Metrics
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score as s_score,\
      calinski_harabasz_score as ch_score,\
        homogeneity_completeness_v_measure as hcv_score, adjusted_rand_score as ar_score,\
        pairwise_distances
from sklearn.metrics import \
    balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline

# pre-processing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import \
    LearningCurveDisplay, validation_curve, learning_curve, train_test_split, ShuffleSplit,\
          GridSearchCV, cross_validate

# Initialization

## Raw Data

In [None]:
# Relative locations of the raw CSV inputs (resolved against the working directory)
red_wine = join('data', 'wine', 'winequality-red.csv')
white_wine = join('data', 'wine', 'winequality-white.csv')
turbine = join('data', 'turbine', 'gt_2011.csv')
mushrooms = join('data', 'mushroom', 'secondary_data.csv')

# encoders to use
# Single shared StandardScaler instance; every later `scale.fit_transform(...)`
# call re-fits it, so it only ever holds the statistics of the last dataset scaled.
scale = StandardScaler()
# One-hot encoder configured to return a dense array (sparse_output=False)
ohe = OneHotEncoder(sparse_output=False)

In [None]:
# One-hot encode every categorical mushroom attribute; columns not listed here
# are passed through unchanged and get a 'remainder__' prefix in the output names.
transformer = make_column_transformer(
    (
        ohe,
        [
            'cap-shape', 'cap-surface', 'cap-color',
            'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
            'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color',
            'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season'
        ]
    ),
    remainder='passthrough'
)

# pre-process shroom data

# NOTE(review): the shuffle is unseeded, so the row order (and therefore which
# 7000 rows are kept below) changes on every run -- confirm this matches the
# row order of any cached files that are later paired with y_shroom.
shroom_df = pd.read_csv(mushrooms, sep=';').sample(frac=1).reset_index(drop=True)

# First column is the class label, everything after it is a feature.
x = shroom_df.iloc[:, 1:].copy()
x_shroom = pd.DataFrame(
    transformer.fit_transform(x),
    columns=transformer.get_feature_names_out()
)
y = shroom_df.iloc[:, 0].copy()
y_shroom = (y == 'p')

# reduce the number of training examples
# Take explicit copies: the numeric columns of x_shroom are overwritten below,
# and assigning into a bare slice can trigger pandas' SettingWithCopyWarning.
x_shroom = x_shroom.iloc[:7000].copy()
y_shroom = y_shroom.iloc[:7000].copy()

# Standardise the numerical attributes (zero mean, unit variance)
lst_of_num_cols = [
    'remainder__cap-diameter', 'remainder__stem-height', 'remainder__stem-width'
]
x_shroom[lst_of_num_cols] = scale.fit_transform(x_shroom[lst_of_num_cols])

In [None]:
# Load both wine files and tag each row with its origin: 0 = white, 1 = red.
white_df = pd.read_csv(white_wine, sep=';')
red_df = pd.read_csv(red_wine, sep=';')
white_df['type'] = 0
red_df['type'] = 1

# NOTE(review): unseeded shuffle -- row order differs between runs; confirm it
# matches the row order of any cached files that are later paired with y_wine.
wine_df = pd.concat([white_df, red_df]).sample(frac=1).reset_index(drop=True)

# set x and y values
# remove 'quality' and 'type' column from x array (they are the last two columns)
feature_cols = wine_df.columns[:-2]

# scale x vals
# Build a new frame from the scaled array instead of writing through `.values`:
# `.values` is not guaranteed to be a writable view of the frame, so assigning
# into it can silently leave the data unscaled or raise on read-only arrays.
x_wine = pd.DataFrame(
    scale.fit_transform(wine_df[feature_cols]),
    columns=feature_cols,
    index=wine_df.index,
)

# set y array equal to 'type' column
y_wine = wine_df.iloc[:, -1].copy()

In [None]:
# Folder holding the cached clustering / dimensionality-reduction results
DATA_FOLDER = 'algo_data'

## Read in Data

In [None]:
## read wine results

# k-means outputs cached on disk
df_km_wine, s_km_cluster_wine, x_km_wine = (
    pd.read_csv(join(DATA_FOLDER, fname))
    for fname in ('df_km_wine.csv', 's_km_cluster_wine.csv', 'x_km_wine.csv')
)

# expectation-maximisation outputs cached on disk
df_em_wine, df_em_score_wine, s_em_cluster_wine, x_em_wine = (
    pd.read_csv(join(DATA_FOLDER, fname))
    for fname in (
        'df_em_wine.csv', 'df_em_score_wine.csv',
        's_em_cluster_wine.csv', 'x_em_wine.csv',
    )
)

In [None]:
## read shroom results

# k-means outputs cached on disk
df_km_shroom, s_km_cluster_shroom, x_km_shroom = (
    pd.read_csv(join(DATA_FOLDER, fname))
    for fname in ('df_km_shroom.csv', 's_km_cluster_shroom.csv', 'x_km_shroom.csv')
)

# expectation-maximisation outputs cached on disk
df_em_shroom, df_em_score_shroom, s_em_cluster_shroom, x_em_shroom = (
    pd.read_csv(join(DATA_FOLDER, fname))
    for fname in (
        'df_em_shroom.csv', 'df_em_score_shroom.csv',
        's_em_cluster_shroom.csv', 'x_em_shroom.csv',
    )
)

## Clustering Algos

In [39]:
def perform_kmeans(x_data, y_data, num_runs=5):
    """Sweep k-means over 2..49 clusters and transform the data with the best k.

    The data is split 80/20 once. For each of ``num_runs`` repetitions and each
    cluster count, a KMeans model is fitted on the training part and scored:

    * fit time (wall clock, via ``timeit.default_timer``),
    * silhouette and Calinski-Harabasz scores of the training labels,
    * V-measure and adjusted Rand score of the test predictions against
      ``y_test``.

    Parameters
    ----------
    x_data : feature matrix accepted by ``train_test_split`` / ``KMeans``.
    y_data : reference labels aligned row-for-row with ``x_data``.
    num_runs : int, default 5
        Number of repetitions of the whole sweep; mean and standard deviation
        are taken across repetitions.

    Returns
    -------
    df_results : pandas.DataFrame
        One row per cluster count with ``mean_*`` / ``std_*`` columns for
        ``fit_time``, ``s_score``, ``v_score``, ``ch_score`` and ``ar_score``.
    selected_n_clusters : pandas.Series
        ``num_clusters`` value(s) whose mean Calinski-Harabasz score is maximal.
    x_transformed : numpy.ndarray
        ``fit_transform`` output of a KMeans model fitted on all of ``x_data``
        with the first selected cluster count.
    """
    import warnings
    from sklearn.exceptions import DataConversionWarning
    # Installs a process-wide filter: the warning stays silenced after the call.
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)

    r_state = 123456
    num_clusters = np.arange(2, 50)
    metric_names = ('fit_time', 's_score', 'v_score', 'ch_score', 'ar_score')

    # Seed the split so repeated calls evaluate on the same train/test rows.
    x_train, x_test, _y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.2, random_state=r_state
    )

    # runs[metric][column_name] -> list of values, one per cluster count
    runs = {name: {} for name in metric_names}

    for i in range(num_runs):
        per_run = {name: [] for name in metric_names}

        for num in num_clusters:
            st = timeit.default_timer()
            # A different but fixed seed per repetition: runs stay reproducible
            # while the std across repetitions still reflects initialisation.
            k_model = KMeans(
                n_init=10, n_clusters=int(num), random_state=r_state + i
            ).fit(x_train)
            per_run['fit_time'].append(timeit.default_timer() - st)

            y_pred = k_model.predict(x_test)

            # External metrics: compare test predictions with the true labels.
            _, _, v = hcv_score(y_test, y_pred)
            per_run['v_score'].append(v)
            per_run['ar_score'].append(ar_score(y_test, y_pred))

            # Internal metrics: computed on the training assignment.
            per_run['ch_score'].append(ch_score(x_train, k_model.labels_))
            per_run['s_score'].append(s_score(x_train, k_model.labels_))

        for name in metric_names:
            runs[name][f'{name}_iter_{i}'] = per_run[name]

    # Aggregate across repetitions (columns) for every cluster count (rows).
    summary = {'num_clusters': num_clusters}
    for name in metric_names:
        per_metric = pd.DataFrame(runs[name])
        summary[f'mean_{name}'] = per_metric.mean(axis=1)
        summary[f'std_{name}'] = per_metric.std(axis=1)
    df_results = pd.DataFrame(summary)

    # select the number of clusters based on ch score
    best_mask = df_results['mean_ch_score'] == df_results['mean_ch_score'].max()
    selected_n_clusters = df_results.loc[best_mask, 'num_clusters']
    x_transformed = KMeans(
        n_init=10,
        n_clusters=int(selected_n_clusters.iloc[0]),
        random_state=r_state,
    ).fit_transform(x_data)

    return df_results, selected_n_clusters, x_transformed

In [40]:
def perform_em(x_data, y_data, num_runs=5):
    """Sweep Gaussian mixtures over 2..49 components and sample from the best one.

    Two independent evaluations are run:

    1. An 80/20 split of the data; for each of ``num_runs`` repetitions and
       each component count a ``GaussianMixture`` is fitted on the training
       part and scored on the test part (fit time, silhouette,
       Calinski-Harabasz, V-measure, adjusted Rand).
    2. A ``GridSearchCV`` over the same component counts (full covariance)
       on all of ``x_data``, scored by negative BIC.

    Parameters
    ----------
    x_data : feature matrix accepted by ``train_test_split`` / ``GaussianMixture``.
    y_data : reference labels aligned row-for-row with ``x_data``.
    num_runs : int, default 5
        Number of repetitions of the train/test sweep.

    Returns
    -------
    df : pandas.DataFrame
        Grid-search table sorted ascending by ``bic_score`` (lower is better).
    df_scores : pandas.DataFrame
        One row per component count with ``mean_*`` / ``std_*`` columns for
        ``fit_time``, ``s_score``, ``v_score``, ``ch_score`` and ``ar_score``.
    selected_n_clusters : pandas.Series
        ``n_components`` value(s) with the minimum BIC score.
    x_final : pandas.DataFrame
        7000 points sampled from a mixture fitted on ``x_data`` with the
        selected component count; the last column is the component each
        sample was drawn from.
    """

    def gmm_bic_score(estimator, x_data):
        """Callable to pass to GridSearchCV that will use the BIC score."""
        # Make it negative since GridSearchCV expects a score to maximize
        return -estimator.bic(x_data)

    import warnings
    from sklearn.exceptions import DataConversionWarning
    # Installs a process-wide filter: the warning stays silenced after the call.
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)

    num_clusters = np.arange(2, 50)
    metric_names = ('fit_time', 's_score', 'v_score', 'ch_score', 'ar_score')
    n_generated = 7000  # number of synthetic samples drawn from the final model

    x_train, x_test, _y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.2
    )

    # runs[metric][column_name] -> list of values, one per component count
    runs = {name: {} for name in metric_names}

    for i in range(num_runs):
        per_run = {name: [] for name in metric_names}

        for num in num_clusters:
            st = timeit.default_timer()
            gmm = GaussianMixture(n_init=10, n_components=int(num)).fit(x_train)
            per_run['fit_time'].append(timeit.default_timer() - st)

            y_pred = gmm.predict(x_test)

            # External metrics: compare test predictions with the true labels.
            _, _, v = hcv_score(y_test, y_pred)
            per_run['v_score'].append(v)
            per_run['ar_score'].append(ar_score(y_test, y_pred))

            # Internal metrics: computed on the test-set assignment.
            per_run['ch_score'].append(ch_score(x_test, y_pred))
            per_run['s_score'].append(s_score(x_test, y_pred))

        for name in metric_names:
            runs[name][f'{name}_iter_{i}'] = per_run[name]

    # Aggregate across repetitions (columns) for every component count (rows).
    summary = {'num_clusters': num_clusters}
    for name in metric_names:
        per_metric = pd.DataFrame(runs[name])
        summary[f'mean_{name}'] = per_metric.mean(axis=1)
        summary[f'std_{name}'] = per_metric.std(axis=1)
    df_scores = pd.DataFrame(summary)

    param_grid = {
        "n_components": num_clusters,
        "covariance_type": ["full"],
    }

    grid_search = GridSearchCV(
        GaussianMixture(), param_grid=param_grid, scoring=gmm_bic_score
    )
    grid_search.fit(x_data)

    # `rename` returns a fresh frame, so the sign flip below does not write
    # into a column subset of the cv_results_ frame.
    df = pd.DataFrame(grid_search.cv_results_)[
        ["param_n_components", "param_covariance_type", "mean_test_score", "std_test_score",
         "mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time"]
    ].rename(
        columns={
            "param_n_components": "n_components",
            "param_covariance_type": "covariance_type",
            "mean_test_score": "bic_score",
            "std_test_score": "std_bic_score",
        }
    )
    # The scorer returned -BIC; flip the sign back so lower means better.
    df["bic_score"] = -df["bic_score"]
    df = df.sort_values(by="bic_score")

    # select n clusters based on minimum bic score
    best_mask = df['bic_score'] == df['bic_score'].min()
    selected_n_clusters = df.loc[best_mask, 'n_components']

    # Fit the final mixture on the data that was passed in, using the selected
    # component count (previously this was hard-wired to the global wine data
    # and 5 components, regardless of the arguments).
    x_new, y_new = GaussianMixture(
        n_init=10,
        n_components=int(selected_n_clusters.iloc[0]),
    ).fit(x_data).sample(n_samples=n_generated)

    x_transformed = pd.DataFrame(x_new)
    y_transformed = pd.DataFrame(y_new)

    # Sampled features followed by the generating component label.
    x_final = pd.concat([x_transformed, y_transformed], axis=1)

    return df, df_scores, selected_n_clusters, x_final

## Dimension Reduced Dataset

In [None]:
# dimension reduced wine dataset

# PCA
df_pca_wine, s_pca_cluster_wine, x_pca_wine = (
    pd.read_csv(join(DATA_FOLDER, fname), index_col=0)
    for fname in ('df_pca_wine.csv', 's_pca_cluster_wine.csv', 'x_pca_wine.csv')
)

# Randomized projection
df_rca_wine, s_rca_cluster_wine, x_rca_wine = (
    pd.read_csv(join(DATA_FOLDER, fname), index_col=0)
    for fname in ('df_rca_wine.csv', 's_rca_cluster_wine.csv', 'x_rca_wine.csv')
)

# Manifold learning
df_man_wine, s_man_cluster_wine, x_man_wine = (
    pd.read_csv(join(DATA_FOLDER, fname), index_col=0)
    for fname in ('df_man_wine.csv', 's_man_cluster_wine.csv', 'x_man_wine.csv')
)

# ICA
# NOTE(review): unlike the three groups above these are read without
# index_col=0 -- if the files were written with an index column it will show
# up as an extra feature; confirm against how the ICA files were saved.
df_ica_wine, s_ica_cluster_wine, x_ica_wine = (
    pd.read_csv(join(DATA_FOLDER, fname))
    for fname in ('df_ica_wine.csv', 's_ica_cluster_wine.csv', 'x_ica_wine.csv')
)

In [None]:
# dimension reduced shroom dataset

# PCA
df_pca_shroom, s_pca_cluster_shroom, x_pca_shroom = (
    pd.read_csv(join(DATA_FOLDER, fname), index_col=0)
    for fname in ('df_pca_shroom.csv', 's_pca_cluster_shroom.csv', 'x_pca_shroom.csv')
)

# Randomized projection
df_rca_shroom, s_rca_cluster_shroom, x_rca_shroom = (
    pd.read_csv(join(DATA_FOLDER, fname), index_col=0)
    for fname in ('df_rca_shroom.csv', 's_rca_cluster_shroom.csv', 'x_rca_shroom.csv')
)

# Manifold learning
df_man_shroom, s_man_cluster_shroom, x_man_shroom = (
    pd.read_csv(join(DATA_FOLDER, fname), index_col=0)
    for fname in ('df_man_shroom.csv', 's_man_cluster_shroom.csv', 'x_man_shroom.csv')
)

# ICA
# NOTE(review): unlike the three groups above these are read without
# index_col=0 -- if the files were written with an index column it will show
# up as an extra feature; confirm against how the ICA files were saved.
df_ica_shroom, s_ica_cluster_shroom, x_ica_shroom = (
    pd.read_csv(join(DATA_FOLDER, fname))
    for fname in ('df_ica_shroom.csv', 's_ica_cluster_shroom.csv', 'x_ica_shroom.csv')
)

# Redo Clustering on Dimension-Reduced Datasets

In [None]:
## Generate transformed datasets for dimension reduced wine set
# NOTE(review): y_wine comes from an unseeded shuffle in this notebook while
# the x_*_wine frames are loaded from disk -- confirm their rows line up.

# PCA-reduced wine data
(df_km_pca_wine, s_km_pca_cluster_wine,
 x_km_pca_wine) = perform_kmeans(x_pca_wine, y_wine)
(df_em_pca_wine, df_em_score_pca_wine,
 s_em_cluster_pca_wine, x_em_pca_wine) = perform_em(x_pca_wine, y_wine)

# randomized-projection-reduced wine data
(df_km_rca_wine, s_km_rca_cluster_wine,
 x_km_rca_wine) = perform_kmeans(x_rca_wine, y_wine)
(df_em_rca_wine, df_em_score_rca_wine,
 s_em_cluster_rca_wine, x_em_rca_wine) = perform_em(x_rca_wine, y_wine)

# manifold-reduced wine data (k-means only in this cell)
(df_km_man_wine, s_km_man_cluster_wine,
 x_km_man_wine) = perform_kmeans(x_man_wine, y_wine)

# clustering using wine dataset dimension reduced using ica
(df_km_ica_wine, s_km_ica_cluster_wine,
 x_km_ica_wine) = perform_kmeans(x_ica_wine, y_wine)
(df_em_ica_wine, df_em_score_ica_wine,
 s_em_cluster_ica_wine, x_em_ica_wine) = perform_em(x_ica_wine, y_wine)

In [None]:
## Generate transformed datasets for dimension reduced shroom set
# NOTE(review): y_shroom comes from an unseeded shuffle in this notebook while
# the x_*_shroom frames are loaded from disk -- confirm their rows line up.

# PCA-reduced shroom data
(df_km_pca_shroom, s_km_pca_cluster_shroom,
 x_km_pca_shroom) = perform_kmeans(x_pca_shroom, y_shroom)
(df_em_pca_shroom, df_em_score_pca_shroom,
 s_em_cluster_pca_shroom, x_em_pca_shroom) = perform_em(x_pca_shroom, y_shroom)

# randomized-projection-reduced shroom data
(df_km_rca_shroom, s_km_rca_cluster_shroom,
 x_km_rca_shroom) = perform_kmeans(x_rca_shroom, y_shroom)
(df_em_rca_shroom, df_em_score_rca_shroom,
 s_em_cluster_rca_shroom, x_em_rca_shroom) = perform_em(x_rca_shroom, y_shroom)

# manifold-reduced shroom data
(df_km_man_shroom, s_km_man_cluster_shroom,
 x_km_man_shroom) = perform_kmeans(x_man_shroom, y_shroom)
(df_em_man_shroom, df_em_score_man_shroom,
 s_em_cluster_man_shroom, x_em_man_shroom) = perform_em(x_man_shroom, y_shroom)

# clustering using shroom dataset dimension reduced using ica
(df_km_ica_shroom, s_km_ica_cluster_shroom,
 x_km_ica_shroom) = perform_kmeans(x_ica_shroom, y_shroom)
(df_em_ica_shroom, df_em_score_ica_shroom,
 s_em_cluster_ica_shroom, x_em_ica_shroom) = perform_em(x_ica_shroom, y_shroom)

In [41]:
# Perform clustering only for EM
# One perform_em call per dimension-reduction method: PCA, randomized
# projection, manifold, ICA (in that order).
(df_em_pca_wine, df_em_score_pca_wine,
 s_em_cluster_pca_wine, x_em_pca_wine) = perform_em(x_pca_wine, y_wine)

(df_em_rca_wine, df_em_score_rca_wine,
 s_em_cluster_rca_wine, x_em_rca_wine) = perform_em(x_rca_wine, y_wine)

(df_em_man_wine, df_em_score_man_wine,
 s_em_cluster_man_wine, x_em_man_wine) = perform_em(x_man_wine, y_wine)

(df_em_ica_wine, df_em_score_ica_wine,
 s_em_cluster_ica_wine, x_em_ica_wine) = perform_em(x_ica_wine, y_wine)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [42]:
## Generate transformed datasets for dimension reduced shroom set
# EM only; one perform_em call per dimension-reduction method: PCA,
# randomized projection, manifold, ICA (in that order).
(df_em_pca_shroom, df_em_score_pca_shroom,
 s_em_cluster_pca_shroom, x_em_pca_shroom) = perform_em(x_pca_shroom, y_shroom)

(df_em_rca_shroom, df_em_score_rca_shroom,
 s_em_cluster_rca_shroom, x_em_rca_shroom) = perform_em(x_rca_shroom, y_shroom)

(df_em_man_shroom, df_em_score_man_shroom,
 s_em_cluster_man_shroom, x_em_man_shroom) = perform_em(x_man_shroom, y_shroom)

(df_em_ica_shroom, df_em_score_ica_shroom,
 s_em_cluster_ica_shroom, x_em_ica_shroom) = perform_em(x_ica_shroom, y_shroom)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [51]:
## save transformed datasets for dimension reduced wine set
# Each entry is written to '<key>.csv' in the current working directory.
# The manifold BIC table now goes to 'df_em_man_wine.csv'; it used to be
# written to 'df_km_man_wine.csv', the name used for k-means results.
# NOTE(review): cached results are read from DATA_FOLDER elsewhere in this
# notebook -- confirm whether these files should be written there as well.
wine_em_outputs = {
    'df_em_pca_wine': df_em_pca_wine,
    'df_em_score_pca_wine': df_em_score_pca_wine,
    's_em_cluster_pca_wine': s_em_cluster_pca_wine,
    'x_em_pca_wine': x_em_pca_wine,

    'df_em_rca_wine': df_em_rca_wine,
    'df_em_score_rca_wine': df_em_score_rca_wine,
    's_em_cluster_rca_wine': s_em_cluster_rca_wine,
    'x_em_rca_wine': x_em_rca_wine,

    'df_em_man_wine': df_em_man_wine,
    'df_em_score_man_wine': df_em_score_man_wine,
    's_em_cluster_man_wine': s_em_cluster_man_wine,
    'x_em_man_wine': x_em_man_wine,

    'df_em_ica_wine': df_em_ica_wine,
    'df_em_score_ica_wine': df_em_score_ica_wine,
    's_em_cluster_ica_wine': s_em_cluster_ica_wine,
    'x_em_ica_wine': x_em_ica_wine,
}
for out_name, out_frame in wine_em_outputs.items():
    out_frame.to_csv(f'{out_name}.csv')

In [52]:
## save transformed datasets for dimension reduced shroom set
# Each entry is written to '<key>.csv' in the current working directory.
# The manifold BIC table now goes to 'df_em_man_shroom.csv'; it used to be
# written to 'df_km_man_shroom.csv', the name used for k-means results.
# NOTE(review): cached results are read from DATA_FOLDER elsewhere in this
# notebook -- confirm whether these files should be written there as well.
shroom_em_outputs = {
    'df_em_pca_shroom': df_em_pca_shroom,
    'df_em_score_pca_shroom': df_em_score_pca_shroom,
    's_em_cluster_pca_shroom': s_em_cluster_pca_shroom,
    'x_em_pca_shroom': x_em_pca_shroom,

    'df_em_rca_shroom': df_em_rca_shroom,
    'df_em_score_rca_shroom': df_em_score_rca_shroom,
    's_em_cluster_rca_shroom': s_em_cluster_rca_shroom,
    'x_em_rca_shroom': x_em_rca_shroom,

    'df_em_man_shroom': df_em_man_shroom,
    'df_em_score_man_shroom': df_em_score_man_shroom,
    's_em_cluster_man_shroom': s_em_cluster_man_shroom,
    'x_em_man_shroom': x_em_man_shroom,

    'df_em_ica_shroom': df_em_ica_shroom,
    'df_em_score_ica_shroom': df_em_score_ica_shroom,
    's_em_cluster_ica_shroom': s_em_cluster_ica_shroom,
    'x_em_ica_shroom': x_em_ica_shroom,
}
for out_name, out_frame in shroom_em_outputs.items():
    out_frame.to_csv(f'{out_name}.csv')