# Ensemble various Clustering methods

We don't have information about the number of clusters so we can run multiple models with different parameters and then allow the models to vote.  The cluster assignment with the most votes is the one we will assign the data point to. 


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer

sns.set_style('darkgrid')

In [None]:
# Competition inputs: the sample submission (it receives a "Predicted" column at
# the end of the notebook) and the feature table that gets clustered.
submission = pd.read_csv("../input/tabular-playground-series-jul-2022/sample_submission.csv")
df = pd.read_csv("../input/tabular-playground-series-jul-2022/data.csv")

In [None]:
# Remove the row identifier so it is not fed to the models as a feature.
# errors="ignore" keeps this cell re-runnable: once "id" is gone, running the
# cell again is a no-op instead of raising KeyError.
df = df.drop(columns="id", errors="ignore")
# Keep the remaining column names; the scalers return plain arrays and the
# preprocessing cell uses `cols` to rebuild a labelled DataFrame.
cols = df.columns.tolist()

In [None]:
# Drop features that do not help us with clustering: f_00..f_06 and f_14..f_21.
drop_feats = [f'f_0{i}' for i in range(7)]
drop_feats += [f'f_{i}' for i in range(14, 22)]
# The loaded feature table is `df`; the original referenced an undefined `data`
# and failed with NameError on a fresh kernel.
data_crop = df.drop(columns=drop_feats)
# NOTE(review): the preprocessing cell below is fit on `df`, not on `data_crop` -
# confirm which table is meant to be clustered.
# Show only the first rows instead of dumping the whole frame.
data_crop.head()

## Preprocessing
- Here we try out different preprocessing pipelines

In [None]:
# Two-stage preprocessing: RobustScaler first, then PowerTransformer on its
# output. Both return bare arrays, so the result is wrapped back into a
# DataFrame carrying the original column names.
robust_scaled = RobustScaler().fit_transform(df)
power_transformed = PowerTransformer().fit_transform(robust_scaled)
X_scaled = pd.DataFrame(data=power_transformed, columns=cols)

## Additional Hyperparameters
- We now define a set of hyperparameters that we are **not** going to search values for. 
- We simply set them to be the same for all instances of our algorithm. 

**Plotting the data before we start searching**

In [None]:
# Project the scaled features with a whitened PCA and scatter the first two
# components for a visual look at the data before any clustering is run.
pca = PCA(whiten=True, random_state=10)
X_pca = pca.fit_transform(X_scaled)
PCA_df = pd.DataFrame(X_pca[:, :2], columns=["PCA_1", "PCA_2"])

plt.figure(figsize=(14, 14))
sns.scatterplot(x="PCA_1", y="PCA_2", data=PCA_df, color='Red', s=3);

# Brute Force 🔥
## Searching for the optimal hyperparameters
- We now search the range of possible values to assign to our algorithm hyperparameters
- **Note:** The final one we use each time is named `preds_1`, which allows it to be used at the end of the notebook for submission.

In [None]:
# Registry of fitted models' predictions: model name -> array of cluster labels.
models = {}

def ensemble_models(models):
    """Print the registered model names and return their majority-vote labels.

    Cluster ids are arbitrary per model (cluster 3 of one model may be cluster 0
    of another), so raw labels cannot be compared directly. Every model is first
    relabelled to agree as much as possible with the first model (one-to-one
    matching that maximises the number of samples on which the two agree), and
    only then are the votes counted.

    Parameters
    ----------
    models : dict
        Maps a model name to its 1-D sequence of integer cluster labels, one per
        sample. All sequences must have the same length. Any other iterable of
        names is still accepted and printed, as before, but yields no labels.

    Returns
    -------
    numpy.ndarray or None
        The winning label for every sample, expressed in the first model's label
        ids. Ties are resolved in favour of the first model's label when it is
        among the tied ones, otherwise in favour of the lowest label code.
        ``None`` when ``models`` is empty or is not a dict.

    Raises
    ------
    ValueError
        If a model's label array differs in length from the first model's.
    """
    names = list(models)
    for m in names:
        print(m)

    if not isinstance(models, dict) or not names:
        return None

    label_sets = [np.asarray(models[name]).ravel() for name in names]
    reference = label_sets[0]
    if reference.size == 0:
        return reference.copy()

    # Work on dense codes 0..k-1 so labels can index the contingency table.
    ref_ids, ref_codes = np.unique(reference, return_inverse=True)
    ref_codes = ref_codes.ravel()
    aligned = [ref_codes]

    for name, labels in zip(names[1:], label_sets[1:]):
        if labels.shape != reference.shape:
            raise ValueError(
                f"model {name!r} has {labels.size} labels, expected {reference.size}"
            )
        ids, codes = np.unique(labels, return_inverse=True)
        codes = codes.ravel()
        # Square (zero-padded) table so every cluster of this model receives a
        # target even when the two models use a different number of clusters.
        size = max(len(ref_ids), len(ids))
        overlap = np.zeros((size, size), dtype=np.int64)
        np.add.at(overlap, (codes, ref_codes), 1)
        rows, matched = linear_sum_assignment(overlap, maximize=True)
        relabel = np.empty(size, dtype=np.int64)
        relabel[rows] = matched
        aligned.append(relabel[codes])

    votes = np.stack(aligned)
    n_classes = int(votes.max()) + 1
    sample_idx = np.arange(votes.shape[1])
    tally = np.zeros((n_classes, votes.shape[1]))
    for row in votes:
        tally[row, sample_idx] += 1.0
    # Half a vote extra for the reference model: never overturns a strict
    # majority, but decides ties in the reference model's favour.
    tally[ref_codes, sample_idx] += 0.5
    winners = tally.argmax(axis=0)

    # Codes beyond the reference's own clusters (a model with more clusters than
    # the reference) are given fresh integer ids after the reference's largest.
    if n_classes > len(ref_ids):
        extra_ids = ref_ids.max() + 1 + np.arange(n_classes - len(ref_ids))
        ref_ids = np.concatenate([ref_ids, extra_ids])
    return ref_ids[winners]

In [None]:
# Settings shared by every mixture model fitted below; only n_components is
# passed separately at each call site.
SEED = 620
additional_hyperparams = {
    "covariance_type": 'full',
    "random_state": SEED,
    "n_init": 5,
    "tol": .01,
}

In [None]:
# Fit a 7-component Gaussian mixture on the scaled features with the shared
# settings, then register its cluster labels under the model's name.
gm = GaussianMixture(**additional_hyperparams, n_components=7)
gm_preds = gm.fit_predict(X_scaled)
models.update({'GaussianMixture': gm_preds})

In [None]:
# Same setup as the Gaussian mixture cell, using the Bayesian variant; its
# cluster labels are registered under the model's name as well.
bgm = BayesianGaussianMixture(**additional_hyperparams, n_components=7)
bgm_preds = bgm.fit_predict(X_scaled)
models.update({'BayesianGaussianMixture': bgm_preds})

In [None]:
# Print the name of every model whose predictions were registered in `models`.
ensemble_models(models)

In [None]:
# Refit PCA keeping only two components (and without the whiten=True flag used
# for the earlier overview plot) to get 2-D coordinates for the cluster plots.
# Note that this rebinds the `pca` variable created by that earlier cell.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X_scaled)

In [None]:
# Scatter the 2-D PCA projection coloured by the GaussianMixture labels.
# The plotting frame gets its own name: rebinding `df` here would overwrite the
# feature table and break any re-run of the preprocessing cell.
gm_plot_df = pd.DataFrame({"x" : reduced_data[:,0], "y" : reduced_data[:,1], "clusters" : gm_preds})
plt.figure(figsize=(20, 10))
sns.scatterplot(data=gm_plot_df, x="x", y="y", hue="clusters", palette="deep")
plt.title("GaussianMixture clusters (2-D PCA projection)");

In [None]:
# Scatter the 2-D PCA projection coloured by the BayesianGaussianMixture labels.
# The plotting frame gets its own name: rebinding `df` here would overwrite the
# feature table and break any re-run of the preprocessing cell.
bgm_plot_df = pd.DataFrame({"x" : reduced_data[:,0], "y" : reduced_data[:,1], "clusters" : bgm_preds})
plt.figure(figsize=(20, 10))
sns.scatterplot(data=bgm_plot_df, x="x", y="y", hue="clusters", palette="deep")
plt.title("BayesianGaussianMixture clusters (2-D PCA projection)");

# Submission

In [None]:
# `preds` is not defined anywhere in this notebook, so the original assignment
# failed with NameError. Use the labels handed back by ensemble_models when it
# provides them; otherwise fall back to the GaussianMixture labels.
# (The call also prints the registered model names once more.)
# NOTE(review): confirm GaussianMixture is the intended fallback model.
ensemble_preds = ensemble_models(models)
submission["Predicted"] = gm_preds if ensemble_preds is None else ensemble_preds

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing the submission file.
submission_path = Path('../output/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)