# Clustering and Deep Learning Template

In [None]:
# imports

# general
import pandas as pd
import numpy as np

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# modeling
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tensorflow.keras import Sequential
from tensorflow.keras import metrics
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# plotting
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.metrics import silhouette_score

# 1 Clustering - Cleaning (1.25)

## 1.1 Clustering data file is correctly loaded (0.25)

In [None]:
# load data
# path to the clustering CSV file; placeholder, fill in before running
path = ""
# read_csv requires the file location, so pass the path defined above
df = pd.read_csv(path)

## 1.2 Data is properly cleaned (1)

In [None]:
# make all string values lowercase so entries that differ only by case match;
# non-string values are passed through unchanged
# isinstance (rather than an exact type comparison) also covers str subclasses
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
# check: print the DataFrame summary (columns, non-null counts, dtypes)
df.info()

In [None]:
# remove rows that are exact duplicates of another row
df = df.drop_duplicates()

# check: count of duplicated rows still present
df.duplicated().sum()

In [None]:
# determine number of missing values in each column
df.isna().sum()

# observation from the output above: no missing values

In [None]:
# inspect data (all dtypes are int64)
# describe() shows summary statistics for the numeric columns

df.describe()

# 2 Clustering (6.5)

## 2.1 Data is properly prepared for clustering (1)

In [None]:
# create copy of data to use for part 1 (clustering)
# working on a copy keeps later changes to df1 from altering df
df1 = df.copy()

In [None]:
# no need for a preprocessor object; all data only needs to be scaled
# the scaler is only instantiated here; it is fit on the data in a later cell

scaler = StandardScaler()

In [None]:
# fit the scaler on every column of df1, then apply it to the same data
scaler.fit(df1)
scaled_data = scaler.transform(df1)

# check: preview the first five scaled rows
scaled_data[:5]

## 2.2 KMeans model is fit on data (2)

## 2.3 Compare inertia over 2-10 clusters (0.5)

## 2.4 Compare silhouette score over 2-10 clusters (0.5)

In [None]:
# candidate numbers of clusters to compare (k = 2 through 10)
k_values = list(range(2, 11))

# create empty lists for inertias and silhouette scores
# (filled in the same order as k_values)
inertias = []
silhouette_scores = []

# fit one KMeans model per k on the scaled data and store both metrics
for k in k_values:
    kmeans = KMeans(n_clusters = k, random_state = 42)
    kmeans.fit(scaled_data)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(scaled_data, kmeans.labels_))

# visualize plots side by side
fig, axes = plt.subplots(ncols = 2, figsize = (10, 5))

# inertia
axes[0].set_title('inertia plot')
axes[0].plot(k_values, inertias, marker = '.')
axes[0].set_ylabel('inertia')
axes[0].set_xlabel('clusters')

# find the knee point for inertia plot
knee_locator = KneeLocator(k_values,
                           inertias,
                           curve = 'convex',
                           direction = 'decreasing',
                           interp_method = 'polynomial')
knee = knee_locator.knee

# knee is None when the locator finds no knee point; drawing a vertical
# line at None would raise, so only mark the knee when one was found
if knee is not None:
    axes[0].axvline(x = knee, color = 'red')

# silhouette scores
axes[1].set_title('silhouette scores plot')
axes[1].plot(k_values, silhouette_scores, marker = '.')
axes[1].set_ylabel('silhouette scores')
axes[1].set_xlabel('clusters')

# trailing semicolon suppresses the return value in the notebook output
fig.tight_layout();

## 2.5 Explain choice of number of clusters based on silhouette score or inertia (0.25)

The knee locator on the inertia plot identifies 6 clusters as the knee point of the plot. Visually, I can also see a slight knee in the plot at 7 clusters. No number of clusters below 6 or above 7 appears to be an inflection point in the plot.

The silhouette scores plot indicates that 2 clusters would be a good number for this data, and the second-best would be 7 or 8, both of which have significantly lower silhouette scores than 2, but comparable silhouette scores to each other.

Seven clusters seems to be a reasonable compromise between the inertia and silhouette score plots. The inertia plot suggests 6 or 7, and the silhouette scores plot suggests 2, 8, or possibly 7. So, I will proceed with 7 clusters for this analysis.

In [None]:
# build and fit the final model with the chosen 7 clusters
# (fit returns the fitted estimator, so it can be chained)
kmeans = KMeans(n_clusters = 7, random_state = 42).fit(scaled_data)

# record each row's assigned cluster as a new column in df1
df1['cluster'] = kmeans.labels_

# check: preview the first ten rows with their cluster labels
df1.head(10)

In [None]:
# table of per-cluster means for every column
# reset_index() turns the cluster labels from the index into a regular column
cluster_groups = df1.groupby('cluster').mean().reset_index()
cluster_groups

## 2.6 1st explanatory visualization to describe a trend in clusters (1)

In [None]:
# visualize the clusters' aggregate means with bar charts, one per feature
# every column after the leading 'cluster' column is plotted
feature_cols = cluster_groups.columns[1:]

# three charts per row; the number of rows is derived from the number of
# features instead of being hard-coded (24 features -> 8 rows, 9 x 24 figure)
n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols,
                         figsize = (9, 3 * n_rows),
                         squeeze = False)

# axes.ravel() flattens array so the grid can be walked in a single loop
axes = axes.ravel()

# plot each feature's mean per cluster on its own axis
for ax, col in zip(axes, feature_cols):
    ax.bar(cluster_groups['cluster'], cluster_groups[col])
    ax.set_title(f"Mean {col}")

# hide any unused axes left over in the last row
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

# trailing semicolon suppresses the return value in the notebook output
fig.tight_layout();

## 2.7 Interprets 1st visualization to describe a trend in clusters (0.5)

interpretation

## 2.8 2nd explanatory visualization shows a difference between clusters (0.5)

In [None]:
# scatterplot of two features (one on x and one on y axis)
# hue = clusters

## 2.9 Interprets 2nd visualization to describe a trend in clusters (0.25)

interpretation

# 3 Modeling (11)

## 3.1 Modeling data file is correctly loaded (0.25)

In [None]:
# load data
# path to the modeling CSV file; placeholder, fill in before running
path = ""
# read_csv requires the file location, so pass the path defined above
df = pd.read_csv(path)

In [None]:
# create copy of df to use for part 2 (modeling)
# working on a copy keeps later changes to df2 from altering df
df2 = df.copy()

## 3.2 Train/test split is performed (3)

In [None]:
# name of the column the model will predict
target = 'satisfied'

# features (X) are every column except the target; y is the target column
X = df2.drop(columns = [target])
y = df2[target]

# check
print(f"y:\n{y}")
print(f"X:\n{X}")

In [None]:
# hold out part of the data for model validation;
# random_state fixes the shuffle so the split is reproducible
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state = 42)

# check: the row counts of each X / y pair should match
print(f"shape of X_train: {X_train.shape}")
print(f"shape of X_test: {X_test.shape}")
print(f"shape of y_train: {y_train.shape}")
print(f"shape of y_test: {y_test.shape}")

## 3.3 Data is prepared for PCA without data leakage (0.5)

## 3.4 PCA is performed without data leakage (1)

## 3.5 Sequential deep learning model 1 is created and fit on data (1)

## 3.6 Reasons for changes between models 1 and 2 are explained in text (0.25)

## 3.7 Sequential deep learning model 2 is created and fit on data (0.5)

## 3.8 Reasons for changes between models 2 and 3 are explained in text (0.25)

## 3.9 Sequential deep learning model 3 is created and fit on data (0.25)

## 3.10 Regularization is used with at least 1 model (0.5)

## 3.11 Each model's final layer is appropriate to the prediction type (1)

## 3.12 Model loss function is appropriate to prediction type (1)

## 3.13 Final model is chosen (0.5)

## 3.14 Final model choice is justified in text (0.25)

## 3.15 Evaluates final model with multiple appropriate metrics (0.25)

## 3.16 Summary relates final model performance to business problem (0.5)

# 4 Other (1.5)

## 4.1 File requirements: Python code for part 1 (0.25)

## 4.2 File requirements: Python code for part 2 (0.25)

## 4.3 Code: Organization, structure, indentation, comments (0.25)

## 4.4 Code: Unnecessary code or errors in code (0.25)

## 4.5 Code: Unnecessary imports (0.25)

## 4.6 Code: Unnecessary files (0.25)

# Total points

In [1]:
# total of the point values for sections 1 through 4
sum([1.25, 6.5, 11, 1.5])

20.25