In [None]:
# import kagglehub

# path = kagglehub.dataset_download("hrokrin/the-largest-diamond-dataset-currely-on-kaggle")
# filepath = path + '\\diamonds.csv'
# filepath

# Libraries

In [5]:
import pickle
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

from imblearn.ensemble import BalancedRandomForestClassifier

import gower

# Helper functions

### `info_df`

In [2]:
def info_df(df):
    '''
    Prints and returns a per-column summary of missing data and dtypes.
    Inputs:
        df: Dataframe to summarize
    Returns:
        Dataframe with one row per column of df and the columns
        name, non_null, null, non_null_pct (rounded to a whole percent), type
    '''
    missing = df.isnull().sum().values
    present = len(df) - missing

    summary = pd.DataFrame({
        "name": df.columns,
        "non_null": present,
        "null": missing,
        "non_null_pct": np.round(100 * present / (present + missing), 0),
        "type": df.dtypes.values,
    })

    print(summary)
    print()
    print(f'{df.shape[0]:,} rows')
    return summary

### `plot_corr_heatmap`

In [3]:
def plot_corr_heatmap(df, method):
    '''
    Plots an annotated heatmap of the correlation matrix for df.
    Non-numeric columns are ignored (numeric_only=True).
    Inputs:
        df: Dataframe to compute pairwise column correlations
        method: {‘pearson’, ‘kendall’, ‘spearman’} or callable, passed to DataFrame.corr
    Returns:
        None. The heatmap is drawn on a new 10x10 matplotlib figure.
    '''

    # Adapted from the seaborn examples
    # https://seaborn.pydata.org/examples/many_pairwise_correlations.html
    sns.set(style="white")

    # Compute the matrix once, with the requested method and numeric columns only.
    # (An unused upper-triangle mask used to be built from a plain df.corr() call, which
    # ignores `method` and raises on non-numeric columns in pandas 2.x.)
    corr = df.corr(method=method, numeric_only=True)

    # Set up the matplotlib figure
    _, ax = plt.subplots(figsize=(10, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the full (unmasked) heatmap with the correct aspect ratio
    sns.heatmap(corr, cmap=cmap, vmax=1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax)

### `print_scores`

In [4]:
def print_scores(training_score, testing_score, metric='R^2'):
    '''
    Prints the training and testing scores, each to 4 significant digits,
    labeled with the name of the metric.
    '''
    for split_name, score in (('Training', training_score), ('Testing', testing_score)):
        print(f'{split_name} {metric}: {score:.4}')

# Load data

## Initial load & inspection

In [6]:
# Location of diamonds.csv inside the kagglehub cache, built from the current user's home
# directory so the notebook is not tied to one machine or user account.
# NOTE(review): this mirrors the cache location used on the original author's machine;
# adjust it if the dataset was downloaded somewhere else.
filepath = (
    Path.home() / '.cache' / 'kagglehub' / 'datasets' / 'hrokrin'
    / 'the-largest-diamond-dataset-currely-on-kaggle' / 'versions' / '1' / 'diamonds.csv'
)

In [None]:
# First pass: load with pandas' inferred dtypes, using the first CSV column as the index.
# The deep memory footprint (in bytes) is recorded so it can be compared with the typed
# reload later in the notebook.
raw_diamonds = pd.read_csv(filepath, index_col=0)
raw_frame_memory = raw_diamonds.memory_usage(index=True, deep=True).sum()

In [None]:
raw_diamonds_info = info_df(raw_diamonds);

In [None]:
# Fixed seed so the same 10 rows are displayed on every run
raw_diamonds.sample(10, random_state=42)

Review all categorical feature domains.  Get the values to store in the dictionary of `CategoricalDtype` definitions below.

In [None]:
raw_diamonds.color.unique()

## Revised load

1. Read "unknown" as NaN
2. Create & assign customized categorical data types
3. Read `eye_clean` value of 'E1' as NaN, by excluding it from the domain specification
    - The rating is ambiguous
    - This rating only occurs in rows attributed to the GIA lab, whereas the GIA (Gemological Institute of America) professes not to use eye clean as a grading factor.
    - 300 of ~220K rows - only about 0.14% of the data set
4. Rename `cut` to `shape` (avoid confusion with traditional 'cut' which here is `cut_quality`) and `color` to `clear_color`

### Define categorical variables

In [7]:
# Unordered (nominal) columns all get the generic pandas 'category' dtype
nominal_variables = ['cut', 'lab', 'fancy_color_dominant_color', 'fancy_color_secondary_color', 'fancy_color_overtone', 'fancy_color_intensity']
nominal_cat = dict.fromkeys(nominal_variables, 'category')

# Scales that are shared by two columns
finish_grades = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
girdle_grades = ['XTN', 'VTN', 'STN', 'TN', 'M', 'STK', 'TK', 'VTK', 'XTK']

# Quality categories are ordered from worst to best
ordinal_levels = {
    'cut_quality': ['Fair', 'Good', 'Very Good', 'Excellent', 'Ideal'],
    'color': ['M', 'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D'],
    'clarity': ['I3', 'I2', 'I1', 'SI3', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],
    'symmetry': finish_grades,
    'polish': finish_grades,
    'eye_clean': ['No', 'Borderline', 'Yes'],
    'culet_size': ['EL', 'VL', 'L', 'SL', 'M', 'S', 'VS', 'N'],
    'culet_condition': ['Chipped', 'Abraded', 'Pointed'],
    'girdle_min': girdle_grades,
    'girdle_max': girdle_grades,
}

# One ordered CategoricalDtype per ordinal column; values outside a list are read as NaN
ordinal_cat = {
    col_name: CategoricalDtype(levels, ordered=True)
    for col_name, levels in ordinal_levels.items()
}

ordinal_variables = list(ordinal_cat)

### Revised load

In [8]:
# Typed reload: 'unknown' becomes NaN and every categorical column gets its declared dtype.
# The two renames are chained so the frame is built in a single expression.
diamonds = (
    pd.read_csv(filepath, index_col=0, na_values=['unknown'], dtype={**ordinal_cat, **nominal_cat})
    .rename(columns={'cut':'shape', 'color':'clear_color'})
)
diamonds_info = info_df(diamonds)

                           name  non_null    null  non_null_pct      type
0                         shape    219703       0         100.0  category
1                   clear_color    210541    9162          96.0  category
2                       clarity    219703       0         100.0  category
3                  carat_weight    219703       0         100.0   float64
4                   cut_quality    159096   60607          72.0  category
5                           lab    219703       0         100.0  category
6                      symmetry    219703       0         100.0  category
7                        polish    219703       0         100.0  category
8                     eye_clean     62487  157216          28.0  category
9                    culet_size    133963   85740          61.0  category
10              culet_condition     15319  204384           7.0  category
11                depth_percent    219703       0         100.0   float64
12                table_percent    219

Update variable lists -- they will be used later.

In [None]:
# Mirror the column renames from the revised load ('cut' -> 'shape', 'color' -> 'clear_color').
# The lists are rebuilt instead of using .remove()/.append(): the result is the same on the
# first run, and the cell stays safe to re-run (.remove() raises ValueError once the old name
# is gone, and .append() would add the new name a second time).
nominal_variables = [name for name in nominal_variables if name not in ('cut', 'shape')] + ['shape']
ordinal_variables = [name for name in ordinal_variables if name not in ('color', 'clear_color')] + ['clear_color']

# Everything pandas read as a number is treated as continuous
continuous_variables = list(diamonds.select_dtypes(include = 'number').columns)

print(f'Continuous variables: {continuous_variables}')
print(f'Ordinal variables: {ordinal_variables}')
print(f'Nominal variables: {nominal_variables}')

#### Check memory improvement

In [None]:
# Deep memory footprint of the typed frame, shown next to the raw frame's footprint
# and as a percentage of it.
rev_frame_memory = diamonds.memory_usage(index=True, deep=True).sum()
memory_pct = rev_frame_memory/raw_frame_memory*100

print(f'{raw_frame_memory} -> {rev_frame_memory}')
print(f'{memory_pct:.1f}%')

# Initial data cleaning

## Remove `lab` column

3-letter code indicating the lab that performed the analysis.  We are making an assumption that this does not have an effect on the actual measurements or assessments.

In [9]:
# Remove lab column
diamonds.drop(columns='lab', inplace=True, errors='ignore')

## Check for duplicates

### Are there duplicates?

In [None]:
diamonds.duplicated().any()

### How many?

In [None]:
diamonds[diamonds.duplicated()].shape

### Remove duplicates

In [10]:
# Keep the first occurrence of each fully duplicated row
diamonds = diamonds.drop_duplicates()
diamonds.shape

(215816, 24)

## Encode ordinal variables

- **Ordered** categorical variables get label encoding, i.e., map category to an integer in order. This does not change the number of columns, only their type.
- **Unordered** (nominal) variables get dummy-variable (one-hot) encoding, which will increase the number of columns.  Since some of these columns pertain only to fancy diamonds or only to clear diamonds, this encoding will be done later.

In [11]:
# use .where(...notna()) to keep NaNs and not map them to -1
diamonds = pd.DataFrame({
    var_name: diamonds[var_name].cat.codes.where(diamonds[var_name].notna()) if var_name in ordinal_variables else diamonds[var_name]
    for var_name in diamonds.columns
})
info_df(diamonds);

                           name  non_null    null  non_null_pct      type
0                         shape    215816       0         100.0  category
1                   clear_color    206717    9099          96.0  category
2                       clarity    215816       0         100.0      int8
3                  carat_weight    215816       0         100.0   float64
4                   cut_quality    155573   60243          72.0   float64
5                      symmetry    215816       0         100.0      int8
6                        polish    215816       0         100.0      int8
7                     eye_clean     62238  153578          29.0   float64
8                    culet_size    132964   82852          62.0   float64
9               culet_condition     15314  200502           7.0   float64
10                depth_percent    215816       0         100.0   float64
11                table_percent    215816       0         100.0   float64
12                  meas_length    215

# 💎 Supervised regression: Can a model predict the price of a fancy (colored) diamond or a clear diamond?

Because there are some columns that only apply to 'fancy' or clear diamonds, these two categories are predicted separately.

In [None]:
# True for rows that have a dominant fancy color recorded
colors = diamonds.fancy_color_dominant_color.notna()

# Row counts on each side of that flag
n_fancy = len(colors[colors])
n_clear = len(colors[~colors])
print(n_fancy, n_clear)

## Fancy diamonds

In [None]:
fancy_diamonds = diamonds[diamonds.fancy_color_dominant_color.notna()].copy()
info_df(fancy_diamonds);

### Data cleaning

#### Remove columns with missing values

There are several columns with a high number of null values.  Remove every column that contains any missing value.

In [13]:
# Drop every column that contains at least one missing value
fancy_diamonds = fancy_diamonds.dropna(axis='columns')
fancy_info = info_df(fancy_diamonds)

                          name  non_null  null  non_null_pct      type
0                        shape      9101     0         100.0  category
1                      clarity      9101     0         100.0      int8
2                 carat_weight      9101     0         100.0   float64
3                     symmetry      9101     0         100.0      int8
4                       polish      9101     0         100.0      int8
5                depth_percent      9101     0         100.0   float64
6                table_percent      9101     0         100.0   float64
7                  meas_length      9101     0         100.0   float64
8                   meas_width      9101     0         100.0   float64
9                   meas_depth      9101     0         100.0   float64
10  fancy_color_dominant_color      9101     0         100.0  category
11           total_sales_price      9101     0         100.0     int64

9,101 rows


Note that the `clear_color` column (originally `color`) gets dropped.  It is almost entirely empty for fancy diamonds, as this feature is represented by the `fancy_color` columns.

Of the `fancy_color` columns, the only one with sufficient information is the `...dominant_color` column

Any relationship between fancy color & sales price?

In [None]:
fancy_diamonds.plot.scatter(x='fancy_color_dominant_color', y='total_sales_price')

#### Look for correlated features

Instead of Pearson's coefficient, use Kendall's tau correlation function, which works on label-encoded ordinal variables as well as continuous variables.

Kendall's tau does not work for dummy (one-hot encoded) variables, so must leave those out.

In [None]:
plot_corr_heatmap(fancy_diamonds.select_dtypes(include="number"), method='kendall')

The volume measurements appear strongly correlated with the weight measurement (carat).  Double-check with Pearson's correlation.

In [None]:
volume_measures = ['meas_length', 'meas_width', 'meas_depth']
plot_corr_heatmap(fancy_diamonds[volume_measures + ['carat_weight']], method='pearson')

### Remove correlated features

In [16]:
fancy_diamonds.drop(columns=volume_measures, inplace=True, errors='ignore')

### Encode nominal variables with dummy variables

In [None]:
fancy_nominal_variables = [col_name for col_name in fancy_diamonds.columns if col_name in nominal_variables]
fancy_nominal_variables

In [None]:
# Numeric columns (continuous and label-encoded ordinal) are used as they are
fancy_ordered = fancy_diamonds.select_dtypes(include="number")

# Nominal columns are expanded into 0/1 dummy columns
fancy_nominal = pd.get_dummies(fancy_diamonds[fancy_nominal_variables], dtype='int')

# Put the two halves back together, matching rows on the index
fancy_encoded = pd.merge(fancy_ordered, fancy_nominal, left_index=True, right_index=True)
fancy_encoded.info()

### How well can we predict the price of a colored diamond?

### Setup
- Set features & target
- Create train/test sets

In [None]:
X_fancy = fancy_encoded.drop(columns='total_sales_price')
y_fancy = fancy_encoded['total_sales_price']
X_fancy_train, X_fancy_test, y_fancy_train, y_fancy_test = train_test_split(X_fancy, y_fancy, random_state=42, shuffle=True)

### Linear Regression model

In [None]:
fancy_lr = LinearRegression()
fancy_lr.fit(X_fancy_train, y_fancy_train)
fancy_R2_train = fancy_lr.score(X_fancy_train, y_fancy_train)
fancy_R2_test = fancy_lr.score(X_fancy_test, y_fancy_test)
print_scores(fancy_R2_train, fancy_R2_test)

#### Try without dummy variables

Leave out the one-hot encoded nominal columns: `shape` (the cut shape, originally `cut`) and `fancy_color_dominant_color`.

In [None]:
X_fancy_ordered = fancy_ordered.drop(columns='total_sales_price', errors='ignore')
X_fancy_ordered_train, X_fancy_ordered_test, y_fancy_ordered_train, y_fancy_ordered_test = train_test_split(X_fancy_ordered, y_fancy, random_state=42, shuffle=True)

fancy_ordered_lr = LinearRegression()
fancy_ordered_lr.fit(X_fancy_ordered_train, y_fancy_ordered_train)
fancy_ordered_R2_train = fancy_ordered_lr.score(X_fancy_ordered_train, y_fancy_ordered_train)
fancy_ordered_R2_test = fancy_ordered_lr.score(X_fancy_ordered_test, y_fancy_ordered_test)
print_scores(fancy_ordered_R2_train, fancy_ordered_R2_test)

Worse

## Clear diamonds

In [None]:
# Clear diamonds are the rows with no dominant fancy color; the fancy_* columns are dropped for them
fancy_only_columns = [col_name for col_name in diamonds.columns if col_name.startswith('fancy')]
is_clear = diamonds.fancy_color_dominant_color.isna()

clear_diamonds = diamonds[is_clear].drop(columns=fancy_only_columns)
clear_info = info_df(clear_diamonds)

### Data cleaning

#### Remove columns with > 50% null.

In [18]:
# Names of the columns that are less than 50% populated
mostly_null = clear_info.non_null_pct < 50
drop_cols = clear_info.loc[mostly_null, 'name'].tolist()
print(drop_cols)

['eye_clean', 'culet_condition', 'fluor_color', 'fluor_intensity']


In [None]:
clear_diamonds.drop(columns=drop_cols, inplace=True, errors='ignore')
clear_info = info_df(clear_diamonds)

#### Remove the rest of the missing data by removing rows with missing values

In [20]:
clear_diamonds.dropna(inplace=True)

#### Look for correlations

Exclude the one nominal variable, `shape`

In [None]:
plot_corr_heatmap(clear_diamonds.drop(columns=['shape', 'total_sales_price']), method='kendall')

In [None]:
# Smaller heatmap for slide deck
sns.heatmap(clear_diamonds.drop(columns='total_sales_price').corr(numeric_only=True, method='kendall'), cmap=sns.diverging_palette(220, 10, as_cmap=True))

### Remove correlated variables
Again, the volume measurements are strongly correlated with the weight measurement.

In [21]:
clear_diamonds.drop(columns=volume_measures, inplace=True, errors='ignore')

### Encode the one nominal variable, `shape`

This identifies the cut shape, not the cut quality

In [None]:
print(clear_diamonds['shape'].unique())

In [None]:
# Everything except `shape` is used as it is
clear_ordered = clear_diamonds.drop(columns='shape', errors='ignore')

# `shape` is expanded into one 0/1 column per cut shape
clear_nominal = pd.get_dummies(clear_diamonds['shape'], dtype='int')

# Re-attach the dummy columns, matching rows on the index
clear_encoded = pd.merge(clear_ordered, clear_nominal, left_index=True, right_index=True)

### Pre-processing
- Set features & target
- Create train/test sets

In [None]:
X_clear = clear_encoded.drop(columns='total_sales_price', errors='ignore')
y_clear = clear_encoded['total_sales_price']
X_clear_train, X_clear_test, y_clear_train, y_clear_test = train_test_split(X_clear, y_clear, random_state=42, shuffle=True)

### Linear Regression model

In [None]:
clear_lr = LinearRegression()
clear_lr.fit(X_clear_train, y_clear_train)
R2_clear_train = clear_lr.score(X_clear_train, y_clear_train)
R2_clear_test = clear_lr.score(X_clear_test, y_clear_test)
print_scores(R2_clear_train, R2_clear_test)

# 💎 Supervised Classification:
# Can a model distinguish colored diamonds from clear diamonds even if the color columns are missing?
- Combine `fancy` and `clear` data sets
- Replace the color column with Boolean `is_fancy`

## Prep

- In `fancy_diamonds` rename `fancy_color_dominant_color` to `dominant_color`
- In `clear_diamonds` add `dominant_color` column with value 'clear'

In [22]:
fancy_diamonds.rename(columns={'fancy_color_dominant_color':'dominant_color'}, inplace=True)
clear_diamonds['dominant_color'] = 'clear'

### Combine `fancy` and `clear` into a single dataframe

In [23]:
# Which columns are useful for both sets?
clear_columns = set(clear_diamonds.columns)
fancy_columns = set(fancy_diamonds.columns)
common_columns = list(clear_columns.intersection(fancy_columns))
common_columns

['shape',
 'table_percent',
 'polish',
 'carat_weight',
 'clarity',
 'depth_percent',
 'symmetry',
 'total_sales_price',
 'dominant_color']

In [24]:
all_diamonds = pd.concat([clear_diamonds[common_columns], fancy_diamonds[common_columns]])

### Encode the cut column again

In [None]:
# One 0/1 column per cut shape, named shape_<value>
shape_dummies = pd.get_dummies(all_diamonds['shape'], dtype='int', prefix='shape')
all_diamonds_no_shape = all_diamonds.drop(columns='shape', errors='ignore')

# Swap the original `shape` column for its dummy columns, matching rows on the index
all_diamonds = pd.merge(all_diamonds_no_shape, shape_dummies, left_index=True, right_index=True)

# Column names without spaces
all_diamonds.columns = all_diamonds.columns.str.replace(' ', '_')

In [None]:
all_diamonds.columns

### Add Boolean `is_fancy` and remove `dominant_color` for this exercise

In [None]:
all_diamonds['is_fancy'] = all_diamonds['dominant_color'] != 'clear'

In [None]:
no_color = all_diamonds.drop(columns='dominant_color')
no_color.info()

#### Proportion of fancy diamonds

In [None]:
# Share of all rows that are fancy: the mean of a Boolean column is the fraction of True values.
# (Dividing the fancy count by the clear count gives the fancy:clear ratio, not the proportion.)
fancy_pct = no_color.is_fancy.mean()
print(f'{fancy_pct:.2%}')

## Model: Balanced Random Forest classifier

Use a balanced model because the proportion of fancy diamonds is quite small (~10%)

Notes on the parameters from the documentation: The default of `bootstrap` will change from `True` to `False` in version 0.13. **Bootstrapping is already taken care by the internal sampler using `replacement=True`.**

### Setup
- Set features & target
- Create train/test sets

In [None]:
X_all_no_color = no_color.drop(columns='is_fancy')
y_all_no_color = no_color['is_fancy']
X_all_no_color_train, X_all_no_color_test, y_all_no_color_train, y_all_no_color_test = train_test_split(X_all_no_color, y_all_no_color, random_state=42, shuffle=True)

###  Model

In [None]:
# sampling_strategy='all' resamples every class; replacement=True makes the internal sampler do
# the bootstrapping, so the forest's own bootstrap is switched off.
# random_state is fixed (same seed as the train/test split) so the metrics below are reproducible.
balanced_rfc = BalancedRandomForestClassifier(sampling_strategy='all', replacement=True, bootstrap=False, random_state=42)
balanced_rfc.fit(X_all_no_color_train, y_all_no_color_train)

# Hard class predictions for both splits, used by the confusion matrices and reports
y_all_no_color_train_pred = balanced_rfc.predict(X_all_no_color_train)
y_all_no_color_test_pred = balanced_rfc.predict(X_all_no_color_test)

#### Training results

In [None]:
print(confusion_matrix(y_all_no_color_train, y_all_no_color_train_pred))

In [None]:
print(f'Accuracy: {accuracy_score(y_all_no_color_train, y_all_no_color_train_pred):.2%}')

In [None]:
print(classification_report(y_all_no_color_train, y_all_no_color_train_pred))

In [None]:
# Score with the predicted probability of the positive (fancy) class. Hard True/False predictions
# give roc_curve only one usable threshold, which collapses the curve to a single point joined
# to the corners. predict_proba columns follow balanced_rfc.classes_, i.e. [False, True] for
# this Boolean target, so column 1 is P(is_fancy).
y_all_no_color_train_score = balanced_rfc.predict_proba(X_all_no_color_train)[:, 1]
fpr, tpr, _ = roc_curve(y_all_no_color_train, y_all_no_color_train_score, pos_label=1)

# Find auc
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for the positive class (is_fancy)
plt.figure(figsize=[8,8])

# Plot fpr, tpr
plt.plot(fpr, tpr, color='skyblue', lw = 2, label = 'ROC curve (area = %0.2f)' % roc_auc)

# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('ROC for clear/fancy detection (training)', fontsize=18)
plt.legend(loc="lower right")
plt.show()

#### Testing results

In [None]:
print(confusion_matrix(y_all_no_color_test, y_all_no_color_test_pred))

In [None]:
print(f'Accuracy: {accuracy_score(y_all_no_color_test, y_all_no_color_test_pred):.2%}')

In [None]:
print(classification_report(y_all_no_color_test, y_all_no_color_test_pred))


In [None]:
# Score with the predicted probability of the positive (fancy) class. Hard True/False predictions
# give roc_curve only one usable threshold, which collapses the curve to a single point joined
# to the corners. predict_proba columns follow balanced_rfc.classes_, i.e. [False, True] for
# this Boolean target, so column 1 is P(is_fancy).
y_all_no_color_test_score = balanced_rfc.predict_proba(X_all_no_color_test)[:, 1]
fpr, tpr, _ = roc_curve(y_all_no_color_test, y_all_no_color_test_score, pos_label=1)

# Find auc
roc_auc = auc(fpr, tpr)

# Plot of a ROC curve for the positive class (is_fancy)
plt.figure(figsize=[8,8])

# Plot fpr, tpr
plt.plot(fpr, tpr, color='skyblue', lw = 2, label = 'ROC curve (area = %0.2f)' % roc_auc)

# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('ROC for clear/fancy detection (test)', fontsize=18)
plt.legend(loc="lower right")
plt.show()

# 💎 Unsupervised classification: DBSCAN

## Setup

### Remove the `is_fancy` column from the last exercise

In [None]:
all_diamonds.drop(columns='is_fancy', inplace=True, errors='ignore')

In [None]:
all_diamonds.columns

### Encode `dominant_color`

In [None]:
# One 0/1 column per dominant color, named color_<value>
colors = pd.get_dummies(all_diamonds['dominant_color'], dtype='int', prefix='color')

# Swap `dominant_color` for its dummy columns, matching rows on the index
all_diamonds = pd.merge(
    all_diamonds.drop(columns='dominant_color', errors='ignore'),
    colors,
    left_index=True,
    right_index=True,
)


In [None]:
all_diamonds.info()

## Model

DBSCAN relies on distance measurements to find clusters.  Because there are mixed dtypes in the data set, we will use Gower's distance as the distance metric.  Unlike Euclidean distance, Gower's, a rank-based metric, can handle mixed data types.

Ordinarily we would first scale the data. However Gower's scales the data internally so it is not necessary to perform a separate scaling step.

Gower's distance is not implemented in scikit-learn, so we have to compute the distance matrix separately and pass that to the model.

### Calculate distance matrix

In [None]:
# Calculating the Gower matrix takes ~ 40 minutes, so it is cached as a pickle and only
# computed when the cache file is missing. Delete 'gower_matrix.pickle' to force a rebuild
# (for example after changing all_diamonds).
#
# The earlier "commented out for protection" guard was an empty ''' ''' string that closed
# before the code, so the computation actually ran every time this cell was executed.
#
# Only load a pickle that this notebook wrote itself: unpickling can execute arbitrary code.
try:
    with open('gower_matrix.pickle', 'rb') as handle:
        distance_matrix = pickle.load(handle)
except FileNotFoundError:
    distance_matrix = gower.gower_matrix(all_diamonds)

    with open('gower_matrix.pickle', 'wb') as handle:
        pickle.dump(distance_matrix, handle)



In [None]:
# Retrieve the cached Gower distance matrix from the working directory.
# Only load a pickle from a trusted source: unpickling can execute arbitrary code.
with open('gower_matrix.pickle', 'rb') as handle:
    distance_matrix = pickle.load(handle)

In [None]:
# Sanity check on the loaded matrix: it should be square, with one row/column per diamond.
# (Replaces an unfinished `with open()` stub, which is a SyntaxError and stops Run All.)
distance_matrix.shape

### Choose parameters

Rule of thumb for number of neighbors is 2 x number of features.

Then use an elbow curve to determine the right value for epsilon.

In [None]:
# DBSCAN's min_samples; also reused as k for the k-distance elbow curve
min_samples = 60

In [None]:
# Number of neighbors: k is usually the same as 'min_samples' in DBSCAN
k = min_samples

# Measure neighbor distances on the precomputed Gower matrix, the same metric DBSCAN is given
# in this notebook, so that the elbow read off this curve is usable as eps.
# (Fitting on all_diamonds directly would measure Euclidean distance on the unscaled columns,
# which is on a different scale from the Gower distances.)
neighbors = NearestNeighbors(n_neighbors=k, metric='precomputed')
neighbors_fit = neighbors.fit(distance_matrix)
# Querying with the training matrix itself means each point's nearest neighbor is the point itself
distances, indices = neighbors_fit.kneighbors(distance_matrix)

# Sort the distances of the k-th nearest neighbor
distances = np.sort(distances[:, k-1])

# Plot the k-distance graph (Elbow Curve)
plt.plot(distances)
plt.xlabel("Points")
plt.ylabel(f"{k}-th Nearest Neighbor Distance")
plt.title(f"{k}-th Nearest Neighbor Distance Elbow Curve")
plt.grid(True)
plt.show()


### Run model

In [None]:
'''
epsilon = 0.25
db = DBSCAN(eps=epsilon, min_samples=min_samples, metric = "precomputed").fit(distance_matrix)

# Number of clusters in labels, ignoring noise if present.
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
'''

### Let's try PCA

In [None]:
# Project all_diamonds onto its first two principal components.
# NOTE(review): the features are not standardized before PCA (StandardScaler is imported but
# never used). Presumably wide-range columns such as total_sales_price dominate the
# components; confirm this is intended.
pca = PCA(n_components=2)
all_diamonds_pca = pd.DataFrame(pca.fit_transform(all_diamonds), columns=['PCA_1', 'PCA_2'])
all_diamonds_pca.shape

In [None]:
# Use the values the fitted PCA reports: explained_variance_ratio_ is measured against the total
# variance of the original features. Dividing the variance of the two retained components by
# their own sum always gives a total of 1.0, whatever the data.
explained_variance = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
print(f'total explained variance: {explained_variance_ratio.sum()}' )

In [None]:
explained_variance_ratio.sum()

In [None]:
sns.scatterplot(all_diamonds_pca, x='PCA_1', y='PCA_2')

In [None]:
'''
pca_distance_matrix = gower.gower_matrix(all_diamonds_pca)

with open('pca_gower_matrix.pickle', 'wb') as handle:
    pickle.dump(pca_distance_matrix, handle)
'''

In [None]:
with open('pca_gower_matrix.pickle', 'rb') as handle:
    pca_distance_matrix = pickle.load(handle)

In [None]:
# 🛑✋🏼 Caution - this is likely to crash ✋🏼🛑
epsilon = 0.25
db = DBSCAN(eps=epsilon, min_samples=min_samples, metric = "precomputed").fit(pca_distance_matrix)

# Number of clusters in labels, ignoring noise if present.
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")