In [3]:
pip install scikit-learn-extra -q

Note: you may need to restart the kernel to use updated packages.


In [35]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import plotly.express as px

# Read the insurance CSV into a DataFrame
file_path = '/kaggle/input/insurance/insurance.csv'  # Update this path
data = pd.read_csv(file_path)

# Fit a label encoder on the 'smoker' column and store the integer codes
# alongside the original text column (the head() output shows yes -> 1, no -> 0).
le = LabelEncoder()
smoker_codes = le.fit_transform(data['smoker'])
data['smoker_code'] = smoker_codes

# Feature matrix handed to the clustering cells further down.
# NOTE(review): these three columns live on very different scales (charges in
# the thousands, bmi in the tens, smoker_code 0/1) and are used unscaled by the
# distance-based clusterers - confirm that this is intended.
X = data.loc[:, ['bmi', 'smoker_code', 'charges']]

In [5]:
# Preview the first five rows, including the newly added smoker_code column
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,smoker_code
0,19,female,27.9,0,yes,southwest,16884.924,1
1,18,male,33.77,1,no,southeast,1725.5523,0
2,28,male,33.0,3,no,southeast,4449.462,0
3,33,male,22.705,0,no,northwest,21984.47061,0
4,32,male,28.88,0,no,northwest,3866.8552,0


In [40]:
# Initialize KMeans with k clusters
k = 3
kmeans = KMeans(n_clusters=k, random_state=0)

# Fit the model
kmeans.fit(X)

# Cluster label assigned to every row of X
labels = kmeans.labels_

# Keep the features and their cluster labels together (used by the 3D plot cell)
X_clustered = X.copy()
X_clustered['cluster'] = labels

# 2D scatter plot of charges vs. BMI, coloured by smoking status.
# smoker_code is numeric (0/1), so colouring by it directly yields a continuous
# colour bar and the "no"/"yes" category order never matches anything. Decode
# the codes back to their text labels with the fitted encoder so Plotly draws
# two discrete categories in the requested order.
smoker_plot_df = X_clustered.assign(
    smoker=le.inverse_transform(X_clustered['smoker_code']))
fig = px.scatter(smoker_plot_df, x='bmi', y='charges',
                 color='smoker',
                 title='Relationship between Charges and BMI by Smoking Condition',
                 category_orders={"smoker": ["no", "yes"]})  # Order categories for color

fig.show()





In [7]:
# 3D scatter plot of the current X_clustered frame, coloured by cluster.
# Cluster ids are nominal, so they are cast to strings for the plot only
# (X_clustered itself is left untouched): Plotly then uses one discrete colour
# per cluster instead of a continuous colour bar. The legend is ordered by
# numeric cluster id.
fig = px.scatter_3d(
    X_clustered.astype({'cluster': str}),
    x='bmi', y='smoker_code', z='charges',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='3D Scatter Plot of Clusters')
fig.show()

In [41]:
# Initialize KMedoids with k clusters
k = 3
kmedoids = KMedoids(n_clusters=k, random_state=0)

# Fit the model
kmedoids.fit(X)

# Cluster label assigned to every row of X
labels = kmedoids.labels_

# Keep the features and their cluster labels together (used by the 3D plot cell)
X_clustered = X.copy()
X_clustered['cluster'] = labels

# 2D scatter plot of charges vs. BMI, coloured by smoking status.
# smoker_code is numeric (0/1), so colouring by it directly yields a continuous
# colour bar and the "no"/"yes" category order never matches anything. Decode
# the codes back to their text labels with the fitted encoder so Plotly draws
# two discrete categories in the requested order.
smoker_plot_df = X_clustered.assign(
    smoker=le.inverse_transform(X_clustered['smoker_code']))
fig = px.scatter(smoker_plot_df, x='bmi', y='charges',
                 color='smoker',
                 title='Relationship between Charges and BMI by Smoking Condition',
                 category_orders={"smoker": ["no", "yes"]})  # Order categories for color

fig.show()

In [42]:
# 3D scatter plot of the current X_clustered frame, coloured by cluster.
# Cluster ids are nominal, so they are cast to strings for the plot only
# (X_clustered itself is left untouched): Plotly then uses one discrete colour
# per cluster instead of a continuous colour bar. The legend is ordered by
# numeric cluster id.
fig = px.scatter_3d(
    X_clustered.astype({'cluster': str}),
    x='bmi', y='smoker_code', z='charges',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='3D Scatter Plot of Clusters')
fig.show()

In [43]:
# Initialize Gaussian Mixture Model with k components
k = 3
gmm = GaussianMixture(n_components=k, random_state=0)

# Fit the model
gmm.fit(X)

# Assign every row of X to a mixture component
labels = gmm.predict(X)

# Keep the features and their cluster labels together (used by the 3D plot cell)
X_clustered = X.copy()
X_clustered['cluster'] = labels

# 2D scatter plot of charges vs. BMI, coloured by smoking status.
# smoker_code is numeric (0/1), so colouring by it directly yields a continuous
# colour bar and the "no"/"yes" category order never matches anything. Decode
# the codes back to their text labels with the fitted encoder so Plotly draws
# two discrete categories in the requested order.
smoker_plot_df = X_clustered.assign(
    smoker=le.inverse_transform(X_clustered['smoker_code']))
fig = px.scatter(smoker_plot_df, x='bmi', y='charges',
                 color='smoker',
                 title='Relationship between Charges and BMI by Smoking Condition',
                 category_orders={"smoker": ["no", "yes"]})  # Order categories for color

fig.show()

In [44]:
# 3D scatter plot of the current X_clustered frame, coloured by cluster.
# Cluster ids are nominal, so they are cast to strings for the plot only
# (X_clustered itself is left untouched): Plotly then uses one discrete colour
# per cluster instead of a continuous colour bar. The legend is ordered by
# numeric cluster id.
fig = px.scatter_3d(
    X_clustered.astype({'cluster': str}),
    x='bmi', y='smoker_code', z='charges',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='3D Scatter Plot of Clusters')
fig.show()

In [45]:
# Initialize Spectral Clustering with k clusters
k = 3
spectral = SpectralClustering(n_clusters=k, random_state=0, affinity='nearest_neighbors')

# Fit the model
spectral.fit(X)

# Cluster label assigned to every row of X
labels = spectral.labels_

# Keep the features and their cluster labels together (used by the 3D plot cell)
X_clustered = X.copy()
X_clustered['cluster'] = labels

# 2D scatter plot of charges vs. BMI, coloured by smoking status.
# smoker_code is numeric (0/1), so colouring by it directly yields a continuous
# colour bar and the "no"/"yes" category order never matches anything. Decode
# the codes back to their text labels with the fitted encoder so Plotly draws
# two discrete categories in the requested order.
smoker_plot_df = X_clustered.assign(
    smoker=le.inverse_transform(X_clustered['smoker_code']))
fig = px.scatter(smoker_plot_df, x='bmi', y='charges',
                 color='smoker',
                 title='Relationship between Charges and BMI by Smoking Condition',
                 category_orders={"smoker": ["no", "yes"]})  # Order categories for color

fig.show()


Graph is not fully connected, spectral embedding may not work as expected.



In [46]:
# 3D scatter plot of the current X_clustered frame, coloured by cluster.
# Cluster ids are nominal, so they are cast to strings for the plot only
# (X_clustered itself is left untouched): Plotly then uses one discrete colour
# per cluster instead of a continuous colour bar. The legend is ordered by
# numeric cluster id.
fig = px.scatter_3d(
    X_clustered.astype({'cluster': str}),
    x='bmi', y='smoker_code', z='charges',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='3D Scatter Plot of Clusters')
fig.show()

In [47]:
# Perform t-SNE transformation to reduce to three dimensions
tsne = TSNE(n_components=3, random_state=0)
X_tsne = tsne.fit_transform(X)

# Initialize KMeans with k clusters
k = 3
kmeans = KMeans(n_clusters=k, random_state=0)

# Fit the model on the t-SNE output (not on the original features)
kmeans.fit(X_tsne)

# Cluster label assigned to every embedded point
labels = kmeans.labels_

# Collect the embedding coordinates and their cluster labels in one frame
X_clustered = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3'])
X_clustered['cluster'] = labels

# 2D scatter plot of the first two t-SNE dimensions, coloured by cluster.
# Cluster ids are cast to strings for the plot only, so Plotly uses one
# discrete colour per cluster instead of a continuous colour bar; the legend
# is ordered by numeric cluster id.
fig_2d = px.scatter(
    X_clustered.astype({'cluster': str}), x='TSNE1', y='TSNE2',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='2D Scatter Plot of t-SNE Clusters')
fig_2d.show()





In [48]:
# 3D scatter plot of all three t-SNE dimensions, coloured by cluster.
# Cluster ids are cast to strings for the plot only, so Plotly uses one
# discrete colour per cluster instead of a continuous colour bar; the legend
# is ordered by numeric cluster id.
fig_3d = px.scatter_3d(
    X_clustered.astype({'cluster': str}), x='TSNE1', y='TSNE2', z='TSNE3',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='3D Scatter Plot of t-SNE Clusters')

# Display plots
fig_3d.show()

In [49]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.express as px

# Load the dataset
# NOTE(review): this re-reads the CSV and rebinds `data`, so from here on `data`
# no longer carries the smoker_code column added earlier in the notebook.
file_path = '/kaggle/input/insurance/insurance.csv'  # Update this path
data = pd.read_csv(file_path)

# Selecting numerical features for PCA (float64 / int64 columns only)
numerical_features = data.select_dtypes(include=['float64', 'int64'])

# Standardizing the features
scaler = StandardScaler()
X_std = scaler.fit_transform(numerical_features)

# Applying PCA to reduce to three dimensions
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(X_std)

# Creating a DataFrame with the principal components
principalDf = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2', 'PC3'])

# Performing KMeans clustering on the three components; the label column is
# added only after fitting so it is not part of the clustering input
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(principalDf)
principalDf['Cluster'] = kmeans.labels_

# Visualizing the first three principal components with clusters.
# Cluster ids are cast to strings for the plot only, so Plotly uses one
# discrete colour per cluster instead of a continuous colour bar; the legend
# is ordered by numeric cluster id.
fig = px.scatter_3d(
    principalDf.astype({'Cluster': str}), x='PC1', y='PC2', z='PC3',
    color='Cluster', labels={'Cluster': 'Cluster'},
    category_orders={'Cluster': [str(c) for c in sorted(principalDf['Cluster'].unique())]},
    title='3D PCA of Dataset with KMeans Clustering')

fig.show()






In [52]:
# Standardize the features before density clustering. DBSCAN's eps is a single
# neighbourhood radius shared by every dimension; on the raw features 'charges'
# (thousands) swamps 'bmi' (tens) and 'smoker_code' (0/1), so eps=0.8 would
# effectively be a radius in charge units.
X_scaled = StandardScaler().fit_transform(X)

# Initialize DBSCAN
dbscan = DBSCAN(eps=0.8, min_samples=2)  # Adjust eps and min_samples as needed

# Fit the model and predict the cluster labels (DBSCAN marks noise points as -1)
labels = dbscan.fit_predict(X_scaled)

# Attach the labels to the original, unscaled features so the axes stay readable
X_clustered = X.copy()
X_clustered['cluster'] = labels

# 2D scatter plot of charges vs. BMI, coloured by DBSCAN cluster.
# Cluster ids are cast to strings for the plot only, so Plotly uses one
# discrete colour per cluster instead of a continuous colour bar; the legend
# is ordered by numeric cluster id (noise, -1, first).
fig = px.scatter(
    X_clustered.astype({'cluster': str}), x='bmi', y='charges',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='Relationship between Charges and BMI by DBSCAN Cluster')

fig.show()

In [53]:
# 3D scatter plot of the current X_clustered frame, coloured by cluster.
# Cluster ids are cast to strings for the plot only (X_clustered itself is left
# untouched), so Plotly uses one discrete colour per cluster instead of a
# continuous colour bar; the legend is ordered by numeric cluster id.
fig_3d = px.scatter_3d(
    X_clustered.astype({'cluster': str}), x='bmi', y='charges', z='smoker_code',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='3D Plot: Relationship between Charges, BMI, and Smoking Condition')

# Display plots
fig_3d.show()

In [54]:
# Perform Hierarchical Clustering
hierarchical = AgglomerativeClustering(n_clusters=3)  # You can adjust n_clusters as needed

# Fit the model and predict the cluster labels
labels = hierarchical.fit_predict(X)

# Keep the features and their cluster labels together (used by the 3D plot cell)
X_clustered = X.copy()
X_clustered['cluster'] = labels

# 2D scatter plot of charges vs. BMI, coloured by hierarchical cluster (the
# title now names the cluster colouring rather than smoking condition).
# Cluster ids are cast to strings for the plot only, so Plotly uses one
# discrete colour per cluster instead of a continuous colour bar; the legend
# is ordered by numeric cluster id.
fig_2d = px.scatter(
    X_clustered.astype({'cluster': str}), x='bmi', y='charges',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='2D Plot: Relationship between Charges and BMI by Hierarchical Cluster')

fig_2d.show()

In [55]:
# 3D scatter plot of the current X_clustered frame, coloured by cluster.
# Cluster ids are cast to strings for the plot only (X_clustered itself is left
# untouched), so Plotly uses one discrete colour per cluster instead of a
# continuous colour bar; the legend is ordered by numeric cluster id.
fig_3d = px.scatter_3d(
    X_clustered.astype({'cluster': str}), x='bmi', y='charges', z='smoker_code',
    color='cluster', labels={'cluster': 'Cluster'},
    category_orders={'cluster': [str(c) for c in sorted(X_clustered['cluster'].unique())]},
    title='3D Plot: Relationship between Charges, BMI, and Smoking Condition')

# Display plots
fig_3d.show()