# DB Scan clustering (not from scratch) using pycaret library

### Preprocess the Data
Before clustering, it's important to handle any missing values and decide whether to include all features. For example, the 'CUST_ID' column might not be useful for clustering and can be ignored. Also, we need to address missing values, like in the 'MINIMUM_PAYMENTS' column.

In [1]:
import pandas as pd

# Load the dataset
file_path = '/content/CC GENERAL.csv'  # Adjust this path as needed
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

  CUST_ID      BALANCE  BALANCE_FREQUENCY  PURCHASES  ONEOFF_PURCHASES  \
0  C10001    40.900749           0.818182      95.40              0.00   
1  C10002  3202.467416           0.909091       0.00              0.00   
2  C10003  2495.148862           1.000000     773.17            773.17   
3  C10004  1666.670542           0.636364    1499.00           1499.00   
4  C10005   817.714335           1.000000      16.00             16.00   

   INSTALLMENTS_PURCHASES  CASH_ADVANCE  PURCHASES_FREQUENCY  \
0                    95.4      0.000000             0.166667   
1                     0.0   6442.945483             0.000000   
2                     0.0      0.000000             1.000000   
3                     0.0    205.788017             0.083333   
4                     0.0      0.000000             0.083333   

   ONEOFF_PURCHASES_FREQUENCY  PURCHASES_INSTALLMENTS_FREQUENCY  \
0                    0.000000                          0.083333   
1                    0.000000       

### Install and Import PyCaret

In [2]:

!pip install pycaret

# Import necessary modules from PyCaret
from pycaret.clustering import *




### Initialize PyCaret and Setup Data

In [3]:
# Initialize PyCaret and setup the data for clustering
# Note: We might need to ignore certain columns or handle missing values based on the dataset
clustering_setup = setup(data, normalize = True, session_id = 123, ignore_features = ['ColumnToIgnore'])


Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(8950, 18)"
2,Transformed data shape,"(8950, 8967)"
3,Ignore features,1
4,Numeric features,17
5,Categorical features,1
6,Rows with missing values,3.5%
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


### Create DBSCAN Model

In [4]:
# Create DBSCAN Model
# Adjust eps and min_samples based on the nature of your data
dbscan_model = create_model('dbscan', eps = 0.5, min_samples = 5)


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

###  Assign Clusters and Analyze

In [5]:
# Assign the clusters to the dataset
clustered_data = assign_model(dbscan_model)

# Display the first few rows with clusters
print(clustered_data.head())

# Summary of clusters
print(clustered_data['Cluster'].value_counts())

# Explore the properties of each cluster (if needed)
# This may involve statistical summaries, visualizations, etc.


  CUST_ID      BALANCE  BALANCE_FREQUENCY    PURCHASES  ONEOFF_PURCHASES  \
0  C10001    40.900749           0.818182    95.400002          0.000000   
1  C10002  3202.467529           0.909091     0.000000          0.000000   
2  C10003  2495.148926           1.000000   773.169983        773.169983   
3  C10004  1666.670532           0.636364  1499.000000       1499.000000   
4  C10005   817.714355           1.000000    16.000000         16.000000   

   INSTALLMENTS_PURCHASES  CASH_ADVANCE  PURCHASES_FREQUENCY  \
0               95.400002      0.000000             0.166667   
1                0.000000   6442.945312             0.000000   
2                0.000000      0.000000             1.000000   
3                0.000000    205.788010             0.083333   
4                0.000000      0.000000             0.083333   

   ONEOFF_PURCHASES_FREQUENCY  PURCHASES_INSTALLMENTS_FREQUENCY  \
0                    0.000000                          0.083333   
1                    0.0

In [6]:
# Save the results to a new CSV file
clustered_data.to_csv('/content/clustered_data.csv', index=False)
