# Clustering for the LoanIDs

## Testing to find the best clustering configuration

In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import OPTICS

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import KFold

In [2]:
# Load the loan dataset (roughly half a million rows) into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../All Data Create/Dataset/New Dataset half million.csv",
)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName,Applicant Age,summary_income,summary_income_cv,city,state,Country
0,0060K00000RW5M4QAL,0010K00001fghU0QAI,75,73,Living Expenses,4,200,4.571,Weekly,Loan Paid,53,6500.0,,Whittlesea,VIC,Australia
1,0062x00000AkpZnAAJ,0010K00001fghU0QAI,75,73,Living Expenses,10,200,10.429,Weekly,Loan Paid,53,6500.0,14429.94,Whittlesea,VIC,Australia
2,0062x00000AHV4lAAH,0010K00001fghU0QAI,75,73,Living Expenses,10,200,11.429,Weekly,Loan Paid,53,6500.0,9924.14,Whittlesea,VIC,Australia
3,0062x000009Ua6WAAS,0010K00001fghU0QAI,75,73,Living Expenses,10,200,10.286,Weekly,Loan Paid,53,6500.0,14683.88,Whittlesea,VIC,Australia
4,0062x000008zdrUAAQ,0010K00001fghU0QAI,75,73,Living Expenses,6,200,6.571,Weekly,Loan Paid,53,6500.0,8691.38,Whittlesea,VIC,Australia


In [4]:
# Report the (rows, columns) dimensions of the loaded frame.
df.shape

(514658, 16)

In [5]:
# Peek at the two clustering features for index labels 1 through 1000.
# The slice is label-based and inclusive: row 0 is skipped, row 1000 is included.
df.loc[1:1000, ['Amount', 'Total_Repayments__c']]

Unnamed: 0,Amount,Total_Repayments__c
1,200,10
2,200,10
3,200,10
4,200,6
5,200,10
...,...,...
996,2000,2
997,2000,2
998,2000,2
999,2000,3


#### Checking for null values

In [6]:
# Count the missing values in every column.
df.isna().sum()

Id                                0
AccountID                         0
Number_Of_Loans_Granted__c        0
Num_Of_Loans_Paid__c              0
Purpose_of_Loan__c               19
Total_Repayments__c               0
Amount                            0
Term_in_Weeks__c                  0
Payment_Frequency__c              0
StageName                         0
Applicant Age                     0
summary_income                    0
summary_income_cv             22618
city                           2367
state                             0
Country                           0
dtype: int64

<h2> DBSCAN </h2>
DBSCAN stands for Density-Based Spatial Clustering of Applications with Noise and is one of the clustering algorithms implemented in the scikit-learn library. It was introduced in the paper “A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise”.

The core idea of DBSCAN revolves around the concept of dense regions. The assumption is that natural clusters are composed of densely located points. This requires a definition of a “dense region”. To provide one, the DBSCAN algorithm requires two parameters:

- Eps (ε) – the neighbourhood distance
- MinPts – the minimum number of points within distance Eps

Optionally, the distance metric can be specified by the user, but usually Euclidean distance is used (as in scikit-learn).

A “dense region” is therefore formed by points that each have at least MinPts neighbours within distance Eps. Points that lie within this distance of such a point, but do not themselves have the minimum number of neighbours, are treated as “border points”. The remaining points are noise or outliers. This is shown in the picture below (for MinPts=3). Red points (D) are in a “dense region” – each one has a minimum of 3 neighbours within distance Eps. Green points (B) are border points – they have a neighbour within distance Eps, but fewer than 3. The blue point (O) is an outlier – it has no neighbours within distance Eps.

<img src="https://pbs.twimg.com/media/FyBSN5_aIAAH6Rb?format=jpg&name=4096x4096" height="400" width="500" />

- Advantages of this approach:
    - It finds the number of clusters itself, based on the eps and MinPts parameters.
    - It is able to differentiate elongated clusters, or clusters surrounded by other clusters, in contrast to e.g. K-Means, where clusters are always convex.
    - It is also able to find points not fitting into any cluster – it detects outliers.

- The biggest drawback of DBSCAN:
    - High computational expense of average O(n log(n)) coming from a need to execute a neighbourhood query for each point.
    - Poorly identifies clusters with various densities

In DBSCAN there are two major hyperparameters:
- eps
- min_samples

In [7]:
# Hyper-parameter grids and cross-validation setup for the density-based search.

# eps candidates: 8.0 up to (but excluding) 12.75, in steps of 0.25.
# NOTE(review): eps is a DBSCAN parameter; the OPTICS search loop never reads eps_values.
eps_values = np.arange(start=8, stop=12.75, step=0.25)

# min_samples candidates: the integers 3 through 9.
min_samples_values = np.arange(start=3, stop=10)

# Split the data into num_folds folds for cross-validation.
num_folds = 5
kf = KFold(n_splits=num_folds)

In [8]:
# Cluster on the first 100,000 rows only, using loan amount and repayment count as features.
X = df[['Amount', 'Total_Repayments__c']].head(100000)

In [9]:
# Per-fold search for the best OPTICS `min_samples`, scored with the silhouette
# coefficient.
#
# For every KFold split of X:
#   1. fit OPTICS on the training rows once per candidate in min_samples_values
#      and keep the candidate with the highest silhouette score;
#   2. cluster the validation rows with that setting and print the resulting
#      silhouette score.
#
# OPTICS exposes no `predict`, so a model fitted on the training rows cannot
# label the validation rows: `fit_predict(X_val)` re-clusters the validation
# fold from scratch. The former extra `fit(X_train)` on the best model was
# therefore discarded work and has been removed.
#
# NOTE(review): noise points (label -1) are handed to silhouette_score as if
# they formed one ordinary cluster - confirm this is the intended scoring.
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    best_score = -np.inf
    best_params = {}

    # Try every candidate min_samples on the training rows of this fold.
    for min_samples in min_samples_values:
        OPTICSscan = OPTICS(min_samples=min_samples)
        OPTICSscan.fit(X_train)

        # Silhouette is only defined for 2 <= n_labels <= n_samples - 1.
        labels = OPTICSscan.labels_
        if 1 < len(set(labels)) < len(labels):
            score = silhouette_score(X_train, labels)
            if score > best_score:
                best_score = score
                best_params = {'min_samples': min_samples}

    if not best_params:
        # No candidate produced a scorable clustering; report it instead of
        # failing with a KeyError on best_params['min_samples'].
        print("Best parameters:", best_params)
        print("Validation silhouette score: not available (no candidate produced a scorable clustering)")
        continue

    # Cluster the validation fold with the winning setting.
    best_OPTICS = OPTICS(min_samples=best_params['min_samples'])
    val_labels = best_OPTICS.fit_predict(X_val)

    # Guard the validation score the same way as the training scores.
    if 1 < len(set(val_labels)) < len(val_labels):
        val_score = silhouette_score(X_val, val_labels)
    else:
        val_score = float("nan")

    print("Best parameters:", best_params)
    print("Validation silhouette score:", val_score)

# Get reachability distances
# reachability_distances = clustering.reachability_[clustering.ordering_]

  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


Best parameters: {'min_samples': 3}
Validation silhouette score: 0.9852925008426683


  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


Best parameters: {'min_samples': 3}
Validation silhouette score: 0.9791913817290447


  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


Best parameters: {'min_samples': 3}
Validation silhouette score: 0.9811846332087539


  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


Best parameters: {'min_samples': 3}
Validation silhouette score: 0.9773562816220767


  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]
  ratio = reachability_plot[:-1] / reachability_plot[1:]


Best parameters: {'min_samples': 3}
Validation silhouette score: 0.973370884284894


In [28]:
# Inspect the two clustering features for the second block of 100,000 rows
# (positions 100000 up to, but excluding, 200000).
df.iloc[100000:200000][['Amount', 'Total_Repayments__c']]

Unnamed: 0,Amount,Total_Repayments__c
100000,1500,14
100001,200,3
100002,250,3
100003,350,3
100004,400,3
...,...,...
199995,500,4
199996,500,4
199997,400,6
199998,300,6


In [32]:
# Work on an independent deep copy so the loaded frame stays untouched.
df_copy = df.copy(deep=True)

In [33]:
# Re-cluster the full dataset with the OPTICS estimator left over from the
# cross-validation loop and store one cluster label per row.
# NOTE(review): the recorded run of this cell ended in a MemoryError on the
# full frame - consider clustering a smaller slice or freeing memory first.
df_copy["Cluster_result"] = best_OPTICS.fit_predict(
    df.loc[:, ['Amount', 'Total_Repayments__c']]
)

MemoryError: Unable to allocate 3.75 MiB for an array with shape (1, 491895) and data type float64

In [None]:
# Plot a histogram of the cluster labels stored in Cluster_result.
df_copy["Cluster_result"].hist()

In [None]:
# Count how many rows received each cluster label.
df_copy["Cluster_result"].value_counts()

In [None]:
# Summary statistics of the cluster-label column.
df_copy["Cluster_result"].describe()

In [None]:
# Show every row that was assigned cluster label 0.
df_copy.loc[df_copy["Cluster_result"].eq(0)]

In [None]:
# Per-cluster summary statistics of the two clustering features.
df_copy.groupby("Cluster_result")[["Total_Repayments__c", "Amount"]].describe()