## Imports

In [63]:
import numpy as np

# Graphing
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Balancing Data
from imblearn.over_sampling import SMOTE

In [8]:
# Python Scripts
%run -i "scripts.py"

## Building the Model

In [9]:
y = credit_df["Class"]  # Dependent variable
X = credit_df.drop(columns=["Class"])  # Independent variable

In [10]:
# Splitting Data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19, test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((227845, 30), (56962, 30), (227845,), (56962,))

## Balancing The Data
- Using SMOTE to balance the data

In [64]:
# Initializating SMOTE
smote = SMOTE(random_state=19)

In [65]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

## Clustering

In [66]:
# Instantiating K-Means Clustering
kmeans = KMeans(random_state=19)

In [73]:
# Grid search for kmeans
params_kmeans = {
    "n_clusters": [20, 25, 30, 35]
}

grid_search_kmeans = GridSearchCV(estimator=kmeans, param_grid=params_kmeans, cv=10)

In [74]:
grid_search_kmeans.fit(X_train_resampled, y_train_resampled)

GridSearchCV(cv=10, error_score=nan,
             estimator=KMeans(algorithm='auto', copy_x=True, init='k-means++',
                              max_iter=300, n_clusters=8, n_init=10,
                              n_jobs=None, precompute_distances='auto',
                              random_state=19, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'n_clusters': [20, 25, 30, 35]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [75]:
grid_search_kmeans.best_estimator_

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=35, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=19, tol=0.0001, verbose=0)

In [76]:
grid_search_kmeans.score(X_train_resampled, y_train_resampled)

-719903801978.0677