In [None]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances

import joblib
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the libraries imported
# above — confirm that is intended.
warnings.filterwarnings(action='ignore')

## 1. Load Train_data

In [None]:
# Read the training CSV into a DataFrame and preview its first two rows.
train_data = pd.read_csv(filepath_or_buffer="./dataset/train_data.csv")
train_data.head(n=2)

In [None]:
# type1 = 4
# train_data = train_data[train_data.type == type1]
# tdf1 = train_data[train_data.type == 1]
# tdf2 = train_data[train_data.type == 2]
# tdf3 = train_data[train_data.type == 3]
# tdf4 = train_data[train_data.type == 4]
# tdf5 = train_data[train_data.type == 5]
# tdf6 = train_data[train_data.type == 6]
# tdf7 = train_data[train_data.type == 7]
# tdf = [tdf0, tdf1, tdf2, tdf3, tdf4, tdf5, tdf6, tdf7]

In [None]:
# Feature matrix: every column except the last one (positional slice).
# NOTE(review): assumes the label column `type` is the last column — confirm.
X_train = train_data.iloc[:, :train_data.shape[1] - 1]
X_train.head(n=1)

In [None]:
# Labels: the `type` column, kept as a single-column DataFrame.
y_train = train_data.loc[:, ['type']]
y_train.head(n=1)

## 2. Scaling

In [None]:
# Learn the scaling parameters on the training features, then apply them.
# `sc` stays fitted on the training data after this cell.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_train[:2]

## PCA Decomposition

In [None]:
# Project the scaled training features onto two principal components so the
# samples can be drawn in 2-D.
# random_state pins the result in case scikit-learn selects a randomized SVD
# solver; the seed matches the one used for KMeans in this notebook.
pca = PCA(n_components=2, random_state=3)
X_train = pca.fit_transform(X_train)
X_train[:5]  # preview only instead of dumping the whole array

In [None]:
# First and second PCA components, used as plot coordinates.
x, y = X_train[:, 0], X_train[:, 1]

In [None]:
# Scatter plot of the training samples in PCA space.
fig, ax = plt.subplots(figsize=(15, 6))
ax.scatter(x, y, s=500, edgecolor='black')
plt.show()

## K-Means Clustering

In [None]:
# Fit a two-cluster K-Means model (fixed seed) on the PCA-reduced training data.
kmeans = KMeans(n_clusters=2, random_state=3).fit(X_train)
X_train

In [None]:
# Cluster centres of the fitted model, one row per cluster.
centroids = kmeans.cluster_centers_  # coordinates of the centroids
centroids

In [None]:
# Split the centroid coordinates into x / y vectors for plotting.
cen_x, cen_y = centroids[:, 0], centroids[:, 1]

In [None]:
# Assign every training sample to its nearest centroid.
# The model is already fitted above, so predict() is used instead of
# fit_predict(): no redundant retraining, and the labels stay consistent with
# the `centroids` captured from that fit.
y_kmeans = kmeans.predict(X_train)
y_kmeans

In [None]:
# Training samples with the K-Means centroids overlaid as squares.
fig, ax = plt.subplots(figsize=(15, 6))
ax.scatter(x, y, s=500, edgecolor='black')
ax.scatter(cen_x, cen_y, s=500, marker="s")
plt.show()

In [None]:
plt.figure(figsize=(15, 6))

# One scatter call per cluster so each cluster gets its own colour. The cluster
# count comes from the fitted model rather than a hard-coded 2.
for cluster in range(kmeans.n_clusters):
    members = y_kmeans == cluster
    plt.scatter(X_train[members, 0], X_train[members, 1], s=500, edgecolor='black')

# The centroid markers do not depend on the loop variable: draw them once,
# after (and therefore on top of) the samples.
plt.scatter(cen_x, cen_y, s=300, edgecolor='black', color='yellow', marker='s')

# Write each cluster index inside its centroid marker.
for cluster, (cx, cy) in enumerate(centroids):
    plt.text(cx, cy, cluster, va='center', ha='center')

plt.show()

In [None]:
# Distance from every training sample (rows) to every centroid (columns),
# using pairwise_distances' default metric.
distances = pairwise_distances(X=X_train, Y=centroids)
distances

In [None]:
# Largest distance to each centroid taken over all samples (one value per centroid).
max_distances = distances.max(axis=0)
max_distances

## Save Model

In [None]:
# Persist the fitted K-Means model to disk.
joblib.dump(value=kmeans, filename='kmeans_model3.joblib')

## Load Model

In [None]:
# Load the model persisted in the "Save Model" section. The file name must
# match the one passed to joblib.dump ('kmeans_model3.joblib'); the previous
# name 'kmeans_model.joblib' is never written by this notebook, so a fresh run
# would fail or silently pick up an unrelated, stale model.
kmeans3 = joblib.load('kmeans_model3.joblib')

## Load Test_Data

In [None]:
# Read the test CSV into a DataFrame and preview its first two rows.
test_data = pd.read_csv(filepath_or_buffer="./dataset/test_data.csv")
test_data.head(n=2)

In [None]:
# test_data = test_data[test_data.type == type1]

In [None]:
# Features: every column except the last one; labels: the `type` column.
# NOTE(review): assumes `type` is the last column of the test CSV — confirm.
X_test = test_data.iloc[:, :test_data.shape[1] - 1]
y_test = test_data.loc[:, ['type']]

## Scaling Test Data

In [None]:
# Scale the test set with `sc`, the scaler already fitted on the training data.
# Fitting a fresh scaler here would standardise the test set with its own
# mean/variance, putting it in a different feature space from the one the
# clustering model was trained in.
X_test = sc.transform(X_test)
X_test[:2]

## PCA Decomposition - Test Data

In [None]:
# Project the test set with `pca`, the projection already fitted on the
# training data. Re-fitting PCA on the test set would produce different axes
# (possibly flipped or rotated), so the trained centroids would no longer be
# comparable with the test points.
X_test = pca.transform(X_test)
X_test[:5]  # preview only instead of dumping the whole array

In [None]:
# Scatter plot of the test samples in PCA space.
x, y = X_test[:, 0], X_test[:, 1]
fig, ax = plt.subplots(figsize=(15, 6))
ax.scatter(x, y, s=500, edgecolor='black')
plt.show()

In [None]:
# Assign each test sample to a cluster of the loaded model.
# predict() is used rather than fit_predict(): fit_predict() would retrain the
# model on the test data and discard the centroids that were loaded from disk.
y_kmeans = kmeans3.predict(X_test)
y_kmeans[:10]

In [None]:
plt.figure(figsize=(15, 6))

# Use the centroids of the model that produced `y_kmeans` (kmeans3), so the
# markers always agree with the plotted cluster assignment.
model_centroids = kmeans3.cluster_centers_

# One scatter call per cluster so each cluster gets its own colour. The cluster
# count comes from the model rather than a hard-coded 2.
for cluster in range(kmeans3.n_clusters):
    members = y_kmeans == cluster
    plt.scatter(X_test[members, 0], X_test[members, 1], s=500, edgecolor='black')

# The centroid markers do not depend on the loop variable: draw them once,
# after (and therefore on top of) the samples.
plt.scatter(model_centroids[:, 0], model_centroids[:, 1], s=300, edgecolor='black', color='yellow', marker='s')

# Write each cluster index inside its centroid marker.
for cluster, (cx, cy) in enumerate(model_centroids):
    plt.text(cx, cy, cluster, va='center', ha='center')

plt.show()

### K-Means Clustering의 단점: 거리 중심으로 군집을 나누기 때문에 데이터의 패턴(모양)을 읽지 못한다

In [None]:
# Distance from every test sample (rows) to every centroid (columns) of
# kmeans3, the model that labelled the test set. Using the centroids of a
# different model object would give distances that do not match `y_kmeans`.
distances = pairwise_distances(X_test, kmeans3.cluster_centers_)
distances

### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Cluster the PCA-reduced test data with DBSCAN and list the label of every sample.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(X_test)
labels = dbscan.labels_
labels.tolist()

In [None]:
# Test samples coloured by their DBSCAN label.
fig, ax = plt.subplots(figsize=(15, 6))
ax.scatter(X_test[:, 0], X_test[:, 1], c=labels, cmap='viridis')
plt.show()

### Affinity Propagation

In [None]:
from sklearn.cluster import AffinityPropagation

# Cluster the PCA-reduced test data with Affinity Propagation and keep the labels.
aff_prop = AffinityPropagation(damping=0.5, max_iter=500, convergence_iter=15).fit(X_test)
labels = aff_prop.labels_


In [None]:
# Test samples coloured by their Affinity Propagation label.
fig, ax = plt.subplots(figsize=(15, 6))
ax.scatter(X_test[:, 0], X_test[:, 1], c=labels, cmap='viridis')
plt.show()

In [None]:
from sklearn.cluster import AffinityPropagation
# Explicit names instead of `import *`, so it is clear where each metric comes
# from and the notebook namespace is not flooded. The two adjusted_* scores are
# only referenced by the commented-out lines in the evaluation cell.
from sklearn.metrics import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    silhouette_score,
)

In [None]:
# Fit Affinity Propagation on the PCA-reduced training data.
model = AffinityPropagation(preference=-1).fit(X_train)

cluster_centers_indices = model.cluster_centers_indices_  # row index of each exemplar in X_train
labels = model.labels_
n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
# print("Adjusted Rand Index: %0.3f" % adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f" % adjusted_mutual_info_score(labels_true, labels))

# silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1.
# That happens when the algorithm does not converge (no exemplars) or when it
# returns a single cluster, so guard the call instead of crashing the cell.
if 2 <= n_clusters_ < len(X_train):
    print("Silhouette Coefficient: %0.3f" % silhouette_score(X_train, labels, metric='sqeuclidean'))
else:
    print("Silhouette Coefficient: undefined (requires between 2 and n_samples - 1 clusters)")

In [None]:
from itertools import cycle
plt.figure(figsize=(15, 6))

# Cycle through red / green / blue, one colour per cluster.
colors = cycle('rgb')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = X_train[cluster_centers_indices[k]]  # exemplar sample of cluster k
    # Members of the cluster as dots.
    plt.plot(X_train[class_members, 0], X_train[class_members, 1], col + '.')
    # A faint line from the exemplar to each member. The loop variable is named
    # `member` so it does not overwrite the notebook-level `x` array.
    for member in X_train[class_members]:
        plt.plot([cluster_center[0], member[0]], [cluster_center[1], member[1]], col, alpha=0.25)
    # Exemplar marker, filled with the cluster colour so it matches its members.
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, mec='k', mew=3, markersize=7)

plt.show()