In [167]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import pairwise_distances
import plotly.express as px
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score as ss


import joblib
import warnings
warnings.filterwarnings(action='ignore') 

# Train
## Data Load -> Scailing -> PCA -> DBSCAN -> get Best EPS, Min_samples

In [168]:
train_df = pd.read_csv("./dataset/train_data.csv")


In [169]:
features = train_df.columns[:-1]
features

Index(['air_inflow', 'air_end_temp', 'out_pressure', 'motor_current',
       'motor_rpm', 'motor_temp', 'motor_vibe'],
      dtype='object')

In [170]:
X_train = train_df[features]
y_train = train_df[['type']]


In [171]:
sc = StandardScaler()
X_train[features]= sc.fit_transform(X_train)
X_train.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe
0,-0.448257,-0.917212,-1.110223e-16,-0.511748,-1.007046,-0.910351,-0.331821
1,0.699749,1.17832,-1.110223e-16,0.802045,1.118785,1.172883,0.188462


In [172]:
train_df = X_train
train_df['type'] = y_train
train_df.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,-0.448257,-0.917212,-1.110223e-16,-0.511748,-1.007046,-0.910351,-0.331821,0
1,0.699749,1.17832,-1.110223e-16,0.802045,1.118785,1.172883,0.188462,0


In [173]:
# PCA 대상 그룹핑으로 나누기
g1_train = train_df[['air_inflow','motor_current']]
g2_train = train_df[['air_end_temp', 'motor_rpm', 'motor_temp']]
g3_train = train_df[['motor_vibe', 'out_pressure']]
y_train = train_df[['type']]
combined_train = [g1_train, g2_train, g3_train]

In [174]:
pca = PCA(n_components=1)
pca_list = []
for i in combined_train:
    result = pca.fit_transform(i)
    pca_list.append(result)
    total_var = pca.explained_variance_ratio_.sum() * 100
    print(f"{i}total_var : {total_var}")
    
flat_pca_list = []
for i in pca_list:
    result = [j for sub in i for j in sub]
    flat_pca_list.append(result)
pca_train_df = pd.DataFrame({"PC1": flat_pca_list[0], "PC2": flat_pca_list[1], "PC3": flat_pca_list[2]})
pca_train_df = pd.concat([pca_train_df, y_train], axis=1)
pca_train_df.info()

      air_inflow  motor_current
0      -0.448257      -0.511748
1       0.699749       0.802045
2      -0.182053      -0.202966
3       0.200616       0.230799
4      -0.190372      -0.208848
...          ...            ...
2458    0.125746       0.149927
2459   -0.073907      -0.084600
2460   -0.781013      -0.889638
2461   -0.764375      -0.871259
2462    0.491777       0.566048

[2463 rows x 2 columns]total_var : 99.72924966859087
      air_end_temp  motor_rpm  motor_temp
0        -0.917212  -1.007046   -0.910351
1         1.178320   1.118785    1.172883
2        -0.425427  -0.508305   -0.421612
3         0.266970   0.194005    0.265958
4        -0.434598  -0.517029   -0.430592
...            ...        ...         ...
2458      0.137432   0.063140    0.137680
2459     -0.236279  -0.316369   -0.234326
2460     -1.520194  -1.619204   -1.510693
2461     -1.490389  -1.588669   -1.481189
2462      0.801170   0.736368    0.798312

[2463 rows x 3 columns]total_var : 99.47205314856252
    

In [175]:
tdf0_train = pca_train_df[pca_train_df.type == 0]
tdf1_train = pca_train_df[pca_train_df.type == 1]
tdf2_train = pca_train_df[pca_train_df.type == 2]
tdf3_train = pca_train_df[pca_train_df.type == 3]
tdf4_train = pca_train_df[pca_train_df.type == 4]
tdf5_train = pca_train_df[pca_train_df.type == 5]
tdf6_train = pca_train_df[pca_train_df.type == 6]
tdf7_train = pca_train_df[pca_train_df.type == 7]
tdf_train = [tdf0_train, tdf1_train, tdf2_train, tdf3_train, tdf4_train, tdf5_train, tdf6_train, tdf7_train]

In [176]:
def draw_chart(df):
    fig = px.scatter_3d(
        df, x='PC1', y='PC2', z='PC3', color='type',symbol='type', opacity=0.5, size_max=10,height=600
    )
    fig.show()

In [177]:
for tdf in tdf_train:
    print(tdf.type.unique())
    draw_chart(tdf)

[0]


[1]


[2]


[3]


[4]


[5]


[6]


[7]


In [203]:
epsilons = np.linspace(0.1, 1, num=15)
epsilons

array([0.1       , 0.16428571, 0.22857143, 0.29285714, 0.35714286,
       0.42142857, 0.48571429, 0.55      , 0.61428571, 0.67857143,
       0.74285714, 0.80714286, 0.87142857, 0.93571429, 1.        ])

In [204]:
min_samples = np.arange(2, 20, step=3)
min_samples

array([ 2,  5,  8, 11, 14, 17])

In [178]:
epsilons = np.linspace(0.1, 1, num=15)
min_samples = np.arange(2, 20, step=3)
import itertools
combinations = list(itertools.product(epsilons, min_samples))
N = len(combinations)

In [179]:
def get_scores_and_labels(combinations, X):
  scores = []
  all_labels_list = []

  for i, (eps, num_samples) in enumerate(combinations):
    dbscan_cluster_model = DBSCAN(eps=eps, min_samples=num_samples).fit(X)
    labels = dbscan_cluster_model.labels_
    labels_set = set(labels)
    num_clusters = len(labels_set)
    if -1 in labels_set:
      num_clusters -= 1
    
    if (num_clusters < 2) or (num_clusters > 50):
      scores.append(-10)
      all_labels_list.append('bad')
      c = (eps, num_samples)
      print(f"Combination {c} on iteration {i+1} of {N} has {num_clusters} clusters. Moving on")
      continue
    try:
        scores.append(ss(X, labels))
        all_labels_list.append(labels)
        print(f"Index: {i}, Score: {scores[-1]}, Labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")
    except:
        scores.append('ss Failure')

  best_index = np.argmax(scores)
  best_parameters = combinations[best_index]
  best_labels = all_labels_list[best_index]
  best_score = scores[best_index]

  return {'best_epsilon': best_parameters[0],
          'best_min_samples': best_parameters[1], 
          'best_labels': best_labels,
          'best_score': best_score}

In [180]:
results = []

for tdf in tdf_train:
    X_train = tdf.iloc[:,:-1]
    y_train = tdf[['type']]
    
    result = get_scores_and_labels(combinations, tdf)
    results.append(result)
    
    tdf['cluster'] = result['best_labels']
    
    print(tdf.type.unique())
    fig = px.scatter_3d(
    tdf, x='PC1', y='PC2', z='PC3', color='cluster',symbol='cluster', opacity=0.5, size_max=10,height=600,
    )
    fig.show()
    print("="*70)
    

Combination (0.1, 2) on iteration 1 of 90 has 1 clusters. Moving on
Combination (0.1, 5) on iteration 2 of 90 has 1 clusters. Moving on
Index: 2, Score: 0.19505069403604341, Labels: [ 0  1  0  0  0  0  1  1  0  1 -1  0  0  1  1  1  0  0  1  0  0  0  1  1
  0  1  0  1  0  0  0  1  1  0  0  1  0  0  0  0  1  0  0  0  0  1  0  1
  0  0  0  1  0  1  0  1  1  0  1  0  0  1  0  0  1  0  0  0  0  0  0  0
  1  0  1  0  1  0  1  0  1  0  0  1  0  1  1  0  1  0  0  1  0  1  0  0
  0  1  1  1  0  1  1  0  1  0  0  1  1  0  0  1  1  1  1  0  0  1  1  0
  0  0  0  1  0  1  0  0 -1  0  1  1  0  1  0  0  1  0  1  0  0  1  0  1
  0 -1  0  0  0  1  0  1  1  0  0  0  1  0  1  0  1  1  1  1  0  0  0  1
  0  0  1  0  0  0  0  0  0  1  1  1  0  1  0  0  0  1  1  0  0  0  0  1
  0  1  0  1  0  0  1  1  0  1  0  0  1  0  1  0  1  1  0  0  0  0  1  1
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  1  1  1  0  1  0
  1  1  0  0  0  0  1  1  1  0  0  0  1  1  0  1  0  1  0  1  0  0  0  0
  0  1  0  1  0

Combination (0.55, 11) on iteration 46 of 90 has 1 clusters. Moving on
Combination (0.55, 14) on iteration 47 of 90 has 1 clusters. Moving on
Combination (0.55, 17) on iteration 48 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 2) on iteration 49 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 5) on iteration 50 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 8) on iteration 51 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 11) on iteration 52 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 14) on iteration 53 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 17) on iteration 54 of 90 has 1 clusters. Moving on
Combination (0.6785714285714286, 2) on iteration 55 of 90 has 1 clusters. Moving on
Combination (0.6785714285714286, 5) on iteration 56 of 90 has 1 clusters. Moving on
Combination (0.6785714285714286, 8) on iteration 57 of 90 has 1 clusters. Moving on
Combination (0.6785714285714

Index: 0, Score: 0.7651788937569435, Labels: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0
  0  0  0  0  0 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1  2  2 -1 -1  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0

Index: 12, Score: 0.810512270942052, Labels: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  1  2  3  4 -1  2  3  1 -1 -1  2  4  3  3 -1 -1  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0

Combination (0.55, 8) on iteration 45 of 90 has 1 clusters. Moving on
Combination (0.55, 11) on iteration 46 of 90 has 1 clusters. Moving on
Combination (0.55, 14) on iteration 47 of 90 has 1 clusters. Moving on
Combination (0.55, 17) on iteration 48 of 90 has 1 clusters. Moving on
Index: 48, Score: 0.8087778135734018, Labels: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  1  1  1  1  2  1  1  1  2  2  1  1  1  1  2 -1  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  

Index: 66, Score: 0.8059381583456621, Labels: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  

Index: 87, Score: 0.8139980056641606, Labels: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  1  1  1  1 -1  1  1  1 -1 -1  1  1  1  1 -1 -1  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  

Index: 0, Score: 0.5716122215843026, Labels: [0 0 1 1 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0
 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1
 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0
 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 1
 1 0 0 1 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 0
 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1
 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1
 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1], NumClusters: 2
Index: 1, Score: 0.5716122215843026, Labels: [0 0 1 1 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0
 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1
 

Combination (0.2285714285714286, 5) on iteration 14 of 90 has 1 clusters. Moving on
Combination (0.2285714285714286, 8) on iteration 15 of 90 has 1 clusters. Moving on
Combination (0.2285714285714286, 11) on iteration 16 of 90 has 1 clusters. Moving on
Combination (0.2285714285714286, 14) on iteration 17 of 90 has 1 clusters. Moving on
Combination (0.2285714285714286, 17) on iteration 18 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 2) on iteration 19 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 5) on iteration 20 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 8) on iteration 21 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 11) on iteration 22 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 14) on iteration 23 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 17) on iteration 24 of 90 has 1 clusters. Moving on
Combination (0.3571428571428572, 2) on iteration 25 of 90 has 1 cluste

Index: 0, Score: 0.5009318566712694, Labels: [0 0 1 0 2 2 2 3 0 0 0 0 2 4 3 0 2 0 0 0 4 0 2 2 0 0 0 2 3 2 3 2 2 2 5 2 2
 0 0 2 2 2 0 0 2 0 0 0 0 0 1 0 0 0 5 0 2 2 0 0 1 3 1 0 0 0 2 0 5 0 0 0 0 0
 2 1 2 0 5 2 1 2 3 0 0 0 4 0 1 5 5 5 1 4 2 1 0 2 3 2 0 2 0 0 0 0 2 0 2 5 2
 1 0 0 1 0 0 2 1 2 1 0 1 2 2 1 0 2 2 3 2 2 2 2 5 0 4 2 2 1 0 0 2 2 1 5 1 0
 0 2 0 0 0 0 0 4 0 0 2 2 2 0 0 0 0 4 2 1 1 2 0 4 1 0 0 0 4 3 0 2 0 1 0 2 2
 1 2 2 0 2 4 3 2 2 0 2 2 5 2 5 0 0 5 0 1 0 1 2 1 2 0 2 0 1 0 0 2 0 2 0 3 1
 0 0 1 0 0 2 0 1 0 2 2 2 0 0 0 0 2 0 3 0 1 0 4 2 2 2 1 0 2 0 0 1 1 1 0 1 3
 0 2 2 0 0 1 0 0 2 0 0 4 2 1 0 0 2 5 3 0 0 2 2 1 1 0 2 5 0 0 3 2 2 1 5 2 0
 2 2 0 0 1 0 0 5 0 5], NumClusters: 6
Index: 1, Score: 0.5196512955073077, Labels: [ 0  0  1  0  2  2  3 -1  0  0  0  0  3  4  5  0  2  0  0  0  4  0  3  2
  0  0  0  3  5  2 -1  3  3  2  6  2  3  0  0  2  2  3  0  0  2  0  0  0
  0  0  1  0  0  0  6  0  3  2  0  0  1  5  1  0  0  0  3  0  6  0  0  0
  0  0  2  1  3  0  6  2  1  2  5  0  0  0  4  0  1  

Combination (0.2285714285714286, 5) on iteration 14 of 90 has 1 clusters. Moving on
Index: 14, Score: 0.41637742522279625, Labels: [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 1 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0], NumClusters: 2
Index: 15, Score: 0.41637742522279625, Labels: [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0

Combination (0.6785714285714286, 14) on iteration 59 of 90 has 1 clusters. Moving on
Combination (0.6785714285714286, 17) on iteration 60 of 90 has 1 clusters. Moving on
Combination (0.7428571428571429, 2) on iteration 61 of 90 has 1 clusters. Moving on
Combination (0.7428571428571429, 5) on iteration 62 of 90 has 1 clusters. Moving on
Combination (0.7428571428571429, 8) on iteration 63 of 90 has 1 clusters. Moving on
Combination (0.7428571428571429, 11) on iteration 64 of 90 has 1 clusters. Moving on
Combination (0.7428571428571429, 14) on iteration 65 of 90 has 1 clusters. Moving on
Combination (0.7428571428571429, 17) on iteration 66 of 90 has 1 clusters. Moving on
Combination (0.8071428571428572, 2) on iteration 67 of 90 has 1 clusters. Moving on
Combination (0.8071428571428572, 5) on iteration 68 of 90 has 1 clusters. Moving on
Combination (0.8071428571428572, 8) on iteration 69 of 90 has 1 clusters. Moving on
Combination (0.8071428571428572, 11) on iteration 70 of 90 has 1 cluste

Index: 0, Score: 0.37030610029768707, Labels: [ 0  0  1  0  1  1  0  0  1  1  0  1  0  0  0  0  0  2  1  1  1  0  1  2
  0  2  1  0  1  0  2  0  2  0  0  2  0  0  2  1  1  2  2  0  1  1 -1  1
  1  1  2  2  2  0  2  2  0  0  2  2  2  1  2  1  1  2  0  0  1  2  2  2
  1  0  1  2  2  0  2  2  2  0  2  1  1  2  2  1  2  1  0  0  2  0  0  2
  0  0  0  1  0  1  2  1  2  1  0  0  1  2  2  0  2  1  0  0  0  0  2  2
  2  0  1  2  1  1  1  1  1  2  0  2  2  0  0  1  2  2  1  1  0  2  0  0
  1  2  1  1  1  1  1  2  1  2  1  2  0  2  2  2  1  0  2  1  0  1  2  2
  0  0  1  0  1  0  1  0  0  2  0  2  2  2  1  0  0  2  2  0  2  0  0  2
  1  2  1  0  1  2  2  2  1  0  1  1  2  1  2  0  2  1  1  2  2  0  0  1
  0  1  0  1  1  1  0  1  2  2  0  2  0  2  0  1  0  1  2  1  2  2  1  0
  1  0  1  2  1  2  0  0  2  2  0  0  1  0  2  1  1  0  2  2  2  1  1  0
  0  1  1  1  2  0  0  0  2  0  1  0  0  0  2  1  0  1  1  2  0  0  2  1
  0  0  2  1  2  1  1  2  0  2  0  1  0  2  0  0  0  2], NumClusters: 3
Index:

Combination (0.2928571428571429, 2) on iteration 19 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 5) on iteration 20 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 8) on iteration 21 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 11) on iteration 22 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 14) on iteration 23 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 17) on iteration 24 of 90 has 1 clusters. Moving on
Combination (0.3571428571428572, 2) on iteration 25 of 90 has 1 clusters. Moving on
Combination (0.3571428571428572, 5) on iteration 26 of 90 has 1 clusters. Moving on
Combination (0.3571428571428572, 8) on iteration 27 of 90 has 1 clusters. Moving on
Combination (0.3571428571428572, 11) on iteration 28 of 90 has 1 clusters. Moving on
Combination (0.3571428571428572, 14) on iteration 29 of 90 has 1 clusters. Moving on
Combination (0.3571428571428572, 17) on iteration 30 of 90 has 1 cluste

Index: 0, Score: 0.4378009951940531, Labels: [0 1 1 2 3 4 4 2 1 1 0 0 5 1 4 1 0 3 1 6 2 3 6 0 1 3 3 5 0 5 3 3 0 3 0 3 3
 0 3 0 1 3 2 3 2 3 1 5 1 2 1 4 1 1 3 6 1 0 1 4 4 3 1 0 3 5 1 1 0 1 2 3 3 0
 0 0 1 0 6 3 6 3 3 3 1 0 5 1 4 3 0 1 1 3 3 3 3 2 4 0 0 1 1 0 3 5 1 1 3 1 4
 4 1 1 1 0 1 3 1 4 3 3 3 5 1 1 0 4 0 4 3 5 3 6 1 3 3 1 1 3 0 4 2 0 0 1 3 3
 0 0 3 3 0 4 3 1 1 0 6 3 0 0 1 1 1 6 0 3 3 5 3 0 0 0 3 0 4 1 5 1 4 5 1 1 3
 3 1 1 3 1 3 1 2 1 0 3 4 1 5 0 0 0 1 0 1 1 4 4 1 0 1 1 1 0 3 1 5 1 3 0 5 3
 5 1 1 3 1 3 0 1 1 3 0 5 4 3 6 0 3 5 4 3 0 4 1 1 0 1 0], NumClusters: 7
Index: 1, Score: 0.3773400592024816, Labels: [0 1 2 3 4 5 5 3 2 2 0 0 6 2 5 1 0 4 2 7 3 4 7 0 1 4 4 6 0 6 4 4 0 4 0 4 4
 0 4 0 2 4 3 4 3 4 1 6 2 3 1 5 2 2 4 7 2 0 2 5 5 4 2 0 4 6 1 2 0 2 3 4 4 0
 0 0 1 0 7 4 7 4 4 4 2 0 6 2 5 4 0 2 2 4 4 4 4 3 5 0 0 2 2 0 4 6 2 1 4 2 5
 5 2 2 1 0 2 4 2 5 4 4 4 6 2 1 0 5 0 5 4 6 4 7 2 4 4 1 2 4 0 5 3 0 0 2 4 4
 0 0 4 4 0 5 4 1 1 0 7 4 0 0 2 2 2 7 0 4 4 6 4 0 0 0 4 0 5 2 6 2 5 6 1 2 4
 4 2 1 4 2 4 

Index: 17, Score: 0.4426957083965708, Labels: [ 0  1  1 -1  5  2  2 -1  1  6  0  0  3  6  2  1  0  4  1 -1  3  4 -1  0
  0  5  4  3  0  3  5  5  0  4  0  5  4  0  4  0  1  5 -1  4  3  5  0  3
  1 -1 -1  2  1  1  4 -1  1  0  1  2  2  5  1  0  4  3  1  1  0  1 -1  5
  5  0  0  0  1  0 -1  4 -1  4  4  4  6  0  3  1  2  4  0  1  1  4  4  5
  5  3  2  0  0  1  1  0  4  3  1  1  4  1  2  2  1  1 -1  0  1 -1  1  2
 -1  5  5  3  1  1  0  2  0  2  4  3  5 -1  1  4  4 -1  6  4  0  2  3  0
  0  1  5  4  0  0  4  4  0  2  4  0  1  0 -1  4  0  0  1  1  1 -1  0  4
  4  3  4  0  0  0  5  0  2  1  3  1  2  3  1  1  4  5  1  1  5  1  4  1
 -1  1  0  5  2  0  3  0  0  0  1  0  1  1  2  2  6  0  1  6  6  0  4  1
  3  6  4  0  3  5  3  6  1  5 -1  5  0  1  1  5  0  3  2  4 -1  0  4  3
  2  5  0  2  1  1  0  6  0], NumClusters: 7
Combination (0.2928571428571429, 2) on iteration 19 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 5) on iteration 20 of 90 has 1 clusters. Moving on
Combination

Index: 0, Score: 0.5123022264079898, Labels: [0 1 2 2 3 3 0 0 0 2 2 0 0 1 0 3 3 3 3 3 3 3 3 1 3 3 1 3 2 4 3 0 0 0 0 1 3
 2 2 0 3 0 2 2 4 1 1 0 1 4 3 3 3 3 0 0 4 2 1 1 3 0 4 2 3 1 0 3 0 3 2 3 3 3
 1 2 2 3 3 3 3 2 0 0 4 2 0 0 2 0 4 1 0 3 1 3 1 3 1 0 3 0 3 1 0 0 0 1 3 3 2
 0 4 1 0 3 0 1 0 3 1 2 1 1 0 3 0 2 0 3 2 0 3 4 1 4 3 0 0 2 0 3 0 3 3 1 1 3
 3 3 4 2 3 3 3 3 4 2 2 3 3 3 2 0 0 3 4 3 1 2 0 2 3 1 2 2 4 1 3 3 4 2 3 4 3
 0 3 0 1 3 0 4 0 0 0 3 0 0 3 4 3 2 0 0 3 2 1 3 2 0 3 3 3 3 3 0 0 1 3 0 3 2
 0 4 4 0 1 0 1 3 3 4 3 4 3 0 3 4 2 4 4 3 0 1 0 0 1 2 3], NumClusters: 5
Index: 1, Score: 0.4519509867352718, Labels: [0 1 2 2 3 3 0 0 0 2 2 0 0 1 0 3 3 3 3 3 3 3 3 1 3 3 1 3 2 4 3 0 0 0 0 1 3
 2 2 0 3 0 2 2 4 1 1 0 1 4 3 3 3 3 0 0 4 2 1 1 3 0 4 2 3 1 0 3 0 3 2 3 3 3
 1 2 2 3 3 3 3 2 0 0 4 2 5 0 2 0 4 1 0 3 1 3 1 3 1 0 3 0 3 1 0 0 0 1 3 3 2
 0 4 1 0 3 0 1 5 3 1 2 1 1 0 3 0 2 0 3 2 0 3 4 1 4 3 0 0 2 0 3 0 3 3 1 1 3
 3 3 4 2 3 3 3 3 4 2 2 3 3 3 2 0 0 3 4 3 1 2 0 2 3 1 2 2 4 1 3 3 4 2 3 4 3
 0 3 0 1 3 0 

Index: 17, Score: 0.5093064900055858, Labels: [ 0  1  2  2  3  3  0  0  0  2  2  0  0  1  0  3  3  3  3  3  3  3  3  1
  3  3  1  3  2  4  3  0  0  0  0  1  3  2  2  0  3  2  2  2  4  1  1  0
  1  4  3  3  3  3  2  0  4  2  1  1  3  0  4  2  3  1 -1  3  0  3  2  3
  3  3  1  2  2  3  3  3  3  2  0  2  4  2 -1  0  2  0  4  1  0  3  1  3
  1  3  1  0  3 -1  3  1  2  0  0  1  3  3  2  0  4  1  2  3  0  1 -1  3
  1  2  1  1  0  3  0  2  0  3  2  0  3  4  1  4  3  0  0  2 -1  3  0  3
  3  1  1  3  3  3  4  2  3  3  3  3  4  2  2  3 -1  3  2  0  2  3  4  3
  1  2  0  2  3  1  2  2  4  1  3  3  4  2  3  4  3  0  3  0  1  3  0  4
  0  0 -1  3  0 -1  3  4  3  2  0  0  3  2  1  3  2 -1  3  3  3  3  3 -1
  2  1  3  0  3  2 -1  4  4  0  1 -1  1  3  3  4  3  4  3  0  3  4  2  4
  4  3  0  1  0 -1  1  2  3], NumClusters: 5
Combination (0.2928571428571429, 2) on iteration 19 of 90 has 1 clusters. Moving on
Combination (0.2928571428571429, 5) on iteration 20 of 90 has 1 clusters. Moving on
Combination

Index: 0, Score: 0.34212131274028584, Labels: [ 0  1  2  3  4  5  5  1  5  6  4  2  5  2  5  3  7  3  3  7  1  4  7  4
  6  5  1  7  2  5  1  3  5  5  4  4  7  5  1  5  5  1  6  1  3  5  3  3
  6  7  2  7  5  7  4  7  5  4  5  5  0  6  7  2  6  1  5  1  5  2  5  3
  2  6  3  7  6  5  1  6  7  5  5  6  4  5  6  8 -1  1  8  4  5  4  5  5
  5  5  8  7  4  7  1  5  4  4  1  4  3  2  7  2  6  5  5  5  2  5  6  5
  4  3  3  4  9  5  6  4  3  8  4  5  7  7  3  7  1  5  3  7  1  7  5  7
  6  1  0  5  5  1  4  4  6  6  6  7  2  3  3  1  7  7  5  9  4  3  5  5
  6  5  5  7  5  3  7  4  5  1  3  1  5  6  2  3  3  1], NumClusters: 10
Index: 1, Score: 0.4791216028763301, Labels: [-1  7  4  5  0  1  1  7  2  3  0  4  1  4  1  5  6  5  5  6  7  0  6  0
  3  1  7  6  4  1  7  5  8  1  0  0  6  2  7  2  1 -1  3 -1  5  2  5  5
  3  6  4  6  1  6  0  6  2  0  1  1 -1  3  6  4  3  7  2 -1  2  4  1  5
  4  3  5  6  3  2 -1  3  6  1  1  3  0  1  3 -1 -1  7 -1  0  1  0  2  1
  2  1 -1  6  0  6  7  2  0  0  7

Combination (0.48571428571428577, 8) on iteration 39 of 90 has 1 clusters. Moving on
Combination (0.48571428571428577, 11) on iteration 40 of 90 has 1 clusters. Moving on
Combination (0.48571428571428577, 14) on iteration 41 of 90 has 1 clusters. Moving on
Combination (0.48571428571428577, 17) on iteration 42 of 90 has 1 clusters. Moving on
Combination (0.55, 2) on iteration 43 of 90 has 1 clusters. Moving on
Combination (0.55, 5) on iteration 44 of 90 has 1 clusters. Moving on
Combination (0.55, 8) on iteration 45 of 90 has 1 clusters. Moving on
Combination (0.55, 11) on iteration 46 of 90 has 1 clusters. Moving on
Combination (0.55, 14) on iteration 47 of 90 has 1 clusters. Moving on
Combination (0.55, 17) on iteration 48 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 2) on iteration 49 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 5) on iteration 50 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 8) on iteration 51 of 90 has 1 cl



In [181]:
results[0]

{'best_epsilon': 0.1,
 'best_min_samples': 11,
 'best_labels': array([ 0,  1,  7,  2,  7,  2,  1,  3,  2,  1, -1,  4,  2,  5,  6,  1,  2,
         4,  5,  2,  2,  0,  1,  5,  0,  6,  0,  5,  0,  2,  0,  6,  5,  0,
         0,  3,  0,  2,  2,  2,  6,  0,  2,  4,  0,  3,  0,  6,  7,  7,  7,
         6,  4,  3,  2,  1,  1,  0,  1,  2,  2,  6,  0,  0,  5,  4,  2,  4,
         0,  0,  4,  2,  5,  2,  3,  0,  3,  2,  1,  0,  5,  2,  2,  3,  0,
         1,  3,  0,  1,  4,  4,  5,  2,  6,  4,  0,  2,  6,  1,  3,  4,  5,
         5,  0,  1,  0,  4,  1,  5,  2,  2,  3,  5,  5,  3,  2,  4,  1,  1,
         2,  0,  0,  0,  3,  0,  3,  2,  2, -1,  0,  5,  3,  0,  5,  0,  2,
         5,  4,  1,  0,  2,  6,  2,  1,  0, -1,  0,  0,  0,  6,  2,  3,  6,
         0,  4,  0,  6,  2,  6,  2,  1,  6,  5,  3,  0,  2,  2,  6,  2,  2,
         5,  2,  0,  4,  4,  4,  2,  3,  1,  6,  2,  6,  2,  0,  0,  5,  3,
         2,  2,  4,  0,  5,  2,  1,  2,  6,  2,  2,  3,  5,  2,  5,  0,  0,
         1,  4,  1,  2,  3

In [182]:
best_eps = [results[i]['best_epsilon'] for i in range(8)]
best_eps

[0.1,
 0.8714285714285716,
 0.1,
 0.2285714285714286,
 0.2285714285714286,
 0.1642857142857143,
 0.1,
 0.1642857142857143]

In [183]:
best_min_samples = [results[i]['best_min_samples'] for i in range(8)]
best_min_samples

[11, 2, 2, 17, 14, 2, 2, 2]

In [184]:
best_sil_score = [results[i]['best_score'] for i in range(7)]
best_sil_score

[0.31506907908400383,
 0.8260353154884031,
 0.5716122215843026,
 0.5203956791775051,
 0.6372519605569298,
 0.6244164542412234,
 0.5123022264079898]

# Test
## Data Load -> Scailing -> PCA -> DBSCAN(with Best EPS, Min_samples) -> Get Label

In [185]:
test_df = pd.read_csv("./dataset/test_data.csv")
X_test = test_df[features]
y_test = test_df[['type']]

In [186]:
sc = StandardScaler()
X_test[features]= sc.fit_transform(X_test)
X_test.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe
0,0.34957,0.487848,0.029701,0.387062,0.419055,0.491978,0.208765
1,0.478445,0.713509,0.029701,0.531396,0.647355,0.716757,0.348482


In [187]:
test_df = X_test
test_df['type'] = y_test
test_df.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,0.34957,0.487848,0.029701,0.387062,0.419055,0.491978,0.208765,0
1,0.478445,0.713509,0.029701,0.531396,0.647355,0.716757,0.348482,0


In [188]:
# PCA 대상 그룹핑으로 나누기
g1_test = test_df[['air_inflow','motor_current']]
g2_test = test_df[['air_end_temp', 'motor_rpm', 'motor_temp']]
g3_test = test_df[['motor_vibe', 'out_pressure']]
y_test = test_df[['type']]
combined_test = [g1_test, g2_test, g3_test]

In [189]:
pca = PCA(n_components=1)
pca_list = []
for i in combined_test:
    result = pca.fit_transform(i)
    pca_list.append(result)
    total_var = pca.explained_variance_ratio_.sum() * 100
    print(f"{i}total_var : {total_var}")
    
flat_pca_list = []
for i in pca_list:
    result = [j for sub in i for j in sub]
    flat_pca_list.append(result)
pca_test_df = pd.DataFrame({"PC1": flat_pca_list[0], "PC2": flat_pca_list[1], "PC3": flat_pca_list[2]})
pca_test_df = pd.concat([pca_test_df, y_test], axis=1)
pca_test_df.info()

      air_inflow  motor_current
0       0.349570       0.387062
1       0.478445       0.531396
2      -0.329171      -0.392040
3       0.083228       0.081769
4      -0.037055      -0.056520
...          ...            ...
7384    0.014495       0.002423
7385   -0.535371      -0.621765
7386   -0.466638      -0.549220
7387   -0.440863      -0.521260
7388   -0.346355      -0.408664

[7389 rows x 2 columns]total_var : 99.55080635654565
      air_end_temp  motor_rpm  motor_temp
0         0.487848   0.419055    0.491978
1         0.713509   0.647355    0.716757
2        -0.725652  -0.806787   -0.718468
3         0.012350  -0.060812    0.017876
4        -0.202949  -0.278934   -0.197860
...            ...        ...         ...
7384     -0.110842  -0.185868   -0.104848
7385     -1.083715  -1.168869   -1.076305
7386     -0.970885  -1.053991   -0.962624
7387     -0.927134  -1.010367   -0.918702
7388     -0.752132  -0.832962   -0.744304

[7389 rows x 3 columns]total_var : 99.22881044537638
    

In [190]:
tdf0_test = pca_test_df[pca_test_df.type == 0]
tdf1_test = pca_test_df[pca_test_df.type == 1]
tdf2_test = pca_test_df[pca_test_df.type == 2]
tdf3_test = pca_test_df[pca_test_df.type == 3]
tdf4_test = pca_test_df[pca_test_df.type == 4]
tdf5_test = pca_test_df[pca_test_df.type == 5]
tdf6_test = pca_test_df[pca_test_df.type == 6]
tdf7_test = pca_test_df[pca_test_df.type == 7]
tdf_test = [tdf0_test, tdf1_test, tdf2_test, tdf3_test, tdf4_test, tdf5_test, tdf6_test, tdf7_test]

In [191]:
clusters = []
sil_scores = []

for i, tdf in enumerate(tdf_test):
    X_test = tdf.iloc[:,:-1]
    y_test = tdf[['type']]
    
    dbscan = DBSCAN(eps=best_eps[i], min_samples=best_min_samples[i]).fit(X_test)

    try: 
        sil_score = ss(X_test, dbscan.labels_)
        sil_scores.append(sil_score) 
    except:
        sil_scores.append("값없음") 
        
    clusters.extend(dbscan.labels_)

In [192]:
sil_scores

[0.01069263532513783,
 '값없음',
 -0.5703847180671644,
 0.10809946954563329,
 0.851368792414065,
 -0.7746449898203634,
 -0.6956502742236291,
 -0.04211514473472688]

In [193]:
pca_test_df['label'] = clusters
pca_test_df['label'].value_counts()

 0     7056
-1      223
 3       15
 7       11
 4       10
 5       10
 1        9
 2        9
 6        7
 9        6
 8        6
 14       5
 15       5
 11       3
 16       3
 17       3
 10       2
 12       2
 13       2
 18       2
Name: label, dtype: int64

In [202]:
# ddf = pca_test_df['label'].value_counts().to_frame()
# ddf

In [195]:
# pca_test_df.info()

In [196]:
def 결과정리(label):
    if label in [-1, 18, 13, 12, 10, 17, 16, 11, 15, 14, 8, 9, 6, 2, 1, 5, 4, 7, 3]: 
        return 1
    else:
        return 0

In [197]:
pca_test_df['label'] = pca_test_df['label'].apply(결과정리)
pca_test_df['label'].value_counts()

0    7056
1     333
Name: label, dtype: int64

In [198]:
overall_sil_score = ss(pca_test_df.iloc[:,:-2], pca_test_df['label'])
overall_sil_score

0.10026509379294972

In [199]:
fig = px.scatter_3d(
    pca_test_df, x='PC1', y='PC2', z='PC3', color='label',symbol='label', opacity=0.5, size_max=10,height=800,
)
fig.show()

In [200]:
submit_df = pca_test_df[['type', 'label']]
submit_df['label'].value_counts()

0    7056
1     333
Name: label, dtype: int64

In [201]:
submit_df.to_csv('./submits/submit_20230410_01.csv', index=False)