In [859]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import pairwise_distances
import plotly.express as px
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score as ss


import joblib
import warnings
warnings.filterwarnings(action='ignore') 

### Data Load -> Scaling -> Grouping -> PCA Decomposition -> EDA (Viasual)

In [860]:
train_data = pd.read_csv("./dataset/train_data.csv")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463 entries, 0 to 2462
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   air_inflow     2463 non-null   float64
 1   air_end_temp   2463 non-null   float64
 2   out_pressure   2463 non-null   float64
 3   motor_current  2463 non-null   float64
 4   motor_rpm      2463 non-null   int64  
 5   motor_temp     2463 non-null   float64
 6   motor_vibe     2463 non-null   float64
 7   type           2463 non-null   int64  
dtypes: float64(6), int64(2)
memory usage: 154.1 KB


In [861]:
features = train_data.columns[:-1]
features

Index(['air_inflow', 'air_end_temp', 'out_pressure', 'motor_current',
       'motor_rpm', 'motor_temp', 'motor_vibe'],
      dtype='object')

In [862]:
X = train_data[features]
X.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe
0,1.59,41.0,0.7,20.53,1680,58.67,2.93
1,2.97,59.28,0.7,38.4,3142,74.91,3.75


In [863]:
y = train_data[['type']]
y.head(2)

Unnamed: 0,type
0,0
1,0


In [864]:
sc = StandardScaler()
X[features]= sc.fit_transform(X)
X.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe
0,-0.448257,-0.917212,-1.110223e-16,-0.511748,-1.007046,-0.910351,-0.331821
1,0.699749,1.17832,-1.110223e-16,0.802045,1.118785,1.172883,0.188462
2,-0.182053,-0.425427,-1.110223e-16,-0.202966,-0.508305,-0.421612,-0.211268
3,0.200616,0.26697,-1.110223e-16,0.230799,0.194005,0.265958,-0.039955
4,-0.190372,-0.434598,-1.110223e-16,-0.208848,-0.517029,-0.430592,-0.211268


In [865]:
sc_df = X
sc_df['type'] = y
sc_df.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,-0.448257,-0.917212,-1.110223e-16,-0.511748,-1.007046,-0.910351,-0.331821,0
1,0.699749,1.17832,-1.110223e-16,0.802045,1.118785,1.172883,0.188462,0


In [866]:
sc_df['type'].value_counts()

0    432
1    369
2    366
3    306
4    306
5    249
6    249
7    186
Name: type, dtype: int64

In [867]:
# PCA 대상 그룹핑으로 나누기
g1 = sc_df[['air_inflow','motor_current']]
g2 = sc_df[['air_end_temp', 'motor_rpm', 'motor_temp']]
g3 = sc_df[['motor_vibe', 'out_pressure']]
y = sc_df[['type']]
combined = [g1, g2, g3]

In [868]:
pca = PCA(n_components=1)
pca_list = []
for i in combined:
    result = pca.fit_transform(i)
    pca_list.append(result)
    total_var = pca.explained_variance_ratio_.sum() * 100
    print(f"{i}total_var : {total_var}")
    
flat_pca_list = []
for i in pca_list:
    result = [j for sub in i for j in sub]
    flat_pca_list.append(result)
pca_df = pd.DataFrame({"PC1": flat_pca_list[0], "PC2": flat_pca_list[1], "PC3": flat_pca_list[2]})
pca_df = pd.concat([pca_df, y], axis=1)
pca_df.info()

      air_inflow  motor_current
0      -0.448257      -0.511748
1       0.699749       0.802045
2      -0.182053      -0.202966
3       0.200616       0.230799
4      -0.190372      -0.208848
...          ...            ...
2458    0.125746       0.149927
2459   -0.073907      -0.084600
2460   -0.781013      -0.889638
2461   -0.764375      -0.871259
2462    0.491777       0.566048

[2463 rows x 2 columns]total_var : 99.7292496685907
      air_end_temp  motor_rpm  motor_temp
0        -0.917212  -1.007046   -0.910351
1         1.178320   1.118785    1.172883
2        -0.425427  -0.508305   -0.421612
3         0.266970   0.194005    0.265958
4        -0.434598  -0.517029   -0.430592
...            ...        ...         ...
2458      0.137432   0.063140    0.137680
2459     -0.236279  -0.316369   -0.234326
2460     -1.520194  -1.619204   -1.510693
2461     -1.490389  -1.588669   -1.481189
2462      0.801170   0.736368    0.798312

[2463 rows x 3 columns]total_var : 99.47205314856232
     

In [869]:
tdf0 = pca_df[sc_df.type == 0]
tdf1 = pca_df[sc_df.type == 1]
tdf2 = pca_df[sc_df.type == 2]
tdf3 = pca_df[sc_df.type == 3]
tdf4 = pca_df[sc_df.type == 4]
tdf5 = pca_df[sc_df.type == 5]
tdf6 = pca_df[sc_df.type == 6]
tdf7 = pca_df[sc_df.type == 7]
tdf = [tdf0, tdf1, tdf2, tdf3, tdf4, tdf5, tdf6, tdf7]

In [870]:
tdf2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 366 entries, 801 to 1166
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PC1     366 non-null    float64
 1   PC2     366 non-null    float64
 2   PC3     366 non-null    float64
 3   type    366 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 14.3 KB


In [871]:
def draw_chart(df):
    fig = px.scatter_3d(
        df, x='PC1', y='PC2', z='PC3', color='type',symbol='type', opacity=0.5, size_max=10,height=600
    )
    fig.show()

In [872]:
for i in tdf:
    draw_chart(i)

### DBSCAN

In [873]:
target_df = tdf7
X_train = target_df.iloc[:,:-1]
y_train = target_df[['type']]

In [874]:
# dbscan = DBSCAN(eps=0.1, min_samples=8)
# dbscan.fit_predict(X_train)

In [875]:
epsilons = np.linspace(0.1, 1, num=15)
epsilons

array([0.1       , 0.16428571, 0.22857143, 0.29285714, 0.35714286,
       0.42142857, 0.48571429, 0.55      , 0.61428571, 0.67857143,
       0.74285714, 0.80714286, 0.87142857, 0.93571429, 1.        ])

In [876]:
min_samples = np.arange(2, 20, step=3)
min_samples

array([ 2,  5,  8, 11, 14, 17])

In [877]:
import itertools

combinations = list(itertools.product(epsilons, min_samples))
combinations

[(0.1, 2),
 (0.1, 5),
 (0.1, 8),
 (0.1, 11),
 (0.1, 14),
 (0.1, 17),
 (0.1642857142857143, 2),
 (0.1642857142857143, 5),
 (0.1642857142857143, 8),
 (0.1642857142857143, 11),
 (0.1642857142857143, 14),
 (0.1642857142857143, 17),
 (0.2285714285714286, 2),
 (0.2285714285714286, 5),
 (0.2285714285714286, 8),
 (0.2285714285714286, 11),
 (0.2285714285714286, 14),
 (0.2285714285714286, 17),
 (0.2928571428571429, 2),
 (0.2928571428571429, 5),
 (0.2928571428571429, 8),
 (0.2928571428571429, 11),
 (0.2928571428571429, 14),
 (0.2928571428571429, 17),
 (0.3571428571428572, 2),
 (0.3571428571428572, 5),
 (0.3571428571428572, 8),
 (0.3571428571428572, 11),
 (0.3571428571428572, 14),
 (0.3571428571428572, 17),
 (0.4214285714285715, 2),
 (0.4214285714285715, 5),
 (0.4214285714285715, 8),
 (0.4214285714285715, 11),
 (0.4214285714285715, 14),
 (0.4214285714285715, 17),
 (0.48571428571428577, 2),
 (0.48571428571428577, 5),
 (0.48571428571428577, 8),
 (0.48571428571428577, 11),
 (0.48571428571428577, 14),

In [878]:
N = len(combinations)
N

90

In [879]:
def get_scores_and_labels(combinations, X):
  scores = []
  all_labels_list = []

  for i, (eps, num_samples) in enumerate(combinations):
    dbscan_cluster_model = DBSCAN(eps=eps, min_samples=num_samples).fit(X)
    labels = dbscan_cluster_model.labels_
    labels_set = set(labels)
    num_clusters = len(labels_set)
    if -1 in labels_set:
      num_clusters -= 1
    
    if (num_clusters < 2) or (num_clusters > 50):
      scores.append(-10)
      all_labels_list.append('bad')
      c = (eps, num_samples)
      print(f"Combination {c} on iteration {i+1} of {N} has {num_clusters} clusters. Moving on")
      continue
    scores.append(ss(X, labels))
    all_labels_list.append(labels)
    print(f"Index: {i}, Score: {scores[-1]}, Labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")

  best_index = np.argmax(scores)
  best_parameters = combinations[best_index]
  best_labels = all_labels_list[best_index]
  best_score = scores[best_index]

  return {'best_epsilon': best_parameters[0],
          'best_min_samples': best_parameters[1], 
          'best_labels': best_labels,
          'best_score': best_score}



In [880]:
best_dict = get_scores_and_labels(combinations, X_train)

Index: 0, Score: 0.34212131274030255, Labels: [ 0  1  2  3  4  5  5  1  5  6  4  2  5  2  5  3  7  3  3  7  1  4  7  4
  6  5  1  7  2  5  1  3  5  5  4  4  7  5  1  5  5  1  6  1  3  5  3  3
  6  7  2  7  5  7  4  7  5  4  5  5  0  6  7  2  6  1  5  1  5  2  5  3
  2  6  3  7  6  5  1  6  7  5  5  6  4  5  6  8 -1  1  8  4  5  4  5  5
  5  5  8  7  4  7  1  5  4  4  1  4  3  2  7  2  6  5  5  5  2  5  6  5
  4  3  3  4  9  5  6  4  3  8  4  5  7  7  3  7  1  5  3  7  1  7  5  7
  6  1  0  5  5  1  4  4  6  6  6  7  2  3  3  1  7  7  5  9  4  3  5  5
  6  5  5  7  5  3  7  4  5  1  3  1  5  6  2  3  3  1], NumClusters: 10
Index: 1, Score: 0.47912160287632083, Labels: [-1  7  4  5  0  1  1  7  2  3  0  4  1  4  1  5  6  5  5  6  7  0  6  0
  3  1  7  6  4  1  7  5  8  1  0  0  6  2  7  2  1 -1  3 -1  5  2  5  5
  3  6  4  6  1  6  0  6  2  0  1  1 -1  3  6  4  3  7  2 -1  2  4  1  5
  4  3  5  6  3  2 -1  3  6  1  1  3  0  1  3 -1 -1  7 -1  0  1  0  2  1
  2  1 -1  6  0  6  7  2  0  0  

Combination (0.48571428571428577, 5) on iteration 38 of 90 has 1 clusters. Moving on
Combination (0.48571428571428577, 8) on iteration 39 of 90 has 1 clusters. Moving on
Combination (0.48571428571428577, 11) on iteration 40 of 90 has 1 clusters. Moving on
Combination (0.48571428571428577, 14) on iteration 41 of 90 has 1 clusters. Moving on
Combination (0.48571428571428577, 17) on iteration 42 of 90 has 1 clusters. Moving on
Combination (0.55, 2) on iteration 43 of 90 has 1 clusters. Moving on
Combination (0.55, 5) on iteration 44 of 90 has 1 clusters. Moving on
Combination (0.55, 8) on iteration 45 of 90 has 1 clusters. Moving on
Combination (0.55, 11) on iteration 46 of 90 has 1 clusters. Moving on
Combination (0.55, 14) on iteration 47 of 90 has 1 clusters. Moving on
Combination (0.55, 17) on iteration 48 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 2) on iteration 49 of 90 has 1 clusters. Moving on
Combination (0.6142857142857143, 5) on iteration 50 of 90 has 1 c

In [881]:
best_dict

{'best_epsilon': 0.1642857142857143,
 'best_min_samples': 2,
 'best_labels': array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
        1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        1, 1, 0, 1, 1, 1, 0, 0, 0, 1], dtype=int64),
 'best_score': 0.6539291521493606}

In [882]:
X_train['cluster'] = best_dict['best_labels']
X_train['cluster'].value_counts()

1    95
0    91
Name: cluster, dtype: int64

In [883]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186 entries, 2277 to 2462
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PC1      186 non-null    float64
 1   PC2      186 non-null    float64
 2   PC3      186 non-null    float64
 3   cluster  186 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 7.3 KB


In [884]:
fig = px.scatter_3d(
    X_train, x='PC1', y='PC2', z='PC3', color='cluster',symbol='cluster', opacity=0.5, size_max=10,height=600,
)
fig.show()

In [885]:
# joblib.dump(dbscan, './models/dbscan_02.joblib')