In [397]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import pairwise_distances
import plotly.express as px
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score as ss


import joblib
import warnings
warnings.filterwarnings(action='ignore') 

In [398]:
test_data = pd.read_csv("./dataset/test_data.csv")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7389 entries, 0 to 7388
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   air_inflow     7389 non-null   float64
 1   air_end_temp   7389 non-null   float64
 2   out_pressure   7389 non-null   float64
 3   motor_current  7389 non-null   float64
 4   motor_rpm      7389 non-null   int64  
 5   motor_temp     7389 non-null   float64
 6   motor_vibe     7389 non-null   float64
 7   type           7389 non-null   int64  
dtypes: float64(6), int64(2)
memory usage: 461.9 KB


In [399]:
features = test_data.columns[:-1]
features

Index(['air_inflow', 'air_end_temp', 'out_pressure', 'motor_current',
       'motor_rpm', 'motor_temp', 'motor_vibe'],
      dtype='object')

In [400]:
X = test_data[features]
X.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe
0,2.51,53.28,0.7,32.54,2662,69.58,3.48
1,2.66,55.24,0.7,34.45,2819,71.32,3.57


In [401]:
y = test_data[['type']]
y.head(2)

Unnamed: 0,type
0,0
1,0


In [402]:
sc = StandardScaler()
X[features]= sc.fit_transform(X)
X.head(3)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe
0,0.34957,0.487848,0.029701,0.387062,0.419055,0.491978,0.208765
1,0.478445,0.713509,0.029701,0.531396,0.647355,0.716757,0.348482
2,-0.329171,-0.725652,0.029701,-0.39204,-0.806787,-0.718468,-0.520873


In [403]:
sc_df = X
sc_df['type'] = y
sc_df.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,0.34957,0.487848,0.029701,0.387062,0.419055,0.491978,0.208765,0
1,0.478445,0.713509,0.029701,0.531396,0.647355,0.716757,0.348482,0


In [404]:
sc_df['type'].value_counts()

0    1296
1    1107
2    1098
3     918
4     918
5     747
6     747
7     558
Name: type, dtype: int64

In [405]:
# PCA 대상 그룹핑으로 나누기
g1 = sc_df[['air_inflow','motor_current']]
g2 = sc_df[['air_end_temp', 'motor_rpm', 'motor_temp']]
g3 = sc_df[['motor_vibe', 'out_pressure']]
y = sc_df[['type']]
combined = [g1, g2, g3]

In [406]:
pca = PCA(n_components=1)
pca_list = []
for i in combined:
    result = pca.fit_transform(i)
    pca_list.append(result)
    total_var = pca.explained_variance_ratio_.sum() * 100
    print(f"{i}total_var : {total_var}")
    
flat_pca_list = []
for i in pca_list:
    result = [j for sub in i for j in sub]
    flat_pca_list.append(result)
pca_df = pd.DataFrame({"PC1": flat_pca_list[0], "PC2": flat_pca_list[1], "PC3": flat_pca_list[2]})
pca_df = pd.concat([pca_df, y], axis=1)
pca_df.info()

      air_inflow  motor_current
0       0.349570       0.387062
1       0.478445       0.531396
2      -0.329171      -0.392040
3       0.083228       0.081769
4      -0.037055      -0.056520
...          ...            ...
7384    0.014495       0.002423
7385   -0.535371      -0.621765
7386   -0.466638      -0.549220
7387   -0.440863      -0.521260
7388   -0.346355      -0.408664

[7389 rows x 2 columns]total_var : 99.55080635654555
      air_end_temp  motor_rpm  motor_temp
0         0.487848   0.419055    0.491978
1         0.713509   0.647355    0.716757
2        -0.725652  -0.806787   -0.718468
3         0.012350  -0.060812    0.017876
4        -0.202949  -0.278934   -0.197860
...            ...        ...         ...
7384     -0.110842  -0.185868   -0.104848
7385     -1.083715  -1.168869   -1.076305
7386     -0.970885  -1.053991   -0.962624
7387     -0.927134  -1.010367   -0.918702
7388     -0.752132  -0.832962   -0.744304

[7389 rows x 3 columns]total_var : 99.22881044537634
    

In [407]:
tdf0 = pca_df[sc_df.type == 0]
tdf1 = pca_df[sc_df.type == 1]
tdf2 = pca_df[sc_df.type == 2]
tdf3 = pca_df[sc_df.type == 3]
tdf4 = pca_df[sc_df.type == 4]
tdf5 = pca_df[sc_df.type == 5]
tdf6 = pca_df[sc_df.type == 6]
tdf7 = pca_df[sc_df.type == 7]
tdf = [tdf0, tdf1, tdf2, tdf3, tdf4, tdf5, tdf6, tdf7]

In [408]:
def draw_chart(df):
    fig = px.scatter_3d(
        df, x='PC1', y='PC2', z='PC3', color='type',symbol='type', opacity=0.5, size_max=10,height=600
    )
    fig.show()

In [409]:
for i in tdf:
    draw_chart(i)

In [410]:
target_df = tdf7
X_test = target_df.iloc[:,:-1]
y_test = target_df[['type']]

In [411]:
eps = 0.1642857142857143
min_sample = 2

In [412]:
dbscan = DBSCAN(eps=eps, min_samples=min_sample)
dbscan.fit_predict(X_test)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [413]:
dbscan.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [414]:
X_test['label'] = dbscan.labels_
X_test = pd.concat([X_test, y_test], axis=1)
X_test.head()
X_test['label'].value_counts()

 0    554
-1      4
Name: label, dtype: int64

In [415]:
try: 
    sil_score = ss(X_test, X_test['label'])
    sil_score
    
except:
    pass

In [416]:
def 결과정리(label):
    if label == -1:
        return 1
    else:
        return 0

In [417]:
X_test['label'] = X_test['label'].apply(결과정리)

In [418]:
fig = px.scatter_3d(
    X_test, x='PC1', y='PC2', z='PC3', color='label',symbol='label', opacity=0.5, size_max=10,height=800,
)
fig.show()

In [419]:
submit_df = X_test[['type', 'label']]
submit_df['label'].value_counts()

0    554
1      4
Name: label, dtype: int64

In [420]:
submit_df.to_csv('./submits/submit_07.csv', index=False)