In [1449]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import pairwise_distances
import plotly.express as px

import joblib
import warnings
warnings.filterwarnings(action='ignore') 

In [1450]:
# Import from the public IPython.display module: importing `display` from the
# private IPython.core.display path is deprecated since IPython 7.14 (and is
# no longer available in newer releases).
from IPython.display import display, HTML
# Inject CSS so rendered outputs may use the full width of the page.
# The commented-out lines are optional alternative tweaks (full-width container,
# hidden prompts).
# display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))
# display(HTML("<style>.prompt { display:none !important; }</style>"))

In [1451]:
# Load the test data CSV from the relative ./dataset directory.
test_data = pd.read_csv("./dataset/test_data.csv")

In [1452]:
# Show column names, dtypes and non-null counts to check the loaded schema.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7389 entries, 0 to 7388
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   air_inflow     7389 non-null   float64
 1   air_end_temp   7389 non-null   float64
 2   out_pressure   7389 non-null   float64
 3   motor_current  7389 non-null   float64
 4   motor_rpm      7389 non-null   int64  
 5   motor_temp     7389 non-null   float64
 6   motor_vibe     7389 non-null   float64
 7   type           7389 non-null   int64  
dtypes: float64(6), int64(2)
memory usage: 461.9 KB


In [1453]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
test_data.describe()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
count,7389.0,7389.0,7389.0,7389.0,7389.0,7389.0,7389.0,7389.0
mean,2.103129,49.042735,0.69984,27.417935,2373.820003,65.771623,3.345523,2.957369
std,1.163998,8.686208,0.005377,13.234089,687.737182,7.741473,0.644199,2.229811
min,0.34,32.08,0.49,5.37,1200.0,50.42,1.97,0.0
25%,1.28,41.51,0.7,17.2,1774.0,59.11,2.89,1.0
50%,1.94,48.76,0.7,26.17,2355.0,65.52,3.26,3.0
75%,2.77,56.58,0.7,35.86,2972.0,72.44,3.73,5.0
max,6.24,65.54,0.7,74.0,3564.0,82.39,6.26,7.0


In [1454]:
# Features are every column except the last one; the target is the `type` column
# kept as a one-column DataFrame.
X_test = test_data.iloc[:, : test_data.shape[1] - 1]
y_test = test_data.loc[:, ['type']]

In [1455]:
# Split the features into the column groups that are each reduced by PCA.
g1 = X_test.loc[:, ['air_inflow', 'motor_current']]
g2 = X_test.loc[:, ['air_end_temp', 'motor_rpm', 'motor_temp']]
g3 = X_test.loc[:, ['motor_vibe', 'out_pressure']]
combined = [g1, g2, g3]

In [1456]:
# Standardise every column group independently; the same scaler object is
# re-fitted on each group in turn.
# NOTE(review): the scaler is fitted on the test data itself here — confirm that
# this is intended rather than reusing a scaler fitted on the training data.
sc = StandardScaler()
# sc = MinMaxScaler()
scaled_list = [sc.fit_transform(group) for group in combined]
len(scaled_list)

3

In [1457]:
# Reduce each scaled column group to a single principal component and report
# how much of the group's variance that component explains.
# NOTE(review): PCA is fitted on the test data itself — confirm this is intended.
pca = PCA(n_components=1)
pca_list = []
for group_no, scaled_group in enumerate(scaled_list, start=1):
    component = pca.fit_transform(scaled_group)
    total_var = pca.explained_variance_ratio_.sum() * 100
    pca_list.append(component)
    # Print the group number, not the scaled array itself, so the output is
    # not flooded with array dumps.
    print(f"group {group_no} total_var : {total_var:.2f}")

[[ 0.34956982  0.38706193]
 [ 0.47844475  0.53139596]
 [-0.32917145 -0.39203953]
 ...
 [-0.46663804 -0.54922004]
 [-0.44086306 -0.52126004]
 [-0.34635478 -0.40866439]]total_var : 99.55080635654565
[[ 0.48784827  0.4190547   0.49197811]
 [ 0.71350861  0.64735503  0.71675676]
 [-0.7256517  -0.80678719 -0.71846784]
 ...
 [-0.97088462 -1.05399137 -0.96262396]
 [-0.92713414 -1.0103671  -0.91870169]
 [-0.75213225 -0.83296175 -0.74430447]]total_var : 99.22881044537628
[[ 0.20876468  0.02970139]
 [ 0.34848248  0.02970139]
 [-0.52087275  0.02970139]
 ...
 [-0.66059055  0.02970139]
 [-0.64506635  0.02970139]
 [-0.53639695  0.02970139]]total_var : 50.146303916486204


In [1458]:
# Inspect the single principal component computed for the first column group.
pca_list[0]

array([[ 0.52087731],
       [ 0.71406521],
       [-0.50997318],
       ...,
       [-0.71832014],
       [-0.68032377],
       [-0.53387918]])

In [1459]:
# Flatten each 2D component array into a flat 1D list of values.
flat_pca_list = [list(component.ravel()) for component in pca_list]

In [1460]:
# One column per group component: PC1, PC2, PC3.
pca_df = pd.DataFrame({f"PC{k}": flat_pca_list[k - 1] for k in (1, 2, 3)})
pca_df.head()

Unnamed: 0,PC1,PC2,PC3
0,0.520877,-0.807738,0.126617
1,0.714065,-1.199604,0.225412
2,-0.509973,1.299447,-0.389315
3,0.116671,0.017558,-0.070974
4,-0.066167,0.392344,-0.169769


In [1461]:
# Put the `type` column next to the three components (aligned on the index).
frames = [pca_df, y_test]
pca_test = pd.concat(frames, axis="columns")
pca_test.head()

Unnamed: 0,PC1,PC2,PC3,type
0,0.520877,-0.807738,0.126617,0
1,0.714065,-1.199604,0.225412,0
2,-0.509973,1.299447,-0.389315,0
3,0.116671,0.017558,-0.070974,0
4,-0.066167,0.392344,-0.169769,0


In [1462]:
# 3D scatter of the three components, coloured and marked by `type`.
# `labels` must be keyed by the DataFrame column names; the previous keys
# ('0', '1', '2') matched no column, so the axis titles were never renamed.
fig = px.scatter_3d(
    pca_test, x='PC1', y='PC2', z='PC3', color='type',symbol='type', opacity=0.5, size_max=10,height=600,
    labels={'PC1': 'PC 1', 'PC2': 'PC 2', 'PC3': 'PC 3'}
)
fig.show()

In [1463]:
# Load the persisted clustering estimator (presumably a scikit-learn DBSCAN) from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
dbscan = joblib.load('./models/dbscan.joblib')

In [1464]:
# Re-split into component features and the `type` column.
# `.copy()` makes X_test an independent frame: a later cell adds a `label`
# column to it, and assigning into an `iloc` slice is a chained-assignment
# hazard (the resulting SettingWithCopyWarning is hidden by the global
# warnings filter).
X_test = pca_test.iloc[:, :-1].copy()
y_test = pca_test[['type']]

In [1465]:
# Cluster on the three component columns only. Selecting them explicitly keeps
# this cell safe to re-run after later cells have added `label` / `type`
# columns to X_test.
# NOTE(review): DBSCAN has no separate predict step — fit_predict re-runs the
# clustering on the data passed in, so only the loaded estimator's
# hyper-parameters are reused, not clusters found on earlier data.
labels = dbscan.fit_predict(X_test[['PC1', 'PC2', 'PC3']])
labels

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [1466]:
# Attach each row's cluster label to the feature frame and display the result.
X_test['label'] = labels
X_test

Unnamed: 0,PC1,PC2,PC3,label
0,0.520877,-0.807738,0.126617,0
1,0.714065,-1.199604,0.225412,0
2,-0.509973,1.299447,-0.389315,0
3,0.116671,0.017558,-0.070974,0
4,-0.066167,0.392344,-0.169769,0
...,...,...,...,...
7384,0.011963,0.231736,-0.125860,0
7385,-0.818219,1.921813,-0.542996,0
7386,-0.718320,1.724716,-0.488110,0
7387,-0.680324,1.648911,-0.477133,0


In [1467]:
# Number of rows per cluster label (scikit-learn's DBSCAN marks noise points as -1).
X_test['label'].value_counts()

 0      3975
-1       462
 30       82
 28       75
 14       67
        ... 
 191       5
 190       4
 175       4
 64        4
 110       4
Name: label, Length: 193, dtype: int64

In [1468]:
# Append the `type` column to the labelled features.
# Any existing `type` column is dropped first so that re-running this cell does
# not append a duplicate column (errors='ignore' makes the first run a no-op).
X_test = pd.concat([X_test.drop(columns='type', errors='ignore'), y_test], axis=1)
X_test.head()

Unnamed: 0,PC1,PC2,PC3,label,type
0,0.520877,-0.807738,0.126617,0,0
1,0.714065,-1.199604,0.225412,0,0
2,-0.509973,1.299447,-0.389315,0,0
3,0.116671,0.017558,-0.070974,0,0
4,-0.066167,0.392344,-0.169769,0,0


In [1469]:
# 3D scatter of the components: colour encodes the cluster label, marker symbol
# encodes `type`.
fig = px.scatter_3d(
    X_test,
    x='PC1',
    y='PC2',
    z='PC3',
    color='label',
    symbol='type',
    opacity=0.5,
    size_max=10,
    height=600,
)
fig.show()

In [1470]:
# Summary statistics of the numeric columns, rounded to two decimals.
X_test.describe().round(2)

Unnamed: 0,PC1,PC2,PC3,label,type
count,7389.0,7389.0,7389.0,7389.0,7389.0
mean,0.0,0.0,0.0,31.61,2.96
std,1.41,1.73,1.0,49.71,2.23
min,-2.25,-3.17,-1.53,-1.0,0.0
25%,-1.05,-1.5,-0.52,0.0,1.0
50%,-0.16,0.04,-0.11,0.0,3.0
75%,0.85,1.51,0.4,56.0,5.0
max,4.9,3.25,27.96,191.0,7.0


In [1471]:
def 결과정리(label):
    """Map a cluster label to a binary flag.

    Returns 1 when ``label`` equals -1 and 0 for every other value.
    """
    return 1 if label == -1 else 0
        

In [1472]:
# Collapse the cluster labels into a binary flag (1 for label -1, else 0).
# The flag is derived from the raw `labels` array rather than from the column
# being overwritten, so the cell is idempotent: re-running the original
# in-place version mapped the already-binary column to all zeros and erased
# every flagged row.
X_test['label'] = pd.Series(labels, index=X_test.index).apply(결과정리)

In [1473]:
# 3D scatter of the components: colour encodes the binary label, marker symbol
# encodes `type`.
fig = px.scatter_3d(
    X_test,
    x='PC1',
    y='PC2',
    z='PC3',
    color='label',
    symbol='type',
    opacity=0.5,
    size_max=10,
    height=800,
)
fig.show()

In [1474]:
# Keep only the two columns written to the submission file.
submit_df = X_test.loc[:, ['type', 'label']]

In [1475]:
# Check how many rows carry each binary label before writing the file.
submit_df['label'].value_counts()

0    6927
1     462
Name: label, dtype: int64

In [1476]:
# Write the submission CSV without the index column.
# NOTE(review): assumes the ./submits directory already exists — confirm before running.
submit_df.to_csv('./submits/submit_20230409_08.csv', index=False)