In [35]:
import random
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA


from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore')

In [36]:
# Random Seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(69) # Seed 고정

## Data Load

In [37]:
train = pd.read_csv('./data/train_data.csv')
test = pd.read_csv('./data/test_data.csv')

In [38]:
train.head(2)

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,1.59,41.0,0.7,20.53,1680.0,58.67,2.93,0
1,2.97,59.28,0.7,38.4,3142.0,74.91,3.75,0


In [39]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463 entries, 0 to 2462
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   air_inflow     2463 non-null   float64
 1   air_end_temp   2463 non-null   float64
 2   out_pressure   2463 non-null   float64
 3   motor_current  2463 non-null   float64
 4   motor_rpm      2463 non-null   float64
 5   motor_temp     2463 non-null   float64
 6   motor_vibe     2463 non-null   float64
 7   type           2463 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 154.1 KB


## EDA

#### type

In [19]:
train['type'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)

## Pre-Processing

#### Scaling

In [40]:
num_features = ['air_inflow', 'air_end_temp', 'out_pressure',
                'motor_current', 'motor_rpm', 'motor_temp',
                'motor_vibe']

In [41]:
scaler = MinMaxScaler()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])

#### One Hot Encoding

In [42]:
# int -> str
train['type'] = train['type'].astype(str)
test['type'] = test['type'].astype(str)

In [43]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [44]:
train.head()

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7
0,0.213922,0.272396,0.0,0.227873,0.202708,0.275531,0.048241,1,0,0,0,0,0,0,0
1,0.448217,0.827513,0.0,0.49595,0.821413,0.813992,0.089447,1,0,0,0,0,0,0,0
2,0.268251,0.402672,0.0,0.290879,0.347863,0.401857,0.057789,1,0,0,0,0,0,0,0
3,0.34635,0.586092,0.0,0.379388,0.552264,0.579576,0.071357,1,0,0,0,0,0,0,0
4,0.266553,0.400243,0.0,0.289679,0.345324,0.399536,0.057789,1,0,0,0,0,0,0,0


#### Drop column

In [45]:
train.drop('out_pressure', axis=1, inplace=True)
test.drop('out_pressure', axis=1, inplace=True)

## Train

In [46]:
# PCA는 웬만하면 스케일링 하는게 좋을껄?
pca = PCA()
train_pca = pca.fit_transform(train)

In [47]:
model = IsolationForest(max_samples=256, n_estimators=200, random_state=69)
model.fit(train_pca)

IsolationForest(max_samples=256, n_estimators=200, random_state=69)

## Predict

### Isolation Forest일 경우

In [29]:
# IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
def get_pred_label(model_pred):
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [30]:
pred_test = model.predict(test)
pred_test = get_pred_label(pred_test)

#### 1845 5544 이 결과가 좀 이상하긴 해용

In [31]:
# 정상 (0), 이상 (1)
list(pred_test).count(0), list(pred_test).count(1)

(1845, 5544)

## Submission

In [49]:
submit = pd.read_csv('./data/answer_sample.csv')

In [50]:
submit['label'] = pred_test
submit.head()

Unnamed: 0,type,label
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0


In [51]:
submit.to_csv('./submit/pca_isolation2.csv', index=False)