In [1]:
# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting plotly-resampler>=0.8.3.1
  Downloading plotly_resampler-0.8.3.2.tar.gz (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting scikit-plot>=0.3.7
  Downloading scikit_plot-0.3.7-py3-none-any.whl (3

In [3]:
import random
import pandas as pd
import numpy as np
import os
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pycaret.anomaly import *

from sklearn.manifold import TSNE

from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

In [4]:
# Random Seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(69) # Seed 고정

In [5]:
pd.set_option('display.max_columns', None)

## Data Load

In [6]:
data_path = '/content/drive/MyDrive/Colab Notebooks/AI_SPARK/Data'
train = pd.read_csv(data_path+'/train_data.csv')
test = pd.read_csv(data_path+'/test_data.csv')

In [7]:
test

Unnamed: 0,air_inflow,air_end_temp,out_pressure,motor_current,motor_rpm,motor_temp,motor_vibe,type
0,2.51,53.28,0.7,32.54,2662.0,69.58,3.48,0
1,2.66,55.24,0.7,34.45,2819.0,71.32,3.57,0
2,1.72,42.74,0.7,22.23,1819.0,60.21,3.01,0
3,2.20,49.15,0.7,28.50,2332.0,65.91,3.30,0
4,2.06,47.28,0.7,26.67,2182.0,64.24,3.21,0
...,...,...,...,...,...,...,...,...
7384,2.12,48.08,0.7,27.45,2246.0,64.96,3.25,7
7385,1.48,39.63,0.7,19.19,1570.0,57.44,2.87,7
7386,1.56,40.61,0.7,20.15,1649.0,58.32,2.92,7
7387,1.59,40.99,0.7,20.52,1679.0,58.66,2.93,7


## Pre-Processing

#### 1) '마력' 변수 생성

In [8]:
train.loc[train['type'] == 0, 'HP'] = 30
train.loc[train['type'] == 1, 'HP'] = 20
train.loc[train['type'] == 2, 'HP'] = 10
train.loc[train['type'] == 3, 'HP'] = 50
train.loc[train['type'] == 4, 'HP'] = 30
train.loc[train['type'] == 5, 'HP'] = 30
train.loc[train['type'] == 6, 'HP'] = 30
train.loc[train['type'] == 7, 'HP'] = 30

test.loc[test['type'] == 0, 'HP'] = 30
test.loc[test['type'] == 1, 'HP'] = 20
test.loc[test['type'] == 2, 'HP'] = 10
test.loc[test['type'] == 3, 'HP'] = 50
test.loc[test['type'] == 4, 'HP'] = 30
test.loc[test['type'] == 5, 'HP'] = 30
test.loc[test['type'] == 6, 'HP'] = 30
test.loc[test['type'] == 7, 'HP'] = 30

#### 2) 변수 생성

In [9]:
train['volt'] = ((1 / 0.746) * train['HP']) / train['motor_current']
test['volt'] = ((1 / 0.746) * test['HP']) / test['motor_current']

In [10]:
train['torque'] = train['HP'] / train['motor_rpm']
test['torque'] = test['HP'] / test['motor_rpm']

In [11]:
train['각속도'] = ((1 / 0.746) * train['HP']) / train['torque']
test['각속도'] = ((1 / 0.746) * test['HP']) / test['torque']

In [12]:
train['회전수/진동'] = train['motor_rpm'] / train['motor_vibe']
test['회전수/진동'] = test['motor_rpm'] / test['motor_vibe']

In [13]:
train['회전수/전류'] = train['motor_rpm'] / train['motor_current']
test['회전수/전류'] = test['motor_rpm'] / test['motor_current']

In [14]:
train['temp'] = train['air_end_temp'] + train['motor_temp']
test['temp'] = test['air_end_temp'] + test['motor_temp']

In [15]:
train['회전수/유량'] = train['motor_rpm'] / train['air_inflow']
test['회전수/유량'] = test['motor_rpm'] / test['air_inflow']

In [16]:
train['진동수/유량'] = train['motor_vibe'] / train['air_inflow']
test['진동수/유량'] = test['motor_vibe'] / test['air_inflow']

In [17]:
train['전류/진동수'] = train['motor_current'] / train['motor_vibe']
test['전류/진동수'] = test['motor_current'] / test['motor_vibe']

In [18]:
train['주기'] = (1 / (60 * train['motor_rpm'])) * 100000
test['주기'] = (1 / (60 * test['motor_rpm'])) * 100000

## t-sne + polynomial_features

In [51]:
anom = setup(data = train, verbose = 0, session_id = 69, normalize = False, polynomial_features = True, use_gpu=True, polynomial_degree=3)
anom_train = anom.X_train_transformed

In [52]:
anom = setup(data = test, verbose = 0, session_id = 69, normalize = False, polynomial_features = True, polynomial_degree=3 , use_gpu=True)
anom_test = anom.X_train_transformed

In [53]:
tsne_df = pd.concat([anom_train,anom_test])

In [54]:
model = TSNE(3)

tsne_data = model.fit_transform(tsne_df)

In [55]:
tsne_data.shape

(9852, 3)

In [56]:
tsne_train = tsne_data[:2463]
tsne_test = tsne_data[2463:]

In [57]:
kf = KFold(n_splits=20)
model_list = []

for train_index, test_index in kf.split(tsne_train):
    X_train, X_test = tsne_train[train_index], tsne_train[test_index]

    anom = setup(data = X_train, verbose = 0, session_id = 69, normalize = False, )
    model = create_model('abod', fraction=0.02)
    model_list.append(model)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [58]:
pred_list = []

for m in model_list:
  pred = m.predict(tsne_test)
  pred_list.append(pred)

In [59]:
preds = 1*(np.sum(pred_list,axis=0) >= len(model_list))
# 20 voting 실패


In [61]:
list(preds).count(0), list(preds).count(1)

(7081, 308)

## 언더샘플링 + polynomial_features + tsne

In [63]:
import imblearn
from imblearn.under_sampling import ClusterCentroids

label = train['type']
sm = ClusterCentroids(sampling_strategy='auto', )
sm_train, label = sm.fit_resample(train, label)
print(train.shape)
print(sm_train.shape)

anom = setup(data = sm_train, verbose = 0, session_id = 69, normalize = False, polynomial_features = True, use_gpu=True, polynomial_degree=3)
anom_train = anom.X_train_transformed

anom = setup(data = test, verbose = 0, session_id = 69, normalize = False, polynomial_features = True, polynomial_degree=3 , use_gpu=True)
anom_test = anom.X_train_transformed


tsne_df = pd.concat([anom_train,anom_test])

model = TSNE(3)

tsne_data = model.fit_transform(tsne_df)

(2463, 19)
(1488, 19)


In [64]:
tsne_train = tsne_data[:sm_train.shape[0]]
tsne_test = tsne_data[sm_train.shape[0]:]

label = train['type']
kf = KFold(n_splits=20)
model_list = []

for train_index, test_index in kf.split(tsne_train):
    X_train, X_test = tsne_train[train_index], tsne_train[test_index]

    anom = setup(data = X_train, verbose = 0, session_id = 69, normalize = False, )
    model = create_model('abod', fraction=0.02)
    model_list.append(model)


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [65]:
pred_list = []

for m in model_list:
  pred = m.predict(tsne_test)
  pred_list.append(pred)


In [70]:
preds = 1*(np.sum(pred_list,axis=0) >= 9)

In [71]:
list(preds).count(0), list(preds).count(1)

(7086, 303)

## polynomial_features 삭제

In [44]:
tsne_df = pd.concat([train,test])

model = TSNE(3)

tsne_data = model.fit_transform(tsne_df)

tsne_train = tsne_data[:2463]
tsne_test = tsne_data[2463:]


In [45]:
kf = KFold(n_splits=20)
model_list = []

for train_index, test_index in kf.split(tsne_train):
    X_train, X_test = tsne_train[train_index], tsne_train[test_index]

    anom = setup(data = X_train, verbose = 0, session_id = 69, normalize = False, )
    model = create_model('abod', fraction=0.02)
    model_list.append(model)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [46]:
pred_list = []

for m in model_list:
  pred = m.predict(tsne_test)
  pred_list.append(pred)

In [49]:
preds = 1*(np.sum(pred_list,axis=0) >= len(model_list)-1)

In [50]:
list(preds).count(0), list(preds).count(1)

(7018, 371)

# Submission

In [68]:
submit = pd.read_csv(data_path+'/answer_sample.csv')
submit['label'] = pred
submit.to_csv('ABOD_20fold_TSNE_세은최고.csv', index=False)