In [1]:
import pandas as pd

def _read_train_csv(csv_path):
    """Read one training CSV and return it indexed by its parsed 'timestamp' column."""
    frame = pd.read_csv(csv_path)
    # The timestamp strings are parsed with a two-digit-year pattern (yy/mm/dd HH:MM).
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], format='%y/%m/%d %H:%M')
    return frame.set_index('timestamp')


# Training frames, each indexed by timestamp.
df_A = _read_train_csv("../data/train/TRAIN_A.csv")
df_B = _read_train_csv("../data/train/TRAIN_B.csv")

# One sample file from each test group, loaded as-is.
test_C = pd.read_csv("../data/test/C/TEST_C_0000.csv")
test_D = pd.read_csv("../data/test/D/TEST_D_0000.csv")

In [2]:
# Feature columns read from the C test files (P1..P8) and the D test files (P1..P6).
x_c_col = [f'P{idx}' for idx in range(1, 9)]
x_d_col = [f'P{idx}' for idx in range(1, 7)]

# Column holding the anomaly label in the training frames.
y_col = 'anomaly'

In [3]:
# Keep only the feature columns plus the label from each training frame.
selected_columns = [*x_c_col, y_col]
df_A_C = df_A[selected_columns]
df_B_C = df_B[selected_columns]

# Stack A on top of B; each frame keeps its own index values.
df_C = pd.concat([df_A_C, df_B_C])

In [4]:
from sklearn.preprocessing import MinMaxScaler

# One min-max scaler shared by all feature columns, mapping each column to [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit from the unscaled per-source frames rather than from df_C itself, so that
# re-running this cell does not re-fit the scaler on values that were already
# scaled (which would silently reset its learned min/max to 0 and 1).
# The rows are concatenated in the same order used to build df_C.
raw_features = pd.concat([df_A_C[x_c_col], df_B_C[x_c_col]])
df_C.loc[:, x_c_col] = scaler.fit_transform(raw_features)

df_C.head()

Unnamed: 0_level_0,P1,P2,P3,P4,P5,P6,P7,P8,anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-05-27 00:00:00,0.054363,0.102138,0.038389,0.034544,0.035041,0.039863,0.609078,0.025795,0
2024-05-27 00:01:00,0.054363,0.098943,0.04087,0.036465,0.038125,0.043725,0.605461,0.025373,0
2024-05-27 00:02:00,0.056457,0.098943,0.03297,0.033707,0.033943,0.034179,0.606383,0.021824,0
2024-05-27 00:03:00,0.056457,0.098943,0.03297,0.033707,0.033943,0.034179,0.606383,0.021824,0
2024-05-27 00:04:00,0.052269,0.102138,0.034102,0.033481,0.036793,0.035539,0.609929,0.021004,0


In [5]:
from sklearn.model_selection import train_test_split

# Separate the dependent variable (y) from the independent variables (X).
y = df_C['anomaly']
X = df_C.drop(columns=['anomaly'])

# Cut at the 80% row position instead of shuffling, so the row order is kept.
split_idx = int(len(X) * 0.8)
x_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
x_val, y_val = X.iloc[split_idx:], y.iloc[split_idx:]

In [6]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score

def fit_column_forest(column, contamination=0.05, random_state=42):
    """Fit an IsolationForest on a single column of ``x_train``.

    Parameters
    ----------
    column : str
        Name of the ``x_train`` column to train on.
    contamination : float, default 0.05
        Expected outlier ratio, passed to ``IsolationForest``.
    random_state : int, default 42
        Seed passed to ``IsolationForest``.

    Returns
    -------
    IsolationForest
        The fitted single-feature model.
    """
    forest = IsolationForest(contamination=contamination, random_state=random_state)
    # The estimator needs a 2-D array, so the column is reshaped to (n_rows, 1).
    forest.fit(x_train[column].values.reshape(-1, 1))
    return forest


# Train one independent single-feature model per column. The individual
# model_pN names are kept because the inference cell refers to them directly.
(
    model_p1, model_p2, model_p3, model_p4,
    model_p5, model_p6, model_p7, model_p8,
) = (
    fit_column_forest(column)
    for column in ('P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8')
)

# RUN

In [7]:
from tqdm import tqdm

from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [8]:
# Submission template: one row per test-file ID, with a 'flag_list' column to fill in.
sample_submission_path = "../data/sample_submission.csv"
sub_df = pd.read_csv(sample_submission_path)

In [9]:
# Preview the first five rows of the submission template.
sub_df.head(5)

Unnamed: 0,ID,flag_list
0,TEST_C_0000,"[0, 0, 0, 0, 0, 0, 0, 0]"
1,TEST_C_0001,"[0, 0, 0, 0, 0, 0, 0, 0]"
2,TEST_C_0002,"[0, 0, 0, 0, 0, 0, 0, 0]"
3,TEST_C_0003,"[0, 0, 0, 0, 0, 0, 0, 0]"
4,TEST_C_0004,"[0, 0, 0, 0, 0, 0, 0, 0]"


In [18]:
# Per-column anomaly models fitted earlier in the notebook, keyed by column name.
column_models = {
    'P1': model_p1, 'P2': model_p2, 'P3': model_p3, 'P4': model_p4,
    'P5': model_p5, 'P6': model_p6, 'P7': model_p7, 'P8': model_p8,
}


def scale_like_training(window, columns):
    """Scale ``columns`` of ``window`` with the already-fitted training ``scaler``.

    The scaler is only read here, never re-fitted. Re-fitting it on each test
    file would overwrite the training statistics and put every window on its
    own [0, 1] scale, which is not the scale the per-column models were
    trained on.

    The per-feature ``scale_`` / ``min_`` parameters are applied directly
    (``X * scale_ + min_``, which is what ``MinMaxScaler.transform`` computes)
    so that a subset of the fitted columns, such as P1..P6, can be scaled too.

    Parameters
    ----------
    window : pandas.DataFrame
        Raw test window as read from CSV.
    columns : list of str
        Feature columns to keep and scale; each must be a column the scaler
        was fitted on.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``columns``, scaled.
    """
    fitted_names = list(getattr(scaler, 'feature_names_in_', x_c_col))
    positions = [fitted_names.index(name) for name in columns]
    return window[columns] * scaler.scale_[positions] + scaler.min_[positions]


def forecast_next_value(series):
    """Return the one-step-ahead exponential-smoothing forecast of ``series``."""
    smoother = ExponentialSmoothing(
        series,
        # Kept from the original call; with seasonal=None no seasonal
        # component is fitted, so this is plain level-only smoothing.
        seasonal_periods=24,
        trend=None,
        seasonal=None
    )
    # forecast(1) yields a length-1 Series; take the element explicitly rather
    # than calling float() on an array, which newer NumPy versions deprecate.
    return float(smoother.fit().forecast(1).iloc[0])


def flag_forecasts(window, columns):
    """Flag, per column, whether the forecast next value looks anomalous.

    Returns a list of plain Python ints (1 = anomaly, 0 = normal), one per
    entry of ``columns`` and in the same order.
    """
    flags = []
    for name in columns:
        next_value = forecast_next_value(window[name])
        # IsolationForest labels each row independently, so predicting on the
        # single forecast row gives the same label as predicting the whole
        # window and keeping only the last element.
        label = column_models[name].predict([[next_value]])[0]
        # Plain int so the list serialises as "[0, 1, ...]" whatever repr
        # NumPy uses for its scalar types.
        flags.append(int(label == -1))
    return flags


# The column must be object-typed to hold one Python list per cell.
sub_df['flag_list'] = sub_df['flag_list'].astype(object)

# Iterate by index label so the write-back does not depend on a 0..n-1 index.
for row_label, sample_id in tqdm(sub_df['ID'].items(), total=len(sub_df)):
    # IDs look like TEST_<group>_<number>; group "C" uses P1..P8, any other
    # group is read from the D directory with P1..P6.
    if sample_id.split('_')[1] == "C":
        group_dir, columns = "C", x_c_col
    else:
        group_dir, columns = "D", x_d_col

    raw_window = pd.read_csv(f"../data/test/{group_dir}/{sample_id}.csv")
    scaled_window = scale_like_training(raw_window, columns)

    sub_df.at[row_label, 'flag_list'] = flag_forecasts(scaled_window, columns)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 5658/5658 [35:24<00:00,  2.66it/s]


In [22]:
# Write the filled-in submission without the DataFrame index column.
submission_path = "../data/sub/sub_001.csv"
sub_df.to_csv(submission_path, index=False)