# Classification for identifying abnormal instances

***
**Contest:** 2023년 지질자원 데이터 활용 및 인공지능 경진대회 (link [here](http://www.geodata-con.kr/2023/overview.php)).  
**Original:** <i><b>A Realistic and Public Dataset with Rare Undesirable Real Events in Oil Wells</b></i> published in the <i><b>Journal of Petroleum Science and Engineering</b></i> (link [here](https://doi.org/10.1016/j.petrol.2019.106223)).  
**Editor:** Jongwook Kim, Dogyun Kim  
**Advisor:** Jonggeun Choe  
**Last updated:** 08-16-2023
***

# 1. Imports and Configurations

In [2]:
import datetime
import os

import matplotlib.dates
import numpy as np
import torch.cuda
%load_ext autoreload
%autoreload 2
from pathlib import Path
from module import *
from argparse import ArgumentParser
from models import *
from copy import copy
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform

data_path = Path('data')
fix_seed()

# Event names listed in class-code order; the position in the tuple is the class code.
events_names = dict(enumerate((
    'Normal',
    'Abrupt Increase of BSW',
    'Spurious Closure of DHSV',
    'Severe Slugging',
    'Flow Instability',
    'Rapid Productivity Loss',
    'Quick Restriction in PCK',
    'Scaling in PCK',
    'Hydrate in Production Line',
)))

# Candidate sensor variables handed to Preprocessing().
# NOTE(review): this name shadows the builtin `vars`; it is kept because other cells use it.
vars = [
    'P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP',
    'T-JUS-CKP', 'P-JUS-CKGL', 'T-JUS-CKGL', 'QGL',
]

# Preprocessing settings. The ArgumentParser instance is only used as an attribute holder;
# nothing is parsed from the command line here.
args = ArgumentParser()
args.columns = ['timestamp', *vars, 'class']
args.abnormal_classes_codes = list(events_names)   # class codes 0..8
# Adjusts frequency of sampling to the dynamics of the undesirable event of interest
args.downsample_rate = 30
# In observations (after downsample)
args.sample_size_default = 60
# In observations (after downsample)
args.sample_size_normal_period = 5
# For selection of useful variables
args.max_nan_percent = 0.15
# For selection of useful variables
args.std_vars_min = 0.01
args.max_frozen_percent = 0.1
# For less output
args.disable_progressbar = True

In [3]:
# Run the project's preprocessing; it returns the cleaned frame, the retained variables
# and the instance tables (all, real, simulated, drawn).
(df_instances_clean, clean_vars, instances,
 real_instances, sim_instances, drawn_instances) = Preprocessing(args, vars, data_path)

# Copies that the steps below modify / replace.
df = copy(df_instances_clean)
new_instances = copy(instances)

# Multiply the two pressure columns of the drawn instances by 1e5
# (presumably a unit conversion to match the other sources -- TODO confirm).
is_drawn = df['source'] == 'drawn'
pressure_cols = ['P-TPT', 'P-MON-CKP']
df.loc[is_drawn, pressure_cols] = df.loc[is_drawn, pressure_cols] * 1e5

df, new_instances = make_augmentation(df, new_instances, real_instances, sim_instances, drawn_instances)
# Everything after the first 1984 rows (the logged total of original instances)
# is treated as an instance added by the augmentation step.
add_instances = new_instances[1984:]

number of total instances = 1984
number of real instances = 1025
number of simulated instances = 939
number of drawn instances = 20

Clean Variables: ['P-TPT', 'T-TPT', 'P-MON-CKP']


In [4]:
def _ids_and_classes(source_name):
    """Return (rows, unique instance ids, class codes) for one value of the 'source' column."""
    rows = df_instances_clean[df_instances_clean['source'] == source_name]
    ids = list(set(rows['instance_id']))
    classes = list(instances.loc[ids]['class_code'])
    return rows, ids, classes


# X / Y for each data source
df_real, real_id, real_class = _ids_and_classes('real')
df_sim, sim_id, sim_class = _ids_and_classes('simulated')
df_drawn, drawn_id, drawn_class = _ids_and_classes('drawn')

# Synthetic data = simulated followed by drawn
df_synthetic = pd.concat([df_sim, df_drawn], axis=0).reset_index(drop=True)
synthetic_id = [*sim_id, *drawn_id]
synthetic_class = [*sim_class, *drawn_class]

## 2. LSTM Autoencoder

In [7]:
# Split ratios and data-usage switches for the autoencoder stage.
args.train_ratio = 0.6
args.validation_ratio = 0.2
args.test_ratio = 0.2
RF = False
by_instance = True
use_synthetic = True
use_augmentation = True
integrate = False
integrate_syn = False

# Real_Only | With_Sim_Drawn | With_Sim_Drawn_Augmentation
# First "Each"  => separate scalers for real and synthetic data
# Second "Each" => within the synthetic data, separate scalers for simulated and drawn
# Byinstance    => every instance is scaled on its own
name = 'Byinstance_With_Sim_Drawn_Augmentation_multi'

# Create fig/Mission2/<class code>/ for every event class.
# makedirs(exist_ok=True) also creates the missing parents and, unlike nesting the
# loop under the "Mission2 does not exist" check, still creates the per-class
# folders when fig/Mission2 is already present.
for event in events_names:
    os.makedirs(os.path.join('fig', 'Mission2', str(int(event))), exist_ok=True)

# Model and training parameters
param = ArgumentParser()
param.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
param.num_epoch = 100
param.batch_size = 64
param.learning_rate = 0.003
param.window_size = 30
param.stride = 15

param.input_dim = len(clean_vars)
param.n_features = len(events_names)
param.hidden_dim = 128
param.num_layers = 3
param.vicinal_risk_minimization = False
param.masking = False
param.mask_ratio = 0.9
param.batch_max_mask_ratio = 0.25
param.sim_use = False
param.drawn_use = False

### Stratified sampling & Scaling

In [8]:
# Class-stratified split: the test set is drawn from real instances only; train and
# validation may additionally receive synthetic (simulated / drawn) and augmented
# instances of the same class, depending on use_synthetic / use_augmentation.
fix_seed()
train_id, valid_id, test_id = [], [], []
real_idx_dict = {}
train_idx_dict = {}
valid_idx_dict = {}
test_idx_dict = {}

# Offsets added to the row labels of sim_instances / drawn_instances to obtain their
# instance ids (the run logs 1025 real and 939 simulated instances).
# NOTE(review): hard-coded; must change if the dataset composition changes.
sim_id_offset = 1025
drawn_id_offset = 1964
# Share of the non-test instances that goes to validation.
valid_share = args.validation_ratio / (1 - args.test_ratio)

for class_code in set(real_instances['class_code']):
    real_idx_dict[class_code] = list(real_instances[real_instances['class_code'] == class_code].index.values)

for class_code in set(real_instances['class_code']):
    # 2023-08-11: Compatible with Simulated and Drawn dataset
    real_ids = real_idx_dict[class_code]
    n_test = ceil(args.test_ratio * len(real_ids))

    if n_test <= 1:
        # Very small class: hold out exactly one randomly chosen real instance.
        class_test_id = [int(real_ids.pop(np.random.randint(len(real_ids))))]
        # Copy, so the additions below do not leak into real_idx_dict.
        train_valid_id = list(real_ids)
    else:
        test_id_class = np.random.choice(real_ids, size=n_test, replace=False)
        class_test_id = list(test_id_class)
        train_valid_id = list(set(real_ids) - set(test_id_class))
    test_id += class_test_id

    if use_synthetic:
        train_valid_id += list(sim_instances[sim_instances['class_code'] == class_code].index.values + sim_id_offset)
        train_valid_id += list(drawn_instances[drawn_instances['class_code'] == class_code].index.values + drawn_id_offset)
    if use_augmentation:
        train_valid_id += list(add_instances[add_instances['class_code'] == class_code].index.values)

    train_id_class, valid_id_class = train_test_split(train_valid_id, test_size=valid_share)
    train_id += train_id_class
    valid_id += valid_id_class

    train_idx_dict[class_code] = train_id_class
    valid_idx_dict[class_code] = valid_id_class
    test_idx_dict[class_code] = class_test_id

if RF:
    # RF mode: validation instances are merged into the training set.
    train_id = sorted(train_id + valid_id)
    test_id = sorted(test_id)

    n_total = len(train_id) + len(test_id)
    print(f'Train ratio: {len(train_id) / n_total:.2f}')
    print(f'Test ratio: {len(test_id) / n_total:.2f}')
else:
    train_id = sorted(train_id)
    valid_id = sorted(valid_id)
    test_id = sorted(test_id)

    n_total = len(train_id) + len(valid_id) + len(test_id)
    print(f'Train ratio: {len(train_id) / n_total:.2f}')
    print(f'Validation ratio: {len(valid_id) / n_total:.2f}')
    print(f'Test ratio: {len(test_id) / n_total:.2f}')


# Build one frame per split, then scale them according to the flags:
#   by_instance   -> fit_scaler on each split (per-instance scaling, per the naming above)
#   integrate     -> one scaler built from the training split, applied to all splits
#   otherwise     -> a scaler built from the real training instances, plus separate
#                    scaler(s) for the synthetic training / validation instances
df_train = make_data(df, train_id)
df_valid = make_data(df, valid_id)
df_test = make_data(df, test_id)

if by_instance:
    df_train = fit_scaler(train_id, df_train, clean_vars)
    df_valid = fit_scaler(valid_id, df_valid, clean_vars)
    df_test = fit_scaler(test_id, df_test, clean_vars)

elif integrate:
    scaler_train = make_scaler(train_id, df_train, clean_vars)
    df_train = fit_scaler_old(train_id, df_train, clean_vars, scaler_train)
    df_valid = fit_scaler_old(valid_id, df_valid, clean_vars, scaler_train)
    df_test = fit_scaler_old(test_id, df_test, clean_vars, scaler_train)

else:
    # Ids below 1025 are the real instances.
    train_id = np.array(train_id)
    train_id_real = train_id[train_id < 1025]
    scaler_train_real = make_scaler(train_id_real, df_train, clean_vars)
    df_train = fit_scaler_old(train_id_real, df_train, clean_vars, scaler_train_real)
    # NOTE(review): valid_id / test_id are passed in full here, and the synthetic
    # validation ids are passed again below -- confirm fit_scaler_old does not
    # scale those rows twice.
    df_valid = fit_scaler_old(valid_id, df_valid, clean_vars, scaler_train_real)
    df_test = fit_scaler_old(test_id, df_test, clean_vars, scaler_train_real)

    if integrate_syn:
        # valid_id is a plain list, so it needs an array view for boolean masking.
        valid_id_arr = np.asarray(valid_id)
        train_id_syn = train_id[train_id >= 1025]
        valid_id_syn = valid_id_arr[valid_id_arr >= 1025]
        scaler_train_syn = make_scaler(train_id_syn, df_train, clean_vars)
        df_train = fit_scaler_old(train_id_syn, df_train, clean_vars, scaler_train_syn)
        # Validation rows are scaled in (and written back to) the validation frame.
        df_valid = fit_scaler_old(valid_id_syn, df_valid, clean_vars, scaler_train_syn)
    else:
        train_id_sim = set(df_train.loc[df_train['source']=='simulated', 'instance_id'])
        train_id_drawn = set(df_train.loc[df_train['source']=='drawn', 'instance_id'])
        valid_id_sim = set(df_valid.loc[df_valid['source']=='simulated', 'instance_id'])
        valid_id_drawn = set(df_valid.loc[df_valid['source']=='drawn', 'instance_id'])

        scaler_train_sim = make_scaler(train_id_sim, df_train, clean_vars)
        scaler_train_drawn = make_scaler(train_id_drawn, df_train, clean_vars)

        df_train = fit_scaler_old(train_id_sim, df_train, clean_vars, scaler_train_sim)
        df_train = fit_scaler_old(train_id_drawn, df_train, clean_vars, scaler_train_drawn)
        df_valid = fit_scaler_old(valid_id_sim, df_valid, clean_vars, scaler_train_sim)
        df_valid = fit_scaler_old(valid_id_drawn, df_valid, clean_vars, scaler_train_drawn)

Train ratio: 0.68
Validation ratio: 0.23
Test ratio: 0.09


In [9]:
# Keyword arguments common to the three ByInstanceDataset constructions.
shared_dataset_kwargs = dict(
    all_instances=new_instances,
    input_vars=clean_vars,
    stats=None,
    is_pretrain=True,
    scaler=None,
    already_scaled=True,
)

train_dataset = ByInstanceDataset(data=df_train, instance_id=train_id, **shared_dataset_kwargs)
valid_dataset = ByInstanceDataset(data=df_valid, instance_id=valid_id, **shared_dataset_kwargs)
test_dataset = ByInstanceDataset(data=df_test, instance_id=test_id, **shared_dataset_kwargs)

# One instance per batch; only the test loader keeps a fixed order.
dataloader_dict = {
    'Train': DataLoader(train_dataset, batch_size=1, shuffle=True),
    'Valid': DataLoader(valid_dataset, batch_size=1, shuffle=True),
    'Test': DataLoader(test_dataset, batch_size=1, shuffle=False),
}

In [None]:
# Pretrain the encoder + reconstruction decoder with an MSE reconstruction loss.
fix_seed()
model = LSTMClassifier(
    input_dim=param.input_dim,
    n_features=param.n_features,
    window_size=param.window_size,
    latent_dim=param.hidden_dim,
    device=param.device,
    num_layers=param.num_layers,
)
criterion = nn.MSELoss()

# Only the encoder and the reconstruction decoder are handed to the optimizer.
params_to_optimize = [
    {'params': part.parameters()}
    for part in (model.encoder, model.reconstruct_decoder)
]
optimizer = torch.optim.Adam(params_to_optimize, lr=param.learning_rate)
# Learning rate decays by a factor of 0.95 per scheduler step.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

model, logger = Pretraining(
    param, model, criterion, optimizer, scheduler, dataloader_dict,
    model_name=f'{name}_autoencoder',
)
draw_loss(logger, fname=f'{name}_autoencoder')

## 3. Data selection using LSTM Autoencoder

In [11]:
# Flags for the next split: real instances only, scaled per instance.
by_instance = True
use_synthetic = use_augmentation = False
integrate = integrate_syn = False

In [12]:
# Class-stratified split: the test set is drawn from real instances only; train and
# validation may additionally receive synthetic (simulated / drawn) and augmented
# instances of the same class, depending on use_synthetic / use_augmentation.
fix_seed()
train_id, valid_id, test_id = [], [], []
real_idx_dict = {}
train_idx_dict = {}
valid_idx_dict = {}
test_idx_dict = {}

# Offsets added to the row labels of sim_instances / drawn_instances to obtain their
# instance ids (the run logs 1025 real and 939 simulated instances).
# NOTE(review): hard-coded; must change if the dataset composition changes.
sim_id_offset = 1025
drawn_id_offset = 1964
# Share of the non-test instances that goes to validation.
valid_share = args.validation_ratio / (1 - args.test_ratio)

for class_code in set(real_instances['class_code']):
    real_idx_dict[class_code] = list(real_instances[real_instances['class_code'] == class_code].index.values)

for class_code in set(real_instances['class_code']):
    # 2023-08-11: Compatible with Simulated and Drawn dataset
    real_ids = real_idx_dict[class_code]
    n_test = ceil(args.test_ratio * len(real_ids))

    if n_test <= 1:
        # Very small class: hold out exactly one randomly chosen real instance.
        class_test_id = [int(real_ids.pop(np.random.randint(len(real_ids))))]
        # Copy, so the additions below do not leak into real_idx_dict.
        train_valid_id = list(real_ids)
    else:
        test_id_class = np.random.choice(real_ids, size=n_test, replace=False)
        class_test_id = list(test_id_class)
        train_valid_id = list(set(real_ids) - set(test_id_class))
    test_id += class_test_id

    if use_synthetic:
        train_valid_id += list(sim_instances[sim_instances['class_code'] == class_code].index.values + sim_id_offset)
        train_valid_id += list(drawn_instances[drawn_instances['class_code'] == class_code].index.values + drawn_id_offset)
    if use_augmentation:
        train_valid_id += list(add_instances[add_instances['class_code'] == class_code].index.values)

    train_id_class, valid_id_class = train_test_split(train_valid_id, test_size=valid_share)
    train_id += train_id_class
    valid_id += valid_id_class

    train_idx_dict[class_code] = train_id_class
    valid_idx_dict[class_code] = valid_id_class
    test_idx_dict[class_code] = class_test_id

if RF:
    # RF mode: validation instances are merged into the training set.
    train_id = sorted(train_id + valid_id)
    test_id = sorted(test_id)

    n_total = len(train_id) + len(test_id)
    print(f'Train ratio: {len(train_id) / n_total:.2f}')
    print(f'Test ratio: {len(test_id) / n_total:.2f}')
else:
    train_id = sorted(train_id)
    valid_id = sorted(valid_id)
    test_id = sorted(test_id)

    n_total = len(train_id) + len(valid_id) + len(test_id)
    print(f'Train ratio: {len(train_id) / n_total:.2f}')
    print(f'Validation ratio: {len(valid_id) / n_total:.2f}')
    print(f'Test ratio: {len(test_id) / n_total:.2f}')

Train ratio: 0.59
Validation ratio: 0.20
Test ratio: 0.20


In [13]:
# Keep the training ids below 1025 (the real instances) in a separate list;
# train_id itself is turned back into a list afterwards.
train_id_arr = np.array(train_id)
is_real = train_id_arr < 1025
train_real_only = list(train_id_arr[is_real])
train_id = list(train_id_arr)

Selection

In [14]:
# Per-class distance threshold used by the selection step: a synthetic instance is
# kept when its minimum distance to a real instance is below this value.
# A threshold of 0 therefore selects nothing (classes 0, 3 and 4);
# the classes with a non-zero threshold are [1,2,5,6,7,8].
crit_by_class = {0:0,
                 1:0.055,
                 2:0.1,
                 3:0,
                 4:0,
                 5:0.05,
                 6:0.3,
                 7:0.3,
                 8:0.01}
crit_by_class_id = {}

# The encoder is run on CPU for the selection step.
cpu_device = torch.device('cpu')
model = LSTMClassifier(input_dim=param.input_dim,
                       n_features=param.n_features,
                       window_size=1,
                       latent_dim=param.hidden_dim,
                       device=cpu_device,
                       num_layers=param.num_layers)
# map_location lets a checkpoint saved from a CUDA model load on a CPU-only machine.
model.load_state_dict(torch.load(f'./cache/{name}_autoencoder.pth', map_location=cpu_device))

<All keys matched successfully>

In [15]:
# For every class: encode real + synthetic/augmented instances with the pretrained
# encoder, project the latent vectors onto 3 principal components, and keep the
# synthetic/augmented instances whose nearest real instance is closer than the
# class threshold. Diagnostic figures are written to ./fig/.
fix_seed()
# df_total = pd.concat([df_train, df_valid, df_test])
r = real_instances.loc[train_real_only]

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = (8, 8)

# (file suffix, (x component, y component)) of the 2-D projections.
projections_2d = (('x', (0, 1)), ('y', (0, 2)), ('z', (2, 1)))

for class_num, crit in crit_by_class.items():
    real_idx = list(r[r['class_code'] == class_num].index.values)
    sim_idx = list(sim_instances[sim_instances['class_code'] == class_num].index.values + 1025)
    drawn_idx = list(drawn_instances[drawn_instances['class_code'] == class_num].index.values + 1964)
    n_real, n_sim, n_drawn = len(real_idx), len(sim_idx), len(drawn_idx)

    # Every non-real instance of this class (ids >= 1025): simulated, drawn, augmented.
    non_real = new_instances.loc[1025:]
    syn_idx_fin = np.array(non_real[non_real['class_code'] == class_num].index.values)

    tot_idx = np.append(real_idx, syn_idx_fin)
    df_total = make_data(df, tot_idx)
    df_total = fit_scaler(tot_idx, df_total, clean_vars)

    # NOTE(review): `instances` (not `new_instances`) and already_scaled=False are
    # passed here although tot_idx contains augmented ids and fit_scaler has just
    # been applied -- left unchanged, confirm this is intended.
    dataset = ByInstanceDataset(data=df_total, instance_id=tot_idx, all_instances=instances, is_binary=False,
                      input_vars=clean_vars, scaler=None, stats=None, is_pretrain=True, already_scaled=False)

    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

    # One flattened encoder output per instance, in dataloader order.
    lst = {}
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = batch.to(model.device)
            hidden_tmp = model.encoder(batch)[0]
            lst[idx] = hidden_tmp.detach().cpu().numpy().flatten()
    d = pd.DataFrame(lst).T
    pca = PCA(n_components=3)
    # fit_transform fits the model itself; no separate fit() call is needed.
    decomp_d = pca.fit_transform(d)

    # Rows: real instances, columns: non-real instances.
    dist = squareform(pdist(decomp_d))[:n_real, n_real:]
    mask = np.min(dist, axis=0) < crit
    selected_id = syn_idx_fin[mask]
    crit_by_class_id[class_num] = list(selected_id)

    # Row layout of decomp_d: real | simulated | drawn | augmented.
    decomp_d_real = decomp_d[:n_real]
    decomp_d_syn = decomp_d[n_real:]
    decomp_d_sim = decomp_d_syn[:n_sim]
    decomp_d_drawn = decomp_d_syn[n_sim:n_sim + n_drawn]
    decomp_d_aug = decomp_d_syn[n_sim + n_drawn:]
    decomp_d_select = decomp_d_syn[mask]

    # 3-D scatter of the three principal components.
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(decomp_d_select[:, 0], decomp_d_select[:, 1], decomp_d_select[:, 2], alpha=1, marker='*', s=100, c='green', label='Select')
    ax.scatter(decomp_d_sim[:, 0], decomp_d_sim[:, 1], decomp_d_sim[:, 2], alpha=1, c='k', label='Simulate')
    ax.scatter(decomp_d_drawn[:, 0], decomp_d_drawn[:, 1], decomp_d_drawn[:, 2], alpha=1, c='b', label='Drawn')
    ax.scatter(decomp_d_aug[:, 0], decomp_d_aug[:, 1], decomp_d_aug[:, 2], alpha=0.1, c='grey', label='Augmentation')
    ax.scatter(decomp_d_real[:, 0], decomp_d_real[:, 1], decomp_d_real[:, 2], alpha=1, c='r', label='Real')
    plt.title(f'Problem {class_num}')
    plt.legend()
    plt.savefig(f'./fig/problem_{class_num}_3d.png')
    plt.close()

    # 2-D projections onto pairs of principal components.
    for suffix, (cx, cy) in projections_2d:
        fig = plt.figure(figsize=(6, 6))
        plt.scatter(decomp_d_sim[:, cx], decomp_d_sim[:, cy], c='k', label='Simulate')
        plt.scatter(decomp_d_drawn[:, cx], decomp_d_drawn[:, cy], c='b', label='Drawn')
        plt.scatter(decomp_d_aug[:, cx], decomp_d_aug[:, cy], alpha=0.1, c='grey', label='Augmentation')
        plt.scatter(decomp_d_select[:, cx], decomp_d_select[:, cy], marker='*', c='green', s=130, label='Select')
        plt.scatter(decomp_d_real[:, cx], decomp_d_real[:, cy], c='r', label='Real')
        plt.title(f'Problem {class_num}')
        plt.legend()
        plt.savefig(f'./fig/problem_{class_num}_{suffix}.png')
        plt.close()

    # Scaled time series of the real and of the selected instances, one panel per
    # variable, with shared y-limits so both figures are comparable.
    max_v = df_total[clean_vars].max()
    min_v = df_total[clean_vars].min()
    for suffix, id_set in (('real', real_idx), ('selected', selected_id)):
        fig, ax = plt.subplots(len(clean_vars), 1)
        for inst_id in id_set:
            inst_rows = df_total.loc[df_total['instance_id'] == inst_id]
            for idx, var in enumerate(clean_vars):
                ax[idx].plot(range(len(inst_rows)), inst_rows[var])
                ax[idx].set_title(var)
                ax[idx].set_ylim([min_v[var], max_v[var]])
        plt.tight_layout()
        plt.savefig(f'./fig/problem_{class_num}_{suffix}.png')
        plt.close()


In [None]:
# Report how many synthetic instances were selected per class.
for class_key, selected in crit_by_class_id.items():
    print(f'Problem{class_key}: {len(selected)}')
# Nothing is added for class 0.
crit_by_class_id[0] = []

## 4. Train LSTM Classifier

In [17]:
param.num_epoch = 100
param.batch_size = 64
param.learning_rate = 0.003
by_instance = True
use_synthetic = False
use_augmentation = False
use_selected = True
integrate = False
integrate_syn = False
# Real_Only | With_Sim_Drawn | With_Sim_Drawn_Augmentation | Byinstance_Add_selected
# First "Each"  => separate scalers for real and synthetic data
# Second "Each" => within the synthetic data, separate scalers for simulated and drawn
# Byinstance    => every instance is scaled on its own
fix_seed()

# Share of the added instances that goes to validation.
valid_share = args.validation_ratio / (1 - args.test_ratio)

# Distribute the selected synthetic instances of every class over the existing
# train / validation id lists; the test ids are left as they are.
for class_code in set(real_instances['class_code']):
    selected = crit_by_class_id.get(class_code, []) if use_selected else []
    if not selected:
        continue

    if len(selected) < 2:
        # train_test_split cannot split a single sample; keep it for training.
        train_id_class, valid_id_class = list(selected), []
    else:
        train_id_class, valid_id_class = train_test_split(selected, test_size=valid_share)
    train_id += train_id_class
    valid_id += valid_id_class

    train_idx_dict[class_code] += train_id_class
    valid_idx_dict[class_code] += valid_id_class

if RF:
    # RF mode: validation instances are merged into the training set.
    train_id = sorted(train_id + valid_id)
    test_id = sorted(test_id)

    n_total = len(train_id) + len(test_id)
    print(f'Train ratio: {len(train_id) / n_total:.2f}')
    print(f'Test ratio: {len(test_id) / n_total:.2f}')
else:
    train_id = sorted(train_id)
    valid_id = sorted(valid_id)
    test_id = sorted(test_id)

    n_total = len(train_id) + len(valid_id) + len(test_id)
    print(f'Train ratio: {len(train_id) / n_total:.2f}')
    print(f'Validation ratio: {len(valid_id) / n_total:.2f}')
    print(f'Test ratio: {len(test_id) / n_total:.2f}')

Train ratio: 0.63
Validation ratio: 0.22
Test ratio: 0.16


In [18]:
# Build one frame per split, then scale them according to the flags:
#   by_instance   -> fit_scaler on each split (per-instance scaling, per the naming above)
#   integrate     -> one scaler built from the training split, applied to all splits
#   otherwise     -> a scaler built from the real training instances, plus separate
#                    scaler(s) for the synthetic training / validation instances
df_train = make_data(df, train_id)
df_valid = make_data(df, valid_id)
df_test = make_data(df, test_id)

if by_instance:
    df_train = fit_scaler(train_id, df_train, clean_vars)
    df_valid = fit_scaler(valid_id, df_valid, clean_vars)
    df_test = fit_scaler(test_id, df_test, clean_vars)

elif integrate:
    scaler_train = make_scaler(train_id, df_train, clean_vars)
    df_train = fit_scaler_old(train_id, df_train, clean_vars, scaler_train)
    df_valid = fit_scaler_old(valid_id, df_valid, clean_vars, scaler_train)
    df_test = fit_scaler_old(test_id, df_test, clean_vars, scaler_train)

else:
    # Ids below 1025 are the real instances.
    train_id = np.array(train_id)
    train_id_real = train_id[train_id < 1025]
    scaler_train_real = make_scaler(train_id_real, df_train, clean_vars)
    df_train = fit_scaler_old(train_id_real, df_train, clean_vars, scaler_train_real)
    # NOTE(review): valid_id / test_id are passed in full here, and the synthetic
    # validation ids are passed again below -- confirm fit_scaler_old does not
    # scale those rows twice.
    df_valid = fit_scaler_old(valid_id, df_valid, clean_vars, scaler_train_real)
    df_test = fit_scaler_old(test_id, df_test, clean_vars, scaler_train_real)

    if integrate_syn:
        # valid_id is a plain list, so it needs an array view for boolean masking.
        valid_id_arr = np.asarray(valid_id)
        train_id_syn = train_id[train_id >= 1025]
        valid_id_syn = valid_id_arr[valid_id_arr >= 1025]
        scaler_train_syn = make_scaler(train_id_syn, df_train, clean_vars)
        df_train = fit_scaler_old(train_id_syn, df_train, clean_vars, scaler_train_syn)
        # Validation rows are scaled in (and written back to) the validation frame.
        df_valid = fit_scaler_old(valid_id_syn, df_valid, clean_vars, scaler_train_syn)
    else:
        train_id_sim = set(df_train.loc[df_train['source']=='simulated', 'instance_id'])
        train_id_drawn = set(df_train.loc[df_train['source']=='drawn', 'instance_id'])
        valid_id_sim = set(df_valid.loc[df_valid['source']=='simulated', 'instance_id'])
        valid_id_drawn = set(df_valid.loc[df_valid['source']=='drawn', 'instance_id'])

        scaler_train_sim = make_scaler(train_id_sim, df_train, clean_vars)
        scaler_train_drawn = make_scaler(train_id_drawn, df_train, clean_vars)

        df_train = fit_scaler_old(train_id_sim, df_train, clean_vars, scaler_train_sim)
        df_train = fit_scaler_old(train_id_drawn, df_train, clean_vars, scaler_train_drawn)
        df_valid = fit_scaler_old(valid_id_sim, df_valid, clean_vars, scaler_train_sim)
        df_valid = fit_scaler_old(valid_id_drawn, df_valid, clean_vars, scaler_train_drawn)

In [19]:
# Put the per-class id lists of every split into ascending order (class codes 0..8).
for split_ids in (train_idx_dict, valid_idx_dict, test_idx_dict):
    for class_num in range(9):
        split_ids[class_num] = sorted(split_ids[class_num])

In [None]:
# Train one 2-output classifier per event class on sliding windows, starting from
# the pretrained autoencoder weights, then plot the prediction for every test
# instance of that class.

# Hand-picked (window_size, stride) that replace the automatically derived values.
window_overrides = {1: (15, 3), 2: (15, 3), 8: (4, 2)}
# y-axis label per plotted variable.
label = {'P-TPT':'Pressure, Pa',
         'P-MON-CKP': 'Pressure, Pa',
         'T-TPT': 'Temperature, \u00B0C'}

for class_num in [1,2,5,6,7,8]:
    # Mean and minimum number of rows per training instance of this class.
    lengths = [len(df_train[df_train['instance_id'] == i]) for i in train_idx_dict[class_num]]
    length = sum(lengths) / len(lengths)
    length_min = min(lengths)

    window_size = min(int(length/30), int(length_min/15))
    stride = max(int(window_size / 3), 1)
    if class_num in window_overrides:
        window_size, stride = window_overrides[class_num]

    train_dataset = SlidingWindowDataset(data=df_train, instance_id=train_idx_dict[class_num], window_size=window_size, stride=stride,
                                         input_vars=clean_vars, already_scaled=True)
    valid_dataset = SlidingWindowDataset(data=df_valid, instance_id=valid_idx_dict[class_num], window_size=window_size, stride=stride,
                                         input_vars=clean_vars, already_scaled=True)
    test_dataset = SlidingWindowDataset(data=df_test, instance_id=test_idx_dict[class_num], window_size=window_size, stride=stride,
                                        input_vars=clean_vars, already_scaled=True)
    # NOTE(review): the training loader is not shuffled -- confirm this is intended.
    dataloader_dict = {
        'Train': DataLoader(train_dataset, batch_size=param.batch_size, shuffle=False),
        'Valid': DataLoader(valid_dataset, batch_size=param.batch_size, shuffle=False),
        'Test': DataLoader(test_dataset, batch_size=param.batch_size, shuffle=False),
    }

    fix_seed()
    model = LSTMClassifier(input_dim=param.input_dim,
                           n_features=param.n_features,
                           window_size=window_size,
                           latent_dim=param.hidden_dim,
                           device=param.device,
                           num_layers=param.num_layers)

    # map_location makes the checkpoint loadable regardless of the device it was saved from.
    model.load_state_dict(torch.load(f'./cache/{name}_autoencoder.pth', map_location=param.device))
    # Replace the last classifier layer with a 2-output layer for the binary task.
    input_size = model.classifier.input_size
    model.classifier.fc[-1] = nn.Linear(int(input_size/4), 2)

    # The reconstruction decoder is frozen; encoder and classifier are trained.
    for p in model.reconstruct_decoder.parameters():
        p.requires_grad = False
    for p in model.encoder.parameters():
        p.requires_grad = True
    params_to_optimize = [{'params': model.encoder.parameters()}, {'params': model.classifier.parameters()}]

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(params_to_optimize, lr=param.learning_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.96)

    fix_seed()
    model, logger = Train_Classifier(param, model, criterion, optimizer, scheduler, dataloader_dict, model_name=f'{name}_binary_class{class_num}')

    draw_loss(logger, view='loss',fname=f'./Mission2/{name}_binary_class{class_num}_ws{window_size}_stride{stride}_loss')
    draw_loss(logger, view='acc',fname=f'./Mission2/{name}_binary_class{class_num}_ws{window_size}_stride{stride}_acc')
    # NOTE(review): this result is overwritten in the loop below; the call is kept
    # in case Inference_Classifier reports metrics as a side effect.
    prediction, real = Inference_Classifier(model, dataloader_dict['Test'], criterion=criterion)

    for inst in test_idx_dict[class_num]:
        tmp_dataset = SlidingWindowDataset(data=df_test, instance_id=[inst], window_size=window_size, stride=stride,
                                           input_vars=clean_vars, already_scaled=True)
        tmp_dataloader = DataLoader(tmp_dataset, batch_size=1, shuffle=False)
        prediction, real = Inference_Classifier(model, tmp_dataloader, criterion=criterion)

        # Class index with the highest score per window.
        pred_labels = torch.max(prediction, dim=1)[1].detach().cpu()
        true_labels = torch.max(real, dim=1)[1].detach().cpu()

        precision, recall, f1, _ = precision_recall_fscore_support(true_labels.numpy(), pred_labels.numpy(), average='micro')

        # Rows sampled at the same window_size offset / stride as the windows.
        df_tmp = df_test[df_test['instance_id']==inst].iloc[window_size::stride].reset_index(drop=True)
        df_tmp['timestamp'] = pd.to_datetime(df_tmp['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
        timestamps = df_tmp['timestamp']
        last = len(df_tmp) - 1

        # First window labelled / predicted as the event; None when there is none,
        # so an instance without any positive window no longer raises IndexError.
        true_pos = np.flatnonzero(true_labels.numpy() == 1)
        pred_pos = np.flatnonzero(pred_labels.numpy() == 1)
        event_start = timestamps[true_pos[0]] if true_pos.size else None
        pred_start = timestamps[pred_pos[0]] if pred_pos.size else None

        plt.rcParams['figure.figsize'] = (8, 8)
        fig, ax = plt.subplots(len(clean_vars), 1)

        # Unscaled values of the same rows, for plotting.
        df_tmp_realscale = df[df['instance_id']==inst].iloc[window_size::stride].reset_index(drop=True)
        for idx, var in enumerate(clean_vars):
            ax[idx].plot(timestamps, df_tmp_realscale.loc[:, var].reset_index(drop=True), c='k')
            ax[idx].set_title(var)
            if event_start is not None:
                ax[idx].axvline(event_start, c=[0, 0, 0.8], linestyle='--', label='Event Period')
            if pred_start is not None:
                ax[idx].axvline(pred_start, c=[0.8, 0, 0], linestyle='-', label='Pred Period')
            if event_start is not None or pred_start is not None:
                ax[idx].legend()

            if pred_start is not None:
                # Green before the first predicted event window, red from there on.
                ax[idx].axvspan(timestamps[0], pred_start, facecolor='green', alpha=0.2)
                ax[idx].axvspan(pred_start, timestamps[last], facecolor='r', alpha=0.2)
            else:
                # No event predicted: the whole span is shaded green.
                ax[idx].axvspan(timestamps[0], timestamps[last], facecolor='green', alpha=0.2)
            ax[idx].set_xlim([timestamps[0], timestamps[last]])
            ax[idx].set_ylabel(label.get(var, var))
            ax[idx].set_xlabel('Time, Minutes')
            ax[idx].xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %d\n%H:%M'))
        plt.tight_layout()

        plt.savefig(f'./fig/Mission2/{class_num}/{name}_ws{window_size}_stride{stride}_{inst}_f1_{np.average(f1):.3f}.png')
        plt.show()
        # Release the figure so figures do not pile up across instances.
        plt.close(fig)