In [1]:

import numpy as np
from jddb.file_repo import FileRepo
from jddb.processor import ShotSet
from util.basic_processor import SliceProcessor, FFTProcessor, find_tags, AlarmTag, get_machine_tags
from jddb.processor.basic_processors import ResamplingProcessor,TrimProcessor
import pandas as pd

# Data preprocessing and feature extraction

This notebook demonstrates how to access the provided data (three file repos, one per machine) with the JDDB library.

It shows how the 3 machines may use different names for the same (equivalent) diagnostics, and how to use `ITU data - signal.csv` to find common names for them.

It also shows how to use JDDB processors to extract new features like dominant frequency.

It also shows how to use JDDB processors to down sample the signals and clip only useful part of the signals and trim them to the same length.

The output of this notebook is a FileRepo of the processed data, in the same format as the provided data.

You do not have to follow this routine; you can process the data any way you like. This notebook just gives you some examples of how to read the data.

## choose common signal

### read signal csv file

This file holds the signal names for the 3 machines; signals in the same row are the same (equivalent) diagnostic.

if the machine lacks some diagnostics, it will be left blank.

The leftmost column is the common name for the diagnostic across the 3 machines.

In [2]:
# Load the signal lookup table; per the notes above, each row is one diagnostic
# and each machine has its own tag column.
signal_csv_path = 'ITU data - signal.csv'
df_signal = pd.read_csv(signal_csv_path)

 ### Choose only signals that are present on all 3 machines

 Check every signal: if it has a non-empty tag in the signal list of all 3 machines, it is included in `common_signal`.

In [3]:
# Collect the diagnostics that have a tag on EVERY machine in the table.
# All columns named '<machine> MDSplus Tag' are checked, so no machine is
# silently left out of the "common" test.
# NOTE(review): assumes each machine's tag column follows the
# '<machine> MDSplus Tag' naming pattern — confirm against the CSV header.
tag_columns = [column for column in df_signal.columns
               if str(column).endswith(' MDSplus Tag')]

# One vectorised mask instead of re-querying the frame for every row; rows
# with a blank common name are skipped rather than raising an IndexError.
has_tag_on_all_machines = df_signal[tag_columns].notna().all(axis=1)
has_common_name = df_signal['Diagnostics'].notna()
common_signal = df_signal.loc[has_tag_on_all_machines & has_common_name,
                              'Diagnostics'].tolist()


### Split common_signal into sub-lists according to whether a signal belongs to the Mirnov, SXR or AXUV arrays

In [4]:
# The three groups below are array diagnostics; signals from the same array can
# share one feature-extraction routine.
mir_name_list = find_tags('poloidal', common_signal) + find_tags('toroidal Mir', common_signal)
# SXR and AXUV are collected but not used in this example.
sxr_name_list = find_tags('soft', common_signal)
axuv_name_list = find_tags('AXUV', common_signal)

# Everything else is a non-array signal that is used as is, with no feature
# extraction. The list is built by filtering common_signal in its original
# order: a set difference would give an order that changes from one kernel
# run to the next, so the feature order would not be reproducible.
array_names = set(mir_name_list + sxr_name_list + axuv_name_list)
basic_name_list = list(dict.fromkeys(
    name for name in common_signal if name not in array_names))


### choose machine for processing and get machine tags by common name list

In this example a model is built for one machine only, so choose one machine and use its signal names to access its FileRepo.

In [5]:
# Machine whose file repo is processed in this notebook.
machine_name = 'J-TEXT'

# Look up this machine's tags for the basic and the Mirnov common-name lists.
basic_machine_tags, mir_machine_tags = (
    get_machine_tags(machine_name, name_list, df_signal)
    for name_list in (basic_name_list, mir_name_list)
)

## Create file repos for processed data and select shots meets the requirement

### Initialize the FileRepo to be processed and the FileRepo used for training. These two FileRepos must have different file paths.

In [6]:
# Path templates of the two repos; they must point to different locations.
source_repo_path = './/data//jtext//$shot_2$00//'
train_repo_path = './/data//jtext_data_train//$shot_2$00//'

# Provided original data (extract the data into the "data" folder first).
source_file_repo = FileRepo(source_repo_path)
# New repo that will hold the processed data.
train_file_repo = FileRepo(train_repo_path)
    

  ### Create a valid shot set: valid shots must contain the target tags and have enough flattop time.

In [7]:
# Wrap the source file repo in a ShotSet and take its shot_list, which the
# validity filter further down iterates over.
source_shotset = ShotSet(source_file_repo)
shot_list = source_shotset.shot_list



Define the target tags which should be contained 

In [8]:
# Tags a shot must provide to be usable: the basic signals plus the Mirnov
# signals, both under this machine's own tag names.
# NOTE(review): assumes both operands are lists so `+` concatenates — confirm.
targ_tags = basic_machine_tags + mir_machine_tags


 ### Get valid shot set
 Initialize an empty list to store valid shots.  
 Check if all target tags are present in the shot's tags and down_time of shot is greater than 0.2s -start time is ignored since they are all similar and matter little in this case-. 
 Create a new ShotSet object using the valid shots

In [9]:
# A shot is valid when it contains every target tag and its DownTime label is
# greater than MIN_DOWN_TIME (0.2 s, per the description above).
MIN_DOWN_TIME = 0.2
required_tags = set(targ_tags)  # set gives O(1) membership checks per tag

valid_shots = []
for shot in shot_list:
    # Check the tags first so the label read is skipped for shots that are
    # already ruled out.
    if not required_tags.issubset(source_shotset.get_shot(shot).tags):
        continue
    labels = source_file_repo.read_labels(shot, ['DownTime'])
    down_time = next(iter(labels.values()))
    if down_time > MIN_DOWN_TIME:
        valid_shots.append(shot)

# Shot set restricted to the shots that passed both checks.
valid_shotset = ShotSet(source_file_repo, valid_shots)

    

## extract features and create training data

### 1. FFT processing to extract dominant frequency and amplitude of mirnov signals

Caution! This step takes a very long time if you have a lot of data, and it uses a large amount of RAM, which may cause an out-of-memory (OOM) error.

Steps:
1. Get the Mirnov signal tags of the machine.
2. Slicing: use sliding windows to create the data for FFT processing (using JDDB processors).
3. FFT: extract the new features (using JDDB processors).


In [10]:
# Slice every Mirnov signal into overlapping windows, then run the FFT processor
# on the windows to produce the fft_amp_* / fft_fre_* tags.
tag_column = '{} MDSplus Tag'.format(machine_name)  # loop-invariant column name

# Chain every step from the previous result instead of restarting from
# valid_shotset on each iteration. Only the first step reads the source repo;
# later steps read and write train_file_repo, so features extracted for earlier
# Mirnov tags are still in the repo when the next tag is processed.
# NOTE(review): restarting from valid_shotset each time may rewrite the shot
# files in train_file_repo and drop earlier fft_* tags — confirm against JDDB's
# save behaviour.
processed_shotset = valid_shotset
for signal_name in mir_name_list:
    mir_tag = df_signal.loc[df_signal.Diagnostics == signal_name, tag_column].values[0]
    sliced_tag = "sliced_MA_{}".format(mir_tag)

    # Windowing: 250-sample windows with 0.9 overlap.
    processed_shotset = processed_shotset.process(
        processor=SliceProcessor(window_length=250, overlap=0.9),
        input_tags=[mir_tag],
        output_tags=[sliced_tag],
        save_repo=train_file_repo)

    # One input tag produces two output tags (amplitude and frequency).
    processed_shotset = processed_shotset.process(
        processor=FFTProcessor(),
        input_tags=[sliced_tag],
        output_tags=[["fft_amp_{}".format(mir_tag), "fft_fre_{}".format(mir_tag)]],
        save_repo=train_file_repo)


### 2. remove redundant tags and keep tags for model training

Remove the sliced signals.

extracted features and basics signals should be kept.
  
mir signals have been processed, they shouldn't be kept.

In [None]:
    
# Read the tag names from the first processed shot and pick out the FFT features.
shot_list = processed_shotset.shot_list
first_shot = processed_shotset.get_shot(shot_list[0])
all_tags = list(first_shot.tags)
fft_tag = find_tags('fft_', all_tags)

# Keep only the basic signals and the FFT features (keep=True); the sliced and
# raw Mirnov signals are dropped.
keep_tags = basic_machine_tags + fft_tag
processed_shotset = processed_shotset.remove_signal(
    tags=keep_tags,
    keep=True,
    save_repo=train_file_repo,
)


### 3. resample high frequency tags

down sample the signals to 1kHz

In [12]:

# Resample the FFT feature tags in place (same tag names in and out).
# 1000 is the value passed to ResamplingProcessor; the description above states
# the target is 1 kHz.
target_sample_rate = 1000
down_tags = fft_tag
resampler = ResamplingProcessor(target_sample_rate)
processed_shotset = processed_shotset.process(
    processor=resampler,
    input_tags=down_tags,
    output_tags=down_tags,
    save_repo=train_file_repo,
)


### 4. trim  signal
change the machine tags to common names  
keep common names

so you can use common names in training, the training part will be machine agnostic.

In [13]:
# Trim the signals and rename them from machine tags to common names in one step.
# The input list is built here from basic_machine_tags and fft_tag rather than
# taken from `keep_tags` left by an earlier cell, so this cell does not depend
# on the current value of a name that it rebinds below.
machine_tags = basic_machine_tags + fft_tag   # names currently in the repo
common_tags = basic_name_list + fft_tag       # machine-agnostic names, same order

# All inputs are passed as one group and written out under the common names.
processed_shotset = processed_shotset.process(
    TrimProcessor(),
    input_tags=[machine_tags],
    output_tags=[common_tags],
    save_repo=train_file_repo)

# Keep only the commonly named signals; the machine-tagged originals are removed.
keep_tags = list(common_tags)
processed_shotset = processed_shotset.remove_signal(tags=keep_tags, keep=True,
                                                    save_repo=train_file_repo)


### 5. add disruption labels for each time point as a signal called alarm_tag

The alarm tag is a 0/1 label for every time step: 0 for non-disruption-precursor samples, 1 for disruption-precursor samples. All samples within `lead_time` before a disruption are labeled 1.

In [14]:
# Add the per-time-step disruption label as a new signal called "alarm_tag",
# derived from the "plasma current" signal and the IsDisrupt / DownTime labels.
alarm_processor = AlarmTag(
    lead_time=0.1,
    disruption_label="IsDisrupt",
    downtime_label="DownTime",
)
processed_shotset = processed_shotset.process(
    processor=alarm_processor,
    input_tags=["plasma current"],
    output_tags=["alarm_tag"],
    save_repo=train_file_repo,
)
