# Creating a Database of Scalar Values for WEST C5 Experimental Campaign 

In [1]:
# assume working in Jupyter Lab
# Render matplotlib figures inline in the notebook output
%matplotlib inline 

# Load the autoreload extension and enable mode 2, so edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# WEST libraries
#sys.path.append('C:\\Users\\JH218595\\Documents\\pywed')
#sys.path.append('C:\\Users\\JH218595\\Documents\\IRFMtb')
#sys.path.append('C:\\Users\\JH218595\\Documents\\PPPAT')

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = (10,6)
try:
    from tqdm.notebook import tqdm
except ImportError as e:
    from tqdm import tqdm

try:
    from pppat.control_room.signals import *
except ImportError as e:
    sys.path.append('../../PPPAT/')
    from pppat.control_room.signals import *

from pulse_database import PulseDB

IMAS only available on linux machines


The pulse database was created in another notebook; here we only load it:

In [4]:
# Target duration of each time piece; the number of pieces per pulse is
# the plasma duration divided by dt (rounded)
dt = 0.10 # s

# Input: HDF5 file passed to PulseDB
hdf5_filename = 'databases/WEST_C5_pulse_data.hdf5'
# Output: CSV file the final DataFrame is written to
file_output = 'WEST_C5_database_resumed_parameters.csv'

In [5]:
# Open the pulse database and report the number of pulses and the first/last pulse numbers
db = PulseDB(hdf5_filename)
# NOTE(review): indexing pulse_list[0] / pulse_list[-1] raises if the database is empty
print(f'Database contains {len(db.pulse_list)} shots, from #{db.pulse_list[0]} to #{db.pulse_list[-1]} ')

Database contains 172 shots, from #56287 to #56672 


## Creating a meaningful database with pandas
The idea is to split the plasma duration of each pulse into short time pieces of approximately `dt` seconds and to compute scalar values (mean, min and max) of every signal for each piece.

In [6]:
def split_in_pieces(y, t, nb_pieces, tolerance=0.3):
    """Split a time signal y(t) into `nb_pieces` consecutive pieces and reduce each one to scalars.

    For every piece the mean, min and max of `y` are computed with `mean_min_max`
    (external helper, assumed to return the tuple ``(mean, min, max)``).
    A piece is kept when it is constant (mean == min == max) or when both its
    min and max lie within ``tolerance * |mean|`` of the mean; otherwise the
    piece is considered too unsteady and is replaced by NaN.

    Parameters
    ----------
    y : array-like
        Signal values (squeezed before splitting).
    t : array-like
        Time samples associated with `y` (squeezed before splitting).
    nb_pieces : int
        Number of pieces to split the signal into.
    tolerance : float, optional
        Maximum relative deviation of min and max from the mean for a piece
        to be kept. Defaults to 0.3 (30 %).

    Returns
    -------
    y_mean_min_max : numpy.ndarray
        Array of shape (nb_pieces, 3) holding [mean, min, max] per piece
        (NaN for rejected pieces). If ``nb_pieces <= 0``, a 1-D array of
        three NaN is returned instead.
    t_pieces : numpy.ndarray
        Mean time of each piece (NaN for rejected pieces). If
        ``nb_pieces <= 0``, a single-element NaN array.
    """
    if nb_pieces <= 0:
        return np.array([np.nan, np.nan, np.nan]), np.array([np.nan])

    y_mean_min_max, t_pieces = [], []
    ts = np.array_split(np.squeeze(t), nb_pieces)
    ys = np.array_split(np.squeeze(y), nb_pieces)
    for _y, _t in zip(ys, ts):
        _mean, _mini, _maxi = mean_min_max(_y)

        if _mean == _mini == _maxi:
            # constant piece (e.g. no data): nothing to filter
            keep = True
        else:
            # Compare against |mean| so that negative signals are filtered
            # too (dividing by a negative mean made the test always pass),
            # and multiply instead of dividing to avoid a division by zero.
            max_deviation = tolerance * np.abs(_mean)
            keep = (np.abs(_mean - _mini) < max_deviation) and (np.abs(_mean - _maxi) < max_deviation)

        if keep:
            y_mean_min_max.append([_mean, _mini, _maxi])
            t_pieces.append(np.mean(_t))
        else:
            y_mean_min_max.append([np.nan, np.nan, np.nan])
            t_pieces.append(np.nan)
    return np.array(y_mean_min_max), np.array(t_pieces)
    

In [14]:
# Plasma-current threshold delimiting the analysed time window of each pulse.
# Same units as the stored 'Ip' signal (presumably MA -- TODO confirm).
IP_THRESHOLD = 0.3

# Signals whose name contains one of these substrings are smoothed before averaging
NOISY_SIGNALS = ['Cu', 'Ag18', 'Ag19', 'Langmuir', 'Prad', 'Prad_bulk',
                 'LH_P_tot', 'IC_P_tot', 'IC_P_Q1', 'IC_P_Q2', 'IC_P_Q4']

# Signals carrying several values per time piece: each value is also exported
# in a dedicated column (column k of the data -> k-th name)
SPLIT_COLUMNS = {
    'IC_Frequencies': ('freq_Q1', 'freq_Q2', 'freq_Q4'),
    'LH_Positions': ('R_LH1', 'R_LH2'),
    'IC_Positions': ('R_Q1', 'R_Q2', 'R_Q4'),
    'Datetime': ('year', 'month', 'day'),
}

# One DataFrame per pulse, concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop)
pulse_frames = []

for pulse in tqdm(db.pulse_list):
    # The analysed window spans the first to the last sample where Ip > IP_THRESHOLD
    ip, t_ip = db.get_signal(pulse, 'Ip')
    t_plasma = t_ip[(ip > IP_THRESHOLD).squeeze()]
    if len(t_plasma) == 0:
        print(f'Pulse #{pulse} skipped: Ip never exceeds {IP_THRESHOLD}')
        continue
    t_start, t_end = t_plasma[0], t_plasma[-1]

    nb_pieces = int(np.round((t_end - t_start)/dt))
    if nb_pieces < 1:
        print(f'Pulse #{pulse} skipped: plasma shorter than one time piece')
        continue

    rows = {'pulse': pulse}
    for signame in db.list_signal(pulse):
        # Reset so the IndexError handler below never sees the previous signal's data
        y = np.array([])
        try:
            y, t = db.get_signal(pulse, signame)
            # Smooth some noisy signals
            if any(sig in signame for sig in NOISY_SIGNALS):
                y = smooth(y, window_length=21)

            # splitting signals in pieces
            _y, _t = in_between(y, t, t_start, t_end)
            ys, ts = split_in_pieces(_y, _t, nb_pieces)

            # Taking reference time from the plasma current
            if signame == 'Ip':
                rows['time'] = np.atleast_1d(np.squeeze(ts))

        except IndexError:
            # deals with resumed data (like IC frequencies):
            # replicate the data for the number of pieces
            if y.ndim == 1 and len(y) > 1:
                ys = np.tile(y, (nb_pieces, 1))
            else:
                # fill with 0 if empty arrays
                ys = np.zeros(nb_pieces)

        except ValueError:
            ys = np.zeros(nb_pieces)

        # add data points into the DataFrame
        try:
            for k, column in enumerate(SPLIT_COLUMNS.get(signame, ())):
                rows[column] = ys[:, k]
        except IndexError:
            # The fallback data has no such column (e.g. 1-D zeros): skip this
            # signal for this pulse instead of aborting the whole loop
            continue

        if ys.ndim > 1:  # 1-D only for the zeros fallback
            ys = ys[:, 0]  # keep only mean (yet)
        # atleast_1d: a single-piece pulse would otherwise squeeze to a 0-d
        # array, which pandas treats as a scalar
        rows[signame] = np.atleast_1d(np.squeeze(ys))

    # Build the pulse DataFrame once, and only if at least one signal was stored
    if len(rows) > 1:
        pulse_frames.append(pd.DataFrame(rows))

# Concatenate all pulses; columns are sorted alphabetically, as
# DataFrame.append did when the column sets were not aligned
if pulse_frames:
    data = pd.concat(pulse_frames, sort=False).sort_index(axis=1)
else:
    data = pd.DataFrame()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  app.launch_new_instance()
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
100%|██████████| 172/172 [00:42<00:00,  3.61it/s]


Exporting the DataFrame:

In [15]:
# List the columns of the resulting DataFrame
data.columns

Index(['Ag18', 'Ag19', 'Cu', 'Datetime', 'Fe', 'IC_Frequencies', 'IC_P_Q1',
       'IC_P_Q2', 'IC_P_Q4', 'IC_P_tot', 'IC_Positions', 'IC_Rc_Q1_avg',
       'IC_Rc_Q2_avg', 'IC_Rc_Q4_avg', 'IC_Voltage_left_lower_Q1',
       'IC_Voltage_left_lower_Q2', 'IC_Voltage_left_lower_Q4',
       'IC_Voltage_left_upper_Q1', 'IC_Voltage_left_upper_Q2',
       'IC_Voltage_left_upper_Q4', 'IC_Voltage_right_lower_Q1',
       'IC_Voltage_right_lower_Q2', 'IC_Voltage_right_lower_Q4',
       'IC_Voltage_right_upper_Q1', 'IC_Voltage_right_upper_Q2',
       'IC_Voltage_right_upper_Q4', 'Ip', 'Isotopic Ratio INBUM04',
       'Isotopic Ratio LODIVIN19', 'Isotopic Ratio LODIVOU15', 'LH_P_tot',
       'LH_Positions', 'MHD', 'Ohmic_P', 'Prad', 'R_LH1', 'R_LH2', 'R_Q1',
       'R_Q2', 'R_Q4', 'Rext_median', 'Separatrix_P', 'day', 'freq_Q1',
       'freq_Q2', 'freq_Q4', 'month', 'nl', 'pulse', 'time', 'year'],
      dtype='object')

In [16]:
# Write the DataFrame (index included) to the CSV file named by file_output
data.to_csv(file_output)

In [17]:
# Display the first rows of the DataFrame as a quick sanity check
data.head()

Unnamed: 0,Ag18,Ag19,Cu,Datetime,Fe,IC_Frequencies,IC_P_Q1,IC_P_Q2,IC_P_Q4,IC_P_tot,...,Separatrix_P,day,freq_Q1,freq_Q2,freq_Q4,month,nl,pulse,time,year
0,1.5407,,3.959761,2020,,55.299999,0.0,0.0,0.0001,0.0,...,0.0,10,55.299999,55.799999,55.799999,12,1.435414,56287,0.381975,2020
1,,1.033671,3.550893,2020,,55.299999,0.0,0.0,0.0001,0.0,...,0.0,10,55.299999,55.799999,55.799999,12,1.545745,56287,0.484375,2020
2,,1.235728,,2020,,55.299999,0.0,0.0,0.0001,0.0,...,0.0,10,55.299999,55.799999,55.799999,12,1.573494,56287,0.586775,2020
3,,1.143154,,2020,,55.299999,0.0,0.0,0.0001,0.0,...,0.0,10,55.299999,55.799999,55.799999,12,1.567759,56287,0.689175,2020
4,1.0,1.0,2.425662,2020,,55.299999,0.0,0.0,0.0001,0.0,...,0.0,10,55.299999,55.799999,55.799999,12,1.566279,56287,0.791575,2020
