# Creating a database of scalar values for ICRH coupling analysis

In [1]:
# assume working in Jupyter Lab
%matplotlib inline 

%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('C:\\Users\\JH218595\\Documents\\pywed')
sys.path.append('C:\\Users\\JH218595\\Documents\\IRFMtb')

In [3]:
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = (10,6)

from control_room import *
from pulse_database import PulseDB
from tqdm.notebook import tqdm

import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')

IMAS only available on linux machines


The database has been created in another notebook. Importing database : 

In [4]:
dt = 0.050 # s

hdf5_filename = 'databases/WEST_C4_ICRH_pulse_data.hdf5'
file_output = 'WEST_C4_database_resumed_parameters.csv'

db = PulseDB(hdf5_filename)
print(f'Database contains {len(db.pulse_list)} shots, from #{db.pulse_list[0]} to #{db.pulse_list[-1]} ')

Database contains 391 shots, from #54404 to #55809 


## Creating a meaningfull database with pandas
The idea is to split time in small pieces and to calculate scalar values for each of them

In [5]:
def split_in_pieces(y, t, nb_pieces):
    """Split a time signel y(t) into smaller piece of length dt, and return t, average, min, max and std of each of them"""
    y_mean_min_max, t_pieces = [], []
    if nb_pieces > 0: 
        ts = np.array_split(np.squeeze(t), nb_pieces)
        ys = np.array_split(np.squeeze(y), nb_pieces)
        for (_y, _t) in zip(ys, ts):
            t_pieces.append(np.mean(_t))
            y_mean_min_max.append(mean_min_max(_y))
        return np.array(y_mean_min_max), np.array(t_pieces)    
    else:
        return np.array([np.nan, np.nan, np.nan]), np.array([np.nan])
    

In [6]:
data = pd.DataFrame()


for pulse in tqdm(db.pulse_list):
    # start as ip > 100 kA
    ip, t_ip = db.get_signal(pulse, 'Ip')
    
    t_start = t_ip[(ip > 100).squeeze()][0]
    t_end = t_ip[(ip > 100).squeeze()][-1]

    nb_pieces = int(np.round((t_end - t_start)/dt))
    
    rows = {'pulse': pulse}
    for signame in db.list_signal(pulse):           
        try:
            y, t = db.get_signal(pulse, signame)
            _y, _t = in_between(y, t, t_start, t_end)
            ys, ts = split_in_pieces(_y, _t, nb_pieces)

        except IndexError as e:
            y, t = db.get_signal(pulse, signame)
            # deals with resumed data (like IC frequencies):
            # replicate the data for the number of pieces
            if y.ndim == 1:
                ys = np.tile(y, (int(nb_pieces),1))
                
        except ValueError as e:
            ys = np.zeros(nb_pieces)
        
        # add data points into the Dataframe
        if signame == 'IC_Frequencies':
            rows['freq_Q1'] = ys[:,0]
            rows['freq_Q2'] = ys[:,1]
            rows['freq_Q4'] = ys[:,2]
        if signame == 'IC_Positions':
            rows['R_Q1'] = ys[:,0]
            rows['R_Q2'] = ys[:,1]
            rows['R_Q4'] = ys[:,2]
        if signame == 'Datetime':
            rows['year'] = ys[:,0]
            rows['month']= ys[:,1]
            rows['day']  = ys[:,2]
        else:
            
            if ys.ndim > 1:  # if to deal with the case of zeros (ValueError previously)
                ys = ys[:,0]  # keep only mean (yet)
            rows[signame] = np.squeeze(ys)
            
        _df = pd.DataFrame(rows)
    # append data from each pulse to the final DF
    data = data.append(_df)

HBox(children=(FloatProgress(value=0.0, max=391.0), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,





Exporting the Dataframe:

In [7]:
data.to_csv('WEST_C4_database_resumed_parameters.csv')