# Import packages

In [1]:
import numpy as np
import pandas as pd
import scipy.io # read matlab file

In [2]:
# Paths to the Matlab (.mat) simulation files.
FILE_PATH_SHORT = "input/short_simulation.mat"
FILE_PATH_LONG = "input/long_simulation.mat"

# Description of Simulation

## Independent variables

The file is a `.mat` file containing a struct with **8** fields. The independent variables are:

* **snr** (signal to noise ratio), 
* **con** (presence or not of the connection between the two sources), 
* **len** (length of the simulated signal), 
* **distance_sources** (distance between the two sources). 
*  **depth**: we can either keep the two depths separately, as "depth_1" and "depth_2" (the distances of the first and of the second source from the middle of the brain), or use their mean value, "depth_mean". We take either the first two or the third one; if we take all three values, we drop the independence assumption.

> For each experiment, all factor values are **randomly chosen.**

## Dependent variables
The dependent variables are collected in the field **"Results"**, which contains two matrices, **FPR** and **FNR**. Their dimensions are **4x1000**: each row is a different result, since for each simulation (experiment) we have two source localization approaches and two different connectivity estimates (row 1 is method 1, estimate 1; row 2 is method 1, estimate 2; row 3 is method 2, estimate 1; row 4 is method 2, estimate 2).

**Columns are different experiments.**  The two different source localization approaches, and the two methods used to evaluate connectivity, are to be considered as factors. 

# Structure of `Matlab` file

The simulation was performed in Matlab on the High Performance Computer of Ghent University. The Matlab file consists of two parts:
* **First part**: the independent variables of the simulation.
* **Second part**: the dependent variable. Here we have 4 results per simulation, which come from two binary variables (the two different algorithms used at each of the two steps).

# List of the functions

In [3]:
def _read_matlab_file(file_name):
    '''Load a Matlab `.mat` file and return its `Simulation` variable.

    The names of all variables found in the file are printed first. The value
    returned is whatever `scipy.io.loadmat` stored under the key
    `'Simulation'`; no further conversion is done here.
    '''
    mat_contents = scipy.io.loadmat(file_name)

    # show which variables the file holds
    variable_names = list(mat_contents.keys())
    print(variable_names)

    # only the 'Simulation' variable is used downstream
    simulation_struct = mat_contents['Simulation']

    # NOTE(review): the message mentions a DataFrame, but the object returned
    # below is the loaded array, not a DataFrame.
    print('Matlab file is converted to DataFrame')

    return simulation_struct

In [4]:
def _get_simulation_parameters(eeg_array):
    '''Collect the independent variables into a DataFrame, one row per simulation.

    The fields `len`, `distance_sources`, `con`, `snr`, `depth_1` and
    `depth_2` are read from `eeg_array` (each one via `[0][0][0]`) and stacked
    as columns behind a consecutive `id` column that starts at 0. Note that
    the `distance_sources` field ends up in a column named `distance_source`.
    '''
    field_names = ('len', 'distance_sources', 'con', 'snr', 'depth_1', 'depth_2')
    field_values = [eeg_array[name][0][0][0] for name in field_names]

    # the number of simulations is taken from the length of the 'len' field
    n_simulations = field_values[0].shape[0]
    ids = list(range(n_simulations))

    simulation = pd.DataFrame(
        np.column_stack([ids] + field_values),
        columns=['id', 'len', 'distance_source', 'con', 'snr', 'depth_1', 'depth_2'],
    )
    print('Independent variables have been read : ', simulation.shape)

    return simulation

In [24]:
def _get_simulation_result(file_name, result_type='fpr'):
    '''Read the Matlab file and return one row per (simulation, algorithm pair).

    The `Results` field holds, for each simulation, 4 target values: one for
    every combination of localization source and connectivity estimate.
    Shown with one simulation per line, for example:

    0 | 0.5 | 0 | 0
    0 | 1   | 1 | 1

    This function turns that matrix into a single target column, with the
    algorithm pair encoded as two dummy variables:

    localization_source | connectivity_estimate | y
    0 | 0 | 0
    0 | 1 | 0.5
    1 | 0 | 0
    1 | 1 | 0
    0 | 0 | 0
    0 | 1 | 1
    1 | 0 | 1
    1 | 1 | 1

    The independent variables of each simulation are repeated on all 4 of its
    rows.

    Parameters
    ----------
    file_name : path of the `.mat` file to read.
    result_type : `'fpr'` selects the first matrix stored in `Results`
        (false positive rate); any other value selects the second one
        (false negative rate).

    Returns
    -------
    pandas.DataFrame with the simulation parameter columns followed by
    `localization_source`, `connectivity_estimate` and `y`.

    Raises
    ------
    ValueError
        If the selected result matrix is not 4 x (number of simulations).
    '''
    eeg_array = _read_matlab_file(file_name)
    simulation = _get_simulation_parameters(eeg_array)
    number_of_simulation = simulation.shape[0]

    # Results stores the false positive rate first and the false negative
    # rate second; both are selected by position.
    results = eeg_array['Results'][0][0][0][0]
    rate_position = 0 if result_type == 'fpr' else 1

    # stored as 4 x N; transpose so that each row is one simulation
    false_rate = np.transpose(results[rate_position])

    # Guard against a matrix with another orientation: flattening it would
    # still give 4 * N values, but attached to the wrong rows.
    if false_rate.shape != (number_of_simulation, 4):
        raise ValueError(
            'Expected a 4 x {} result matrix, got {} x {}'.format(
                number_of_simulation, false_rate.shape[1], false_rate.shape[0]))

    # repeat every simulation 4 times, keeping the copies adjacent
    # (rows 0-3 belong to simulation 0, rows 4-7 to simulation 1, ...)
    row_positions = np.repeat(np.arange(number_of_simulation), 4)
    replicated_simulation = simulation.iloc[row_positions].reset_index(drop=True)

    # dummy encoding of the 2 by 2 table, repeated for every simulation:
    # localization_source | connectivity_estimate
    # 0 | 0
    # 0 | 1
    # 1 | 0
    # 1 | 1
    algorithm_codes = np.tile([[0, 0], [0, 1], [1, 0], [1, 1]],
                              (number_of_simulation, 1))

    # row-major flattening keeps the 4 values of a simulation together, in
    # the same order as the dummy codes above
    df_fr = pd.DataFrame(
        np.column_stack([algorithm_codes, false_rate.reshape(-1)]),
        columns=['localization_source', 'connectivity_estimate', 'y'])

    # both frames share the index 0 .. 4N-1, so this is a plain side-by-side join
    everything = pd.concat([replicated_simulation, df_fr], axis=1)

    print('Simulation Result is read')

    return everything

In [6]:
def _save_simulation(df, place_to_save):
    '''Write `df` to `place_to_save` in CSV format, leaving out the index.'''
    df.to_csv(path_or_buf=place_to_save, index=False)

In [7]:
def read_and_save_simulation(file_name, save=None, result_type='fpr'):
    '''Read a simulation file into a DataFrame and optionally save it as csv.

    Parameters
    ----------
    file_name : path of the Matlab simulation file.
    save : path of the csv file to write; nothing is written when `None`.
    result_type : forwarded to `_get_simulation_result`. The default `'fpr'`
        keeps the previous behaviour; any other value selects the second
        result matrix (false negative rate).

    Returns
    -------
    The DataFrame built by `_get_simulation_result`.
    '''
    everything = _get_simulation_result(file_name, result_type=result_type)

    # writing to disk is opt-in
    if save is not None:
        _save_simulation(everything, save)

    return everything

# Convert Matlab file into Pandas Data frame

In [None]:
# Read the long simulation file and also write the resulting data frame to csv.
simulation = read_and_save_simulation(FILE_PATH_LONG, 'input/long_simulation.csv')

## Basic Statistics about data set

In [28]:
# (number of rows, number of columns) of the combined data frame
simulation.shape

(20000000, 10)

In [29]:
# preview the first 5 rows (the default number returned by `head()`)
simulation.head()

Unnamed: 0,id,len,distance_source,con,snr,depth_1,depth_2,localization_source,connectivity_estimate,y
0,0.0,1179.0,89.452006,1.0,0.657092,93.914788,46.159509,0.0,0.0,1.0
1,0.0,1179.0,89.452006,1.0,0.657092,93.914788,46.159509,0.0,1.0,1.0
2,0.0,1179.0,89.452006,1.0,0.657092,93.914788,46.159509,1.0,0.0,1.0
3,0.0,1179.0,89.452006,1.0,0.657092,93.914788,46.159509,1.0,1.0,1.0
4,1.0,1214.0,61.476372,0.0,0.806123,65.795515,7.270739,0.0,0.0,0.5


In [27]:
# descriptive statistics about features, transposed so that each feature is a row
simulation.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20000000.0,2500000.0,1443376.0,0.0,1250000.0,2500000.0,3749999.0,4999999.0
len,20000000.0,1100.042,519.7037,200.0,650.0,1100.0,1550.0,2000.0
distance_source,20000000.0,83.55302,31.99951,1.064023,59.9234,83.13348,106.9384,181.3417
con,20000000.0,0.5000292,0.5,0.0,0.0,1.0,1.0,1.0
snr,20000000.0,0.7499424,0.1443817,0.5,0.6248526,0.7499275,0.875037,1.0
depth_1,20000000.0,64.07928,20.56746,5.967457,50.91088,65.92941,79.20287,107.0131
depth_2,20000000.0,64.0788,20.56036,5.967457,50.91088,65.92941,79.20287,107.0131
localization_source,20000000.0,0.5,0.5,0.0,0.0,0.5,1.0,1.0
connectivity_estimate,20000000.0,0.5,0.5,0.0,0.0,0.5,1.0,1.0
y,20000000.0,0.4102542,0.4723381,0.0,0.0,0.0,1.0,1.0


In [26]:
# number of missing (null) values per column
simulation.isnull().sum()

id                       0
len                      0
distance_source          0
con                      0
snr                      0
depth_1                  0
depth_2                  0
localization_source      0
connectivity_estimate    0
y                        0
dtype: int64