In [1]:
from scipy.io import loadmat
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [6]:
def make_windows(matfile, window_size = 6000):
    """ This function creates a dataframe with discrete time windows from the raw MEA signals.
        Signals begin as .mat files

        Parameters
        ----------
        matfile: dict that is the output of loadmat() from scipy.io
                
        window_size: number of rows per window
                default = 6000
                    6 second windows. Chosen based on average cpm of GI slow waves
        
        Returns
        -------
        df: DataFrame with columns 'time', 'window_id' and electrode numbers
         
    """
    # retrieve the relevant data from the dictionaries
    # combine the MEA data into one data frame
    data = pd.DataFrame(matfile['filt_data'])
    df_ts = data.T
    
    # create id for each discrete window
    win = 1
    ids = pd.DataFrame(index=df_ts.index)
    for i in range(0,len(df_ts),window_size):
        ids.loc[i:i+window_size,'id'] = win
        win += 1
    
    df_ts.insert(0, 'id', ids)

    return df_ts

In [7]:
def label_MEA_data(filenames, window_size = 6000):
    """ This function creates a labelled dataset of MEA signals.
        The MEA data must be in .mat files. The dataset is discretised into time windows.

        Arguments
        ---------
        filenames: list of strings, file paths of the MEA .mat files
        
        window_size: int number of rows per window
            default = 6000
                6 second windows. Chosen based on average cpm of GI slow waves
                    
        cols: list of column names to include in the processed dataset
            for the MEA problem, this refers to the electrode numbers to include 
            time and window_id are always included
            default = 'all'
                includes every column
                    
        Returns
        -------
        dataset: DataFrame containing 'time', 'window_id', electrode readings
            numbered 0-59 and target variable 'y'
    """
    last_win = 0
    dataset = pd.DataFrame()
    
    print('Creating data set with discrete time windows of size: '+str(window_size))
    
    for file in filenames:
        matfile = loadmat(file)
        f_name = os.path.split(file)[1]
        f_name = f_name[:-4]
        
        df_subject = make_windows(matfile, window_size)
        df_subject['id'] += last_win
        df_subject['subject'] = f_name
        
        last_win = df_subject['id'].tail(1).values[0]
        print('Total Distinct Samples: '+str(last_win))
        
        # determine which label needs to be applied
        if f_name.endswith("0"):
            # baseline
            df_subject['y'] = 0
        elif f_name.endswith("1"):
            # Ach applied
            df_subject['y'] = 1
        elif f_name.endswith("at_2"):
            # AT applied after Ach
            df_subject['y'] = 2
        elif f_name.endswith("hex_2"):
            # Hex applied after Ach
            df_subject['y'] = 3
        
        dataset = dataset.append(df_subject)
        print('Subject Added: '+f_name)
            
    return dataset.reset_index(drop=True)

In [8]:
def load_MEA_data(folder = "data/raw/Ach-AT-Hex"):
    """ This function retrieves a list of file paths of MEA data

        Arguments
        ---------
        folder: specifies the root folder from where to load the data
                default = "data/raw/Ach-AT"
        
        Returns
        -------
        list of files in specified folder

    """
    d = folder
    filenames = []
    
    # get all the paths of the files to be loaded in
    for root, _unused, files in os.walk(d):
        for file in files:
            if file.endswith(".mat"):
                filenames.append(os.path.join(root, file))

    return sorted(filenames)

In [9]:
files = load_MEA_data(folder = "../data/raw/Ach-AT-Hex")

In [10]:
dataset = label_MEA_data(files)

Creating data set with discrete time windows of size: 6000
Total Distinct Samples: 30.0
Subject Added: 00_0315_ach-at_0
Total Distinct Samples: 60.0
Subject Added: 00_0315_ach-at_1
Total Distinct Samples: 90.0
Subject Added: 00_0315_ach-at_2
Total Distinct Samples: 120.0
Subject Added: 01_0126_ach-hex_0
Total Distinct Samples: 150.0
Subject Added: 01_0126_ach-hex_1
Total Distinct Samples: 180.0
Subject Added: 01_0126_ach-hex_2
Total Distinct Samples: 210.0
Subject Added: 02_0126_ach-hex_0
Total Distinct Samples: 240.0
Subject Added: 02_0126_ach-hex_1
Total Distinct Samples: 270.0
Subject Added: 02_0126_ach-hex_2
Total Distinct Samples: 300.0
Subject Added: 02_0315_ach-at_0
Total Distinct Samples: 330.0
Subject Added: 02_0315_ach-at_1
Total Distinct Samples: 360.0
Subject Added: 02_0315_ach-at_2
Total Distinct Samples: 390.0
Subject Added: 03_0126_ach-hex_0
Total Distinct Samples: 420.0
Subject Added: 03_0126_ach-hex_1
Total Distinct Samples: 450.0
Subject Added: 03_0126_ach-hex_2
Total

In [11]:
dataset.tail()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,52,53,54,55,56,57,58,59,subject,y
5939995,990.0,-3.658905,-4.346508,-1.331769,-2.762749,-2.302423,10.158801,-2.483325,-29.747597,0.009037,...,-4.231518,-4.574937,-4.408384,231.14787,-4.679772,-4.222581,-5.545412,0.438134,08_0201_ach-hex_2,3
5939996,990.0,-3.652134,-4.341225,-1.335025,-2.757062,-2.29689,10.164411,-2.48009,-29.759384,0.000961,...,-4.228152,-4.570826,-4.403699,230.810232,-4.674589,-4.217453,-5.539538,0.439285,08_0201_ach-hex_2,3
5939997,990.0,-3.645355,-4.335929,-1.338284,-2.75137,-2.291353,10.169998,-2.476847,-29.771261,-0.007258,...,-4.224779,-4.566707,-4.399005,230.472133,-4.669395,-4.212316,-5.533652,0.440442,08_0201_ach-hex_2,3
5939998,990.0,-3.638566,-4.330621,-1.341546,-2.745674,-2.285813,10.175562,-2.473599,-29.78323,-0.015619,...,-4.221398,-4.56258,-4.394302,230.133574,-4.664193,-4.207169,-5.527753,0.441607,08_0201_ach-hex_2,3
5939999,990.0,-3.631769,-4.325301,-1.344811,-2.739973,-2.280269,10.181104,-2.470344,-29.795288,-0.024123,...,-4.218009,-4.558446,-4.389591,229.794556,-4.65898,-4.202014,-5.521842,0.442779,08_0201_ach-hex_2,3


In [25]:
y4 = (dataset[['id','y']]
     .drop_duplicates('id')
     .set_index('id')
     .T
     .squeeze()
     .sort_index(0))

y3 = y4
y3[y3 > 2] = 2

y2 = y3
y2[y2 > 1] = 1

In [10]:
df = pd.read_hdf('../data/processed/ach_at_hex_6000.h5')

In [11]:
subject = (df[['id','subject']]
           .drop_duplicates('id')
           .set_index('id')
           .T
           .squeeze()
           .sort_index(0))

In [4]:
df.columns

Index([     'id',         0,         1,         2,         3,         4,
               5,         6,         7,         8,         9,        10,
              11,        12,        13,        14,        15,        16,
              17,        18,        19,        20,        21,        22,
              23,        24,        25,        26,        27,        28,
              29,        30,        31,        32,        33,        34,
              35,        36,        37,        38,        39,        40,
              41,        42,        43,        44,        45,        46,
              47,        48,        49,        50,        51,        52,
              53,        54,        55,        56,        57,        58,
              59, 'subject',       'y'],
      dtype='object')

In [20]:
subject.to_hdf('../data/processed/subject_6000.h5', key='data', complevel=9)

In [19]:
subject = subject.str.slice(stop=-2)

In [21]:
sub4 = pd.read_hdf('../data/processed/subject_4000.h5')

In [23]:
sub4 = sub4.str.slice(stop=-2)

In [24]:
sub4.to_hdf('../data/processed/subject_4000.h5', key='data', complevel=9)

In [None]:
df = pd.read_hdf('../data/processed/ach_at_hex_10000.h5')
subject = (df[['id','subject']]
           .drop_duplicates('id')
           .set_index('id')
           .T
           .squeeze()
           .sort_index(0))
subject = subject.str.slice(stop=-2)
subject.to_hdf('../data/processed/subject_10000.h5', key='data', complevel=9)