In [2]:
from scipy.io import loadmat
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Load one example recording; loadmat returns a dict, and the exploratory
# cells below read its 'filt_data' entry.
matfile = loadmat("data/raw/Ach-AT/04_0316_ach-at_0.mat")

In [3]:
# Wrap the 'filt_data' array in a DataFrame and transpose it in one step;
# the windowing cell below then reads one electrode per column.
df = pd.DataFrame(matfile['filt_data']).T
# Empty frame that will collect the windowed rows.
df_2 = pd.DataFrame()
# Preview the first rows of the transposed recording.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,7.400816,-5.906791,1.142726,0.191898,69.562051,9.012236,-1.880871,8.45731,8.753557,9.518978,...,10.667,21.348741,24.569333,28.577441,-6.224571,-12.086969,13.948367,30.062192,16.205018,-2.557572
1,7.177488,-6.135068,0.915841,-0.03506,70.621536,8.793167,-2.098095,8.270087,8.722532,9.349439,...,10.660654,21.315009,24.523723,28.519099,-6.261806,-12.145811,13.811402,29.889178,16.011065,-2.787902
2,6.954183,-6.363301,0.68898,-0.262074,71.684292,8.574064,-2.315308,8.082819,8.691471,9.179862,...,10.654873,21.281652,24.478398,28.461005,-6.299013,-12.204617,13.674545,29.716228,15.817129,-3.018183
3,6.730903,-6.591484,0.462145,-0.489139,72.750299,8.35493,-2.532505,7.895511,8.660374,9.01025,...,10.649656,21.24867,24.433359,28.403161,-6.336192,-12.263387,13.537799,29.543346,15.623212,-3.248411
4,6.507654,-6.819615,0.235341,-0.716252,73.819535,8.135771,-2.749683,7.708165,8.629241,8.840605,...,10.645005,21.216063,24.388607,28.345568,-6.373342,-12.322119,13.401168,29.370533,15.429319,-3.478581


In [4]:
# Cut every electrode trace into consecutive fixed-length windows.
# One output row = one window: WINDOW_SIZE voltage measurements (6s worth),
# followed by the electrode number and the 1-based window number.
N_ELECTRODES = 60      # columns of df that are processed
N_SAMPLES = 180000     # samples of each trace that are windowed
WINDOW_SIZE = 6000     # samples per window

# NOTE(review): assumes every trace holds at least N_SAMPLES samples; shorter
# traces yield short/empty windows that concat pads with NaN - TODO confirm.
window_rows = []
for j in range(N_ELECTRODES):
    signal = df.iloc[:, j]
    for window, start in enumerate(range(0, N_SAMPLES, WINDOW_SIZE), start=1):
        # Re-number the samples 0..WINDOW_SIZE-1 so all windows share the same
        # columns, then turn the window into a single row.
        row = signal.iloc[start:start + WINDOW_SIZE].reset_index(drop=True).to_frame().T
        row['electrode'] = j
        row['window'] = window
        window_rows.append(row)

# Concatenate once at the end: growing a frame with DataFrame.append copies
# it on every call and the method no longer exists in pandas 2.0.
df_2 = pd.concat(window_rows)

In [5]:
# Tag the windowed frame with its label and subject, both parsed from the
# file name, then store it in an HDF5 file.
file = "data/raw/Ach-AT/04_0316_ach-at_0.mat"
name = os.path.basename(file)
# The label is the single character right before the 4-character ".mat"
# extension; note that it is stored as a string, not an int.
y = name[-5]
df_2['y'] = y
# The first seven characters of the file name (here "04_0316") are used as
# the subject identifier and as the HDF5 key.
df_2['subject'] = name[:7]
df_2.to_hdf('test.h5', key = name[:7], mode = 'a')

In [6]:
# Read the stored frame back as a round-trip check. No key is passed, so this
# presumably relies on 'test.h5' holding a single object - verify if more keys are added.
test = pd.read_hdf('test.h5')

In [4]:
# Convert every recording under `d` into windowed rows and append them to a
# single HDF5 table.
N_ELECTRODES = 60      # columns of each recording that are processed
N_SAMPLES = 180000     # samples of each trace that are windowed
WINDOW_SIZE = 6000     # samples per window

# get a list of the file names
d = "data/raw/Ach-AT"
filenames = [
    os.path.join(root, fname)
    for root, _dirs, fnames in os.walk(d)
    for fname in fnames
    if fname.endswith(".mat")
]

for file in filenames:
    matfile = loadmat(file)
    df_raw = pd.DataFrame(matfile['filt_data']).T

    # one row should contain WINDOW_SIZE voltage measurements (6s worth),
    # the electrode number and the window number
    window_rows = []
    for j in range(N_ELECTRODES):
        signal = df_raw.iloc[:, j]
        for window, start in enumerate(range(0, N_SAMPLES, WINDOW_SIZE), start=1):
            row = signal.iloc[start:start + WINDOW_SIZE].reset_index(drop=True).to_frame().T
            row['electrode'] = j
            row['window'] = window
            window_rows.append(row)

    # Concatenate once per file: DataFrame.append copies the frame on every
    # call and no longer exists in pandas 2.0.
    df_clean = pd.concat(window_rows)

    name = os.path.basename(file)
    # label = character right before ".mat"; subject = first seven characters
    y = name[-5]
    df_clean['y'] = y
    df_clean['subject'] = name[:7]
    # NOTE: append=True adds these rows to whatever is already stored under
    # 'Ach-AT', so re-running this cell duplicates the data in the table.
    df_clean.to_hdf('data/processed/ach_at_table.h5', key = 'Ach-AT', mode = 'a', format = 'table', append = True)

# NOTE: the cells above are superseded exploratory code; see the functions below for the clean implementation.

In [46]:
def make_windows(matfile, window_size = 6000):
    """ Split the raw MEA signals of one recording into discrete time windows.

        The 'filt_data' array is transposed, so every row of the result is one
        sample and every data column is one row of 'filt_data' (one electrode).
        Consecutive blocks of `window_size` rows share the same window id; the
        last window is shorter when the number of samples is not a multiple of
        `window_size`.

        Parameters
        ----------
        matfile: dict that is the output of loadmat() from scipy.io;
                must contain the key 'filt_data'

        window_size: number of rows (samples) per window, must be positive
                default = 6000
                    6 second windows. Chosen based on average cpm of GI slow waves

        Returns
        -------
        df: DataFrame whose first column 'id' holds the 1-based integer window
            number of each row, followed by one column per electrode

        Raises
        ------
        ValueError: if window_size is not positive
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of rows")

    # retrieve the MEA data and put samples on the rows
    df_ts = pd.DataFrame(matfile['filt_data']).T

    # Row k belongs to window k // window_size (+1 for 1-based ids). Computing
    # this in one vectorised step yields integer ids and avoids the inclusive
    # .loc slice that wrote one row too many on every loop iteration.
    window_ids = np.arange(len(df_ts)) // window_size + 1
    df_ts.insert(0, 'id', window_ids)

    return df_ts

In [37]:
def _label_from_name(f_name):
    """ Return the class label encoded in the end of a file name, or None. """
    # Order matters: the generic "0"/"1" suffixes are tested first, exactly as
    # in the original if/elif chain.
    suffix_labels = (
        ("0", 0),        # baseline
        ("1", 1),        # Ach applied
        ("at_2", 2),     # AT applied after Ach
        ("hex_2", 3),    # Hex applied after Ach
    )
    for suffix, label in suffix_labels:
        if f_name.endswith(suffix):
            return label
    return None


def label_MEA_data(filenames, window_size = 6000):
    """ This function creates a labelled dataset of MEA signals.
        The MEA data must be in .mat files. The dataset is discretised into time windows.

        Every file is windowed with make_windows(); window ids are offset so
        that they keep increasing across files instead of restarting at 1.
        For each file the last window id and the file name are printed.

        Arguments
        ---------
        filenames: list of strings, file paths of the MEA .mat files

        window_size: int number of rows per window
            default = 6000
                6 second windows. Chosen based on average cpm of GI slow waves

        Returns
        -------
        dataset: DataFrame with the window id 'id', one column per electrode,
            'subject' (file name without directory and extension) and the
            target variable 'y' (0 baseline, 1 Ach, 2 AT after Ach,
            3 Hex after Ach). Files whose name matches none of the known
            endings get no label, so 'y' is missing/NaN for their rows.
            An empty DataFrame is returned when `filenames` is empty.
    """
    last_win = 0
    subject_frames = []

    for file in filenames:
        matfile = loadmat(file)
        # file name without directory and without extension
        f_name = os.path.splitext(os.path.basename(file))[0]

        df_subject = make_windows(matfile, window_size)
        # shift the ids past the last id used by the previous file
        df_subject['id'] += last_win
        df_subject['subject'] = f_name

        last_win = df_subject['id'].iloc[-1]
        print(last_win)

        # determine which label needs to be applied
        label = _label_from_name(f_name)
        if label is not None:
            df_subject['y'] = label

        subject_frames.append(df_subject)

        print(f_name)

    # pd.concat raises on an empty list, so keep the empty-input result explicit
    if not subject_frames:
        return pd.DataFrame()

    # Concatenate once: DataFrame.append copied the growing dataset for every
    # file and no longer exists in pandas 2.0.
    return pd.concat(subject_frames).reset_index(drop=True)

In [38]:
def load_MEA_data(folder = "data/raw/Ach-AT-Hex"):
    """ This function retrieves a list of file paths of MEA data

        The folder is searched recursively and only files whose name ends
        with ".mat" are kept.

        Arguments
        ---------
        folder: specifies the root folder from where to load the data
                default = "data/raw/Ach-AT-Hex"

        Returns
        -------
        sorted list of the paths of all .mat files found under the folder;
        empty if there are none

    """
    # get all the paths of the files to be loaded in
    return sorted(
        os.path.join(root, fname)
        for root, _dirs, fnames in os.walk(folder)
        for fname in fnames
        if fname.endswith(".mat")
    )

In [49]:
# Collect the sorted .mat paths of the Ach-AT recordings; the folder is passed
# explicitly because the function's default points at a different folder.
files = load_MEA_data(folder = "../data/raw/Ach-AT")

In [50]:
# Build the labelled, windowed dataset; for every file the function prints the
# last window id reached and then the file name.
dataset = label_MEA_data(files)

30.0
00_0315_ach-at_0
60.0
00_0315_ach-at_1
90.0
00_0315_ach-at_2
120.0
02_0315_ach-at_0
150.0
02_0315_ach-at_1
180.0
02_0315_ach-at_2
210.0
04_0316_ach-at_0
240.0
04_0316_ach-at_1
270.0
04_0316_ach-at_2
300.0
05_0316_ach-at_0
330.0
05_0316_ach-at_1
360.0
05_0316_ach-at_2
390.0
06_0317_ach-at_0
420.0
06_0317_ach-at_1
450.0
06_0317_ach-at_2


In [51]:
# Display the combined dataset (the rendering below is truncated to the first and last rows).
dataset

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,52,53,54,55,56,57,58,59,subject,y
0,1.0,-135.904409,-126.704873,-129.462254,-121.519825,-124.371209,-102.553959,-426.883940,-103.540012,-293.788800,...,-140.741374,-136.679302,-182.152656,-145.942890,-145.357315,-137.695972,-143.410471,-141.207962,00_0315_ach-at_0,0
1,1.0,-135.970862,-126.796326,-129.559494,-121.627592,-124.485424,-102.687438,-426.496982,-103.706237,-293.582639,...,-140.782349,-136.725187,-181.736284,-145.996543,-145.411239,-137.754658,-143.477315,-141.291289,00_0315_ach-at_0,0
2,1.0,-136.036568,-126.887027,-129.655970,-121.734591,-124.598914,-102.820126,-426.112029,-103.871656,-293.377845,...,-140.822649,-136.770327,-181.322447,-146.049450,-145.464410,-137.812580,-143.543406,-141.373857,00_0315_ach-at_0,0
3,1.0,-136.101527,-126.976977,-129.751683,-121.840824,-124.711680,-102.952022,-425.729129,-104.036270,-293.174450,...,-140.862274,-136.814723,-180.911182,-146.101611,-145.516827,-137.869738,-143.608744,-141.455666,00_0315_ach-at_0,0
4,1.0,-136.165739,-127.066176,-129.846632,-121.946291,-124.823721,-103.083127,-425.348329,-104.200076,-292.972489,...,-140.901224,-136.858376,-180.502526,-146.153025,-145.568492,-137.926133,-143.673330,-141.536716,00_0315_ach-at_0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699995,450.0,-3.811497,-4.925875,-6.572545,-7.509711,-10.696284,-19.359255,-40.478978,-12.398888,-5.702694,...,-14.355237,-13.943807,-24.701074,-6.406025,-9.000151,-7.542257,-4.993935,-4.013093,06_0317_ach-at_2,2
2699996,450.0,-3.735131,-4.845406,-6.490815,-7.415802,-10.617339,-19.266423,-40.417400,-12.334051,-5.631956,...,-14.267916,-13.855457,-24.610803,-6.297250,-8.912386,-7.464207,-4.920858,-3.932371,06_0317_ach-at_2,2
2699997,450.0,-3.658896,-4.765051,-6.409177,-7.321970,-10.538463,-19.173608,-40.355708,-12.269282,-5.561270,...,-14.180695,-13.767222,-24.520654,-6.188614,-8.824755,-7.386291,-4.847906,-3.851779,06_0317_ach-at_2,2
2699998,450.0,-3.582794,-4.684808,-6.327631,-7.228215,-10.459657,-19.080811,-40.293903,-12.204582,-5.490635,...,-14.093576,-13.679102,-24.430625,-6.080117,-8.737261,-7.308510,-4.775080,-3.771316,06_0317_ach-at_2,2
