Wednesday, August 23, 2017

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

# Data preprocessing

## Large CSV file preprocessing

In [328]:
def current_file_preprocessing(csv):
    '''
    Returns df_strength, df_location, df_normal
    from the csv file containing current information.

    The csv file has 5 lines of header :
    - Latency
    - Residual Deviation (normalized)
    - Residual Deviation (original)
    - Explained Variance (normalized)
    - Explained Variance (original)

    The first line is consumed by ``pd.read_csv`` as the column names
    (the row-label column is 'Latency [ms]'); rows before position 4 of the
    parsed frame are discarded as header rows. Any other row whose first
    word is not 'Strength', 'Location' or 'Normal' is ignored.

    Main dataframe is composed of three parts :
    1. Strength
        - for each activations along the latency
        - row labels look like 'Strength <number>'
    2. Location
        - for each activations
        - row labels look like 'Location <number> <axis>'; only the first
          data column is used as the coordinate
    3. Normal
        - for each activations along the latency
        - row labels look like 'Normal <number> <axis>'

    Parameters
    ----------
    csv : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    df_strength : DataFrame indexed by ('info', 'number'), one column per
        data column of the file. 'number' is kept as a string here.
    df_location : DataFrame indexed by ('info', 'number') with 'number' as
        int and one column per axis label.
    df_normal : DataFrame indexed by ('info', 'number') with 'number' as
        int and (data column, axis) MultiIndex columns.

    Raises
    ------
    KeyError
        If the 'Latency [ms]' column or one of the three sections is
        missing from the file.
    '''
    label_col = 'Latency [ms]'

    df_raw = pd.read_csv(csv, encoding='ISO-8859-1')

    # Positional slice replacing the removed ``.ix[4:]`` indexer; on the
    # default RangeIndex both select the same rows. ``.copy()`` makes the
    # column assignments below operate on an independent frame.
    df_data = df_raw.iloc[4:].copy()

    # First word of the row label identifies the section of each row.
    df_data['info'] = df_data[label_col].str.split(' ').str[0]
    df_info_gb = df_data.groupby('info')

    def _section(name):
        # Return an independent copy of one section plus its split labels,
        # with the second label token stored in a 'number' column.
        section = df_info_gb.get_group(name).copy()
        tokens = section[label_col].str.split(' ')
        section['number'] = tokens.str[1]
        return section, tokens

    # Strength df
    df_strength, _ = _section('Strength')
    df_strength = df_strength.drop(label_col, axis=1)
    # Move the two trailing helper columns ('info', 'number') to the front
    # and use them as the index.
    cols = df_strength.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    df_strength = df_strength[cols].set_index(['info', 'number'])

    # Location df
    df_location, tokens = _section('Location')
    df_location['axis'] = tokens.str[2]
    df_location['number'] = df_location['number'].astype('int')
    df_location = df_location.drop(label_col, axis=1)
    # After dropping the label column, the first remaining column is the
    # first data column of the file; it is the only one read for locations.
    first_num_col = df_location.columns[0]
    df_location = df_location[['info', 'number', 'axis', first_num_col]]
    df_location.columns = ['info', 'number', 'axis', 'coord']
    df_location = df_location.pivot_table(index=['info', 'number'],
                                          columns='axis',
                                          values='coord')

    # Normal df
    df_normal, tokens = _section('Normal')
    df_normal['axis'] = tokens.str[2]
    df_normal['number'] = df_normal['number'].astype('int')
    df_normal = df_normal.drop(label_col, axis=1)
    # Data columns are recognised by containing a digit in their name;
    # the helper columns ('info', 'number', 'axis') contain none.
    value_cols = [x for x in df_normal.columns if re.search(r'\d', x)]
    df_normal = df_normal.pivot_table(index=['info', 'number'],
                                      columns='axis',
                                      values=value_cols)

    return df_strength, df_location, df_normal

In [334]:
# Folder holding the exported data files.
DATA_DIR = u'/Users/kangik/Dropbox/project/2017_08_23_ERP_machine_learning/data'

# Current CSV exports available in DATA_DIR. Exactly one is processed per run:
# change CURRENT_CSV_KEY to switch files instead of re-assigning csv_location
# several times in a row (only the last assignment ever took effect).
CURRENT_CSV_FILES = {
    'Ctrl': u'101027_Ctrl_LSJ2_current.csv',
    'MMN': u'101027_HC_LSJ2_MMN_current.csv',
    'P3': u'101027_HC_LSJ2_P3_current.csv',
}
CURRENT_CSV_KEY = 'P3'

csv_location = DATA_DIR + u'/' + CURRENT_CSV_FILES[CURRENT_CSV_KEY]

In [335]:
# Unpack the three tables in the order the function returns them:
# a = strength, b = location, c = normal.
a,b,c = current_file_preprocessing(csv_location)

In [336]:
# Preview the first rows of the strength table (one row per 'Strength <number>' label).
a.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,250.0,251.0,252.0,253.0,254.0,255.0,256.0,257.0,258.0,259.0,...,492.0,493.0,494.0,495.0,496.0,497.0,498.0,499.0,500.0,Unnamed: 22_level_0
info,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Strength,1,0.016585,0.0176,0.018829,0.020196,0.021633,0.023082,0.024496,0.025838,0.027085,0.02822,...,0.018481,0.017893,0.017358,0.016918,0.016614,0.016481,0.016544,0.01681,0.01727,
Strength,2,0.024249,0.025292,0.026546,0.027953,0.029456,0.031001,0.032542,0.034043,0.035475,0.036819,...,0.023817,0.022657,0.021419,0.020141,0.018868,0.017652,0.016552,0.015626,0.014929,
Strength,3,0.025989,0.02686,0.027826,0.028859,0.029931,0.031012,0.032077,0.033101,0.034067,0.034959,...,0.021138,0.020635,0.020165,0.019751,0.019418,0.019187,0.019072,0.019083,0.01922,
Strength,4,0.031114,0.032179,0.033284,0.03441,0.035534,0.036635,0.037693,0.038691,0.039615,0.040452,...,0.022475,0.021863,0.021246,0.020643,0.020079,0.019576,0.019156,0.018836,0.018627,
Strength,5,0.028598,0.029507,0.030545,0.031678,0.032872,0.034093,0.035309,0.036493,0.037623,0.038681,...,0.024083,0.023139,0.02214,0.021114,0.020094,0.019115,0.018217,0.017436,0.016804,


In [337]:
# Preview the first rows of the location table (one column per axis label).
b.head()

Unnamed: 0_level_0,axis,x,y,z
info,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Location,1,69.0,-18.4,4.4
Location,2,69.9,-22.7,3.3
Location,3,69.6,-21.1,7.7
Location,4,69.6,-24.7,9.8
Location,5,70.3,-24.4,6.2


In [338]:
# Preview the first rows of the normal table (columns are (data column, axis) pairs).
c.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,250.0,250.0,250.0,251.0,251.0,251.0,252.0,252.0,252.0,253.0,...,497.0,498.0,498.0,498.0,499.0,499.0,499.0,500.0,500.0,500.0
Unnamed: 0_level_1,axis,x,y,z,x,y,z,x,y,z,x,...,z,x,y,z,x,y,z,x,y,z
info,number,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Normal,1,-0.872,-0.473,-0.127,-0.836,-0.5,-0.227,-0.797,-0.519,-0.308,-0.761,...,-0.295,0.555,0.808,-0.2,0.565,0.818,-0.104,0.57,0.821,-0.01
Normal,2,-0.757,-0.623,-0.195,-0.722,-0.636,-0.273,-0.686,-0.645,-0.338,-0.652,...,-0.743,0.388,0.622,-0.68,0.412,0.685,-0.6,0.435,0.748,-0.502
Normal,3,-0.901,-0.432,-0.037,-0.89,-0.447,-0.09,-0.877,-0.46,-0.138,-0.863,...,-0.438,0.385,0.842,-0.377,0.415,0.855,-0.312,0.442,0.862,-0.247
Normal,4,-0.829,-0.548,-0.114,-0.818,-0.554,-0.152,-0.808,-0.559,-0.187,-0.797,...,-0.622,0.077,0.811,-0.58,0.119,0.839,-0.532,0.161,0.863,-0.479
Normal,5,-0.769,-0.621,-0.152,-0.748,-0.63,-0.21,-0.726,-0.637,-0.261,-0.704,...,-0.745,0.217,0.684,-0.696,0.243,0.732,-0.636,0.27,0.779,-0.566


## Text file preprocessing

In [324]:
def peak_preprocessing(textfile):
    '''
    Load a tab-separated peak text file into a DataFrame.

    The first 5 lines of the file are skipped and the remaining lines are
    read as data rows with fixed column names.

    Parameters
    ----------
    textfile : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    DataFrame with columns
    ['channel', 'x', 'y', 'z', 'minmax', 'latency'].
    '''
    # Read from the argument itself; the previous body read the notebook-level
    # variable ``text_data_loc`` and silently ignored ``textfile``.
    df = pd.read_csv(textfile,
                     sep='\t',
                     skiprows=5,
                     names=['channel', 'x', 'y', 'z', 'minmax', 'latency'],
                     encoding='ISO-8859-1')
    return df

In [339]:
# Folder holding the exported data files.
DATA_DIR = u'/Users/kangik/Dropbox/project/2017_08_23_ERP_machine_learning/data'

# Peak text exports available in DATA_DIR. Exactly one is loaded per run:
# change PEAK_TXT_KEY to switch files instead of re-assigning text_data_loc
# twice in a row (only the last assignment ever took effect).
PEAK_TXT_FILES = {
    'MMN': u'101027_HC_LSJ2_MMN_peak.txt',
    'P3': u'101027_HC_LSJ2_P3_peak.txt',
}
PEAK_TXT_KEY = 'P3'

text_data_loc = DATA_DIR + u'/' + PEAK_TXT_FILES[PEAK_TXT_KEY]

In [340]:
# Load the peak text file at text_data_loc into a DataFrame.
d = peak_preprocessing(text_data_loc)

In [341]:
# Preview the first rows of the peak table.
d.head()

Unnamed: 0,channel,x,y,z,minmax,latency
0,FP1-avg,29.0,-106.5,32.9,-7.049,500.0
1,FPZ-avg,-0.0,-112.2,38.3,-5.449,491.0
2,FP2-avg,-29.0,-107.5,32.8,-4.88,490.0
3,AF3-avg,34.0,-104.9,62.3,-3.424,500.0
4,AF4-avg,-36.0,-104.9,62.3,-2.628,500.0
