In [1]:
import pandas as pd
import numpy as np
import os

# Loading and pre-processing data into alcoholics and controls

In [2]:
def load_data():
    '''
    Loads EEG data downloaded from UCI ML Repo @ https://archive.ics.uci.edu/ml/datasets/eeg+database
    ''' 
    alcoholic_data = pd.DataFrame()
    control_data = pd.DataFrame()
    MAINPATH = '/home/jmcarpenter/Desktop/msan622/EEG/data/SMNI_CMI_TRAIN'
    subs = os.listdir(MAINPATH)
    for subidx, sub in enumerate(subs):
        print('subject: ' + str(subidx))
        subject_data = pd.DataFrame()
        SUBJECTPATH = os.path.join(MAINPATH, sub)
        trials = os.listdir(SUBJECTPATH)
        for trialidx, t in enumerate(trials):
            print('\ttrial: ' + str(trialidx))
            TRIALPATH = os.path.join(SUBJECTPATH, t)
            with open(TRIALPATH, 'r') as f:
                trial = [line.strip().split() for line in f.readlines()[5:]]
                trial_df = pd.DataFrame(trial, columns=['trial_num', 'channel', 'timepoint', 'signal'])
                trial_df = trial_df[trial_df['trial_num'] != '#']
                trial_df['trial_num'] = trial_df['trial_num'].astype(int)
                trial_df['timepoint'] = trial_df['timepoint'].astype(int)
                trial_df['signal'] = trial_df['signal'].astype(float)
                trial_df['subject'] = subidx
                subject_data = pd.concat([subject_data, trial_df])
        if sub[3] == 'a':
            alcoholic_data = pd.concat([alcoholic_data, subject_data])
        elif sub[3] == 'c':
            control_data = pd.concat([control_data, subject_data])
        else:
            return sub
    return alcoholic_data.reset_index(drop=True), control_data.reset_index(drop=True)

In [4]:
alcoholic_data, control_data = load_data()

subject: 0
	trial: 0
	trial: 1
	trial: 2
	trial: 3
	trial: 4
	trial: 5
	trial: 6
	trial: 7
	trial: 8
	trial: 9
	trial: 10
	trial: 11
	trial: 12
	trial: 13
	trial: 14
	trial: 15
	trial: 16
	trial: 17
	trial: 18
	trial: 19
	trial: 20
	trial: 21
	trial: 22
	trial: 23
	trial: 24
	trial: 25
	trial: 26
	trial: 27
	trial: 28
	trial: 29
subject: 1
	trial: 0
	trial: 1
	trial: 2
	trial: 3
	trial: 4
	trial: 5
	trial: 6
	trial: 7
	trial: 8
	trial: 9
	trial: 10
	trial: 11
	trial: 12
	trial: 13
	trial: 14
	trial: 15
	trial: 16
	trial: 17
	trial: 18
	trial: 19
	trial: 20
	trial: 21
	trial: 22
	trial: 23
	trial: 24
	trial: 25
	trial: 26
	trial: 27
	trial: 28
	trial: 29
subject: 2
	trial: 0
	trial: 1
	trial: 2
	trial: 3
	trial: 4
	trial: 5
	trial: 6
	trial: 7
	trial: 8
	trial: 9
	trial: 10
	trial: 11
	trial: 12
	trial: 13
	trial: 14
	trial: 15
	trial: 16
	trial: 17
	trial: 18
	trial: 19
	trial: 20
	trial: 21
	trial: 22
	trial: 23
	trial: 24
	trial: 25
	trial: 26
	trial: 27
	trial: 28
	trial: 29
subject

In [5]:
print(alcoholic_data.shape, control_data.shape)
alcoholic_data.head()

(4915200, 5) (4915200, 5)


Unnamed: 0,trial_num,channel,timepoint,signal,subject
0,27,FP1,0,-0.285,2
1,27,FP1,1,-0.285,2
2,27,FP1,2,-1.261,2
3,27,FP1,3,-3.215,2
4,27,FP1,4,-5.656,2


In [6]:
def to_upper(x):
    return x.upper()

alcoholic_data['channel'] = alcoholic_data['channel'].apply(to_upper)
control_data['channel'] = control_data['channel'].apply(to_upper)

In [7]:
def remove_unnecessary_channels(data):
    data = data[data['channel'] != 'X']
    data = data[data['channel'] != 'Y']
    data = data[data['channel'] != 'ND']
    return data

alcoholic_data = remove_unnecessary_channels(alcoholic_data)
control_data = remove_unnecessary_channels(control_data)
#alcoholic_data.reset_index(drop=True).to_feather('data/alcoholics')
#control_data.reset_index(drop=True).to_feather('data/controls')

# Reshape data for network analysis

In [2]:
alcoholic_data = pd.read_feather('data/alcoholics')
control_data = pd.read_feather('data/controls')

In [3]:
alcoholic_data['group'] = 'alcoholic'
control_data['group'] = 'control'

In [4]:
eeg_data = pd.concat([alcoholic_data, control_data])
eeg_data

Unnamed: 0,trial_num,channel,timepoint,signal,subject,group
0,27,FP1,0,-0.285,2,alcoholic
1,27,FP1,1,-0.285,2,alcoholic
2,27,FP1,2,-1.261,2,alcoholic
3,27,FP1,3,-3.215,2,alcoholic
4,27,FP1,4,-5.656,2,alcoholic
5,27,FP1,5,-8.097,2,alcoholic
6,27,FP1,6,-9.562,2,alcoholic
7,27,FP1,7,-10.050,2,alcoholic
8,27,FP1,8,-10.050,2,alcoholic
9,27,FP1,9,-8.586,2,alcoholic


In [5]:
eeg_data[eeg_data['group'] == 'alcoholic']['subject'].unique()

array([ 2,  3,  5,  6,  8,  9, 12, 15, 16, 18])

In [6]:
eeg_data[eeg_data['group'] == 'control']['subject'].unique()

array([ 0,  1,  4,  7, 10, 11, 13, 14, 17, 19])

## Group by sub, chan, and tp to extract important info for network

In [7]:
subject_channel_data = eeg_data.groupby(['subject','channel','timepoint'])[['signal', 'group']].agg({'signal': 'mean'})

In [8]:
subject_channel_reindexed = subject_channel_data.reset_index(drop=False)

In [9]:
subject_channel_reindexed.head()

Unnamed: 0_level_0,subject,channel,timepoint,signal
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,signal
0,0,AF1,0,-0.6205
1,0,AF1,1,-0.588067
2,0,AF1,2,-0.4089
3,0,AF1,3,-0.4089
4,0,AF1,4,-0.474033


## Binning data for less time points and more data per bin

In [10]:
binned_data = pd.DataFrame(columns = ['subject', 'channel', 'bin',
                                      'tp00', 'tp01', 'tp02', 'tp03',
                                      'tp04', 'tp05', 'tp06', 'tp07',
                                      'tp08', 'tp09', 'tp10', 'tp11',
                                      'tp12', 'tp13', 'tp14', 'tp15'])
tp_names = ['tp00', 'tp01', 'tp02', 'tp03',
            'tp04', 'tp05', 'tp06', 'tp07',
            'tp08', 'tp09', 'tp10', 'tp11',
            'tp12', 'tp13', 'tp14', 'tp15']
idx = 0
for samp in range(0, 241, 4):
    tmp_df = subject_channel_reindexed.loc[subject_channel_reindexed['timepoint'].isin(range(samp, samp + 16))]
    pivoted_df = tmp_df.pivot_table(values='signal',index='timepoint', columns=['subject', 'channel']).T.reset_index().drop('level_0', axis=1)
    pivoted_df.rename(columns = {cur_col: new_col for cur_col, new_col in 
                                 zip(pivoted_df.columns.drop(['subject','channel']), tp_names)}, inplace=True)
    pivoted_df['bin'] = idx
    binned_data = pd.concat([binned_data, pivoted_df])
    
    idx += 1


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [11]:
binned_data.reset_index(drop=True)

Unnamed: 0,bin,channel,subject,tp00,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,tp11,tp12,tp13,tp14,tp15
0,0,AF1,0,-0.620500,-0.588067,-0.408900,-0.408900,-0.474033,-0.620500,-0.701900,-0.750700,-0.767000,-0.701867,-0.750700,-0.783333,-1.043767,-1.076233,-1.076267,-0.799433
1,0,AF2,0,-0.167867,-0.118933,-0.135233,-0.330533,-0.265533,0.125067,0.597167,0.824933,0.678433,-0.037633,-0.705000,-0.997933,-0.786367,-0.330733,0.092633,0.092600
2,0,AF7,0,-3.163400,-3.846933,-3.521400,-2.447100,-1.519433,-1.275267,-1.893800,-2.837800,-3.277200,-2.935333,-2.154300,-1.454300,-1.503200,-2.186800,-3.065633,-3.521367
3,0,AF8,0,-1.862200,-1.829767,-1.878533,-2.008700,-1.943667,-1.634400,-1.064733,-0.739167,-0.755500,-0.999567,-1.439100,-1.797100,-1.927367,-1.878533,-1.748333,-1.471600
4,0,AFZ,0,-0.107767,-0.075233,-0.107833,-0.107833,-0.091533,-0.010167,0.038667,-0.026467,-0.026400,-0.270733,-0.530967,-0.628667,-0.514633,-0.384500,-0.319400,-0.221767
5,0,C1,0,0.031500,0.129233,0.096600,-0.147500,-0.456833,-0.700967,-0.831067,-0.847333,-0.749733,-0.619500,-0.570733,-0.342833,-0.196400,-0.098667,-0.033533,-0.066133
6,0,C2,0,-0.057033,-0.138333,-0.236033,-0.219733,-0.203500,-0.089533,-0.008167,0.105800,0.219633,0.170933,0.008167,-0.219700,-0.317500,-0.415033,-0.431233,-0.398700
7,0,C3,0,-0.009200,0.397633,0.381467,-0.090500,-0.643933,-1.164733,-1.490300,-1.685533,-1.766900,-1.653100,-1.311300,-0.871800,-0.399733,0.007100,0.218667,0.169800
8,0,C4,0,-0.062000,-0.094667,-0.208667,-0.273633,-0.338733,-0.371267,-0.192200,0.003033,0.198367,0.149500,-0.013267,-0.306167,-0.452733,-0.534067,-0.533967,-0.631733
9,0,C5,0,-0.592400,0.140000,0.433000,0.156233,-0.299367,-0.690000,-0.869067,-1.015533,-1.210733,-1.406200,-1.438667,-1.080700,-0.543467,0.221467,0.530700,0.433000


In [14]:
binned_data['bin'] = binned_data['bin'].astype(int)
binned_data['subject'] = binned_data['subject'].astype(int)
binned_data.reset_index(drop=True).to_feather('binned_data_notsmoothed')

In [12]:
alc_binned_data = binned_data[binned_data['subject'].isin(eeg_data[eeg_data['group'] == 'alcoholic']['subject'].unique())]
ctrl_binned_data = binned_data[binned_data['subject'].isin(eeg_data[eeg_data['group'] == 'control']['subject'].unique())]

In [20]:
def save_data_to_feather(binned_data, filepath):
    binned_data['bin'] = binned_data['bin'].astype(int)
    binned_data['subject'] = binned_data['subject'].astype(int)
    binned_data.reset_index(drop=True).to_feather(filepath)

In [21]:
save_data_to_feather(alc_binned_data, 'data/alc_binned_data')
save_data_to_feather(ctrl_binned_data, 'data/ctrl_binned_data')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
np.save('data/channel_order.npy', alc_binned_data['channel'].unique())