In [19]:
from collections import defaultdict
from os import listdir
from os.path import isfile, join
import re
import pandas as pd
TESTING = True

In [20]:
"""
SQL DISCOVERY QUERIES FOR METADATA (used to find e.g. all linear experiments)
select f.topdir, f.session from session s, file f where s.behavior = 'linear' and s.session=f.session 
order by f.topdir desc;
"""

"\nSQL DISCOVERY QUERIES FOR METADATA (used to find e.g. all linear experiments)\nselect f.topdir, f.session from session s, file f where s.behavior = 'linear' and s.session=f.session \norder by f.topdir desc;\n"

In [21]:
"""
| Record data from channels (amplify by 1000x, record at 20Khz (20,000Hz) or 32,552Hz sample
 | rate, bandpass 1-5kHz). Recordings made by either DataMax recording device
 | (DataMax system; RC Electronics) at 20kHz, or a NeuraLynx recording device
 | (NeuraLynx system) at 32,552Hz. Data sets recorded using DataMax (20KHz) are:
 | ec012, ec013, ec014, ec016, f01_m, g01_m, i01_m, j01_m. Data sets recorded using
 | NeuraLynx (32,552Hz) are: gor01, pin01, vvp01, ec014 (ec014.n329 only, all other
 | sessions from rat ec014 were recorded by DataMax). The sampling frequency is also
 | available in .xml files.
"""
def get_freq(session_dir):
    frequency_is_32552 = ["gor01*", "pin01*", "vvp01*", "ec014\.n329*"]
    frequency_is_32552 = [re.compile(x) for x in frequency_is_32552]
    for freq_reg in frequency_is_32552:
        if freq_reg.match(session_dir):
            return 32552
    return 20000

In [22]:
linear_experiments = {'ec016.59': ['ec016.1047'],
 'ec016.53': ['ec016.931'],
 'ec016.49': ['ec016.850'],
 'ec016.45': ['ec016.749'],
 'ec016.44': ['ec016.733'],
 'ec016.41': ['ec016.674'],
 'ec016.19': ['ec016.269'],
 'ec016.17': ['ec016.233', 'ec016.234'],
 'ec014.36': ['ec014.639'],
 'ec014.29': ['ec014.468'],
 'ec013.56': ['ec013.978', 'ec013.979', 'ec013.980'],
 'ec013.55': ['ec013.965', 'ec013.966', 'ec013.969'],
 'ec013.54': ['ec013.949', 'ec013.950', 'ec013.951'],
 'ec013.53': ['ec013.932', 'ec013.933', 'ec013.934'],
 'ec013.51': ['ec013.906', 'ec013.910', 'ec013.911'],
 'ec013.49': ['ec013.874', 'ec013.880', 'ec013.881', 'ec013.882'],
 'ec013.48': ['ec013.859', 'ec013.860', 'ec013.861'],
 'ec013.47': ['ec013.840', 'ec013.842', 'ec013.843'],
 'ec013.45': ['ec013.799', 'ec013.805', 'ec013.806', 'ec013.807'],
 'ec013.44': ['ec013.788'],
 'ec013.42': ['ec013.761', 'ec013.762', 'ec013.764'],
 'ec013.41': ['ec013.737', 'ec013.738', 'ec013.739'],
 'ec013.40': ['ec013.718', 'ec013.719', 'ec013.720'],
 'ec013.39': ['ec013.683', 'ec013.684', 'ec013.685'],
 'ec013.38': ['ec013.669', 'ec013.670', 'ec013.671'],
 'ec013.37': ['ec013.639', 'ec013.642', 'ec013.643'],
 'ec013.36': ['ec013.626', 'ec013.627', 'ec013.628'],
 'ec013.35': ['ec013.589', 'ec013.599', 'ec013.600', 'ec013.601'],
 'ec013.34': ['ec013.573', 'ec013.574', 'ec013.576'],
 'ec013.33': ['ec013.554', 'ec013.555', 'ec013.556'],
 'ec013.32': ['ec013.531', 'ec013.532', 'ec013.533'],
 'ec013.31': ['ec013.502', 'ec013.503', 'ec013.504'],
 'ec013.30': ['ec013.454', 'ec013.465', 'ec013.466', 'ec013.469'],
 'ec013.29': ['ec013.440', 'ec013.441', 'ec013.442'],
 'ec013.28': ['ec013.395', 'ec013.412', 'ec013.413', 'ec013.414'],
 'ec013.27': ['ec013.374', 'ec013.375', 'ec013.386', 'ec013.387', 'ec013.388'],
 'ec013.21': ['ec013.251', 'ec013.252'],
 'ec013.18': ['ec013.205', 'ec013.206', 'ec013.208'],
 'ec013.15': ['ec013.156', 'ec013.157'],
 'ec012ec.27': ['ec012ec.560', 'ec012ec.561'],
 'ec012ec.24': ['ec012ec.503', 'ec012ec.504'],
 'ec012ec.22': ['ec012ec.465', 'ec012ec.466', 'ec012ec.467'],
 'ec012ec.21': ['ec012ec.444', 'ec012ec.445'],
 'ec012ec.18': ['ec012ec.374', 'ec012ec.375'],
 'ec012ec.17': ['ec012ec.356', 'ec012ec.357'],
 'ec012ec.14': ['ec012ec.269', 'ec012ec.270', 'ec012ec.271'],
 'ec012ec.13': ['ec012ec.239', 'ec012ec.240']}

In [153]:
def get_datadirs():
    if TESTING:
        yield 'data/ec013.40/ec013.719/'
    else:
        for key, value in linear_experiments.iteritems():
            for session in value:
                yield 'data/' + key + '/' + session + '/'

In [154]:
data = defaultdict(lambda: {})
# parse time, location, and spike data for each electrode
for session_dir in get_datadirs():
    location_reg = re.compile(".*\.whl")
    time_reg = re.compile(".*\.res\.*")
    cluster_reg = re.compile(".*\.clu\.*")
    freq = get_freq(session_dir)
    files = [f for f in listdir(session_dir) if isfile(join(session_dir, f))]
    data_files = defaultdict(list)
    for file in files:
        if location_reg.match(file):
            location_df = pd.read_csv(join(session_dir, file), delimiter='\t', header=None)
            location_df['time'] = location_df.index / 39.0625
            location_df['time'] = pd.to_timedelta(location_df['time'], unit="sec")
            location_df.drop_duplicates(subset=[0,1,2,3], keep=False, inplace=True)
            data[session_dir]['location'] = location_df
        elif time_reg.match(file):
            electrode_num = int(file.rsplit('.', 1)[1])
            time_series = pd.read_csv(join(session_dir, file), delimiter='\n', header=None, squeeze=True)
            time_series /= freq
            time_series = pd.to_timedelta(time_series, unit="sec")
            if electrode_num not in data[session_dir]:
                data[session_dir][electrode_num] = pd.DataFrame()
            data[session_dir][electrode_num]['time'] = time_series
        elif cluster_reg.match(file):
            electrode_num = int(file.rsplit('.', 1)[1])
            series = pd.read_csv(join(session_dir, file), delimiter='\n', header=None, squeeze=True)
            n_clusters = series.iloc[0]
            series = series.iloc[1:]
            series.reset_index(drop=True, inplace=True)
            if electrode_num not in data[session_dir]:
                data[session_dir][electrode_num] = pd.DataFrame()
            data[session_dir][electrode_num]['spikes'] = series
            
# combine data from each electrode into one concantenated dataframe
concantenated_data = {}
for session, session_data in data.items():
    concantenated_spikes = pd.DataFrame(columns=['spikes', 'time'])
    for electrode_num, spike_data in session_data.items():
        if electrode_num == "location":
            continue
        spike_data['spikes'] = spike_data['spikes'].apply(lambda x: str(electrode_num) + '-' + str(x))
        concantenated_spikes = pd.concat([concantenated_spikes, spike_data], ignore_index=True)
    concantenated_spikes
    concantenated_data[session] = concantenated_spikes.sort_values('time').reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [155]:
data['data/ec013.40/ec013.719/']['location'].describe()

Unnamed: 0,0,1,2,3,time
count,45103.0,45103.0,45103.0,45103.0,45103
mean,179.877237,102.837839,181.47509,102.262024,0 days 00:10:04.757540
std,79.549754,71.200105,77.771977,68.573864,0 days 00:05:34.688902
min,-1.0,-1.0,59.37998,9.970636,0 days 00:00:26.598400
25%,91.246255,31.68301,90.69484,31.76491,0 days 00:05:15.251200
50%,201.9935,84.99194,206.0501,83.74844,0 days 00:10:03.904000
75%,262.6063,178.73235,261.0311,180.28415,0 days 00:14:52.556800
max,282.5051,217.0655,281.6409,211.668,0 days 00:19:55.699200


In [156]:
len(set(list(concantenated_data['data/ec013.40/ec013.719/'].spikes)))

82

In [157]:
data['data/ec013.40/ec013.719/'][1]

Unnamed: 0,spikes,time
0,1-4,00:00:00.010550
1,1-12,00:00:00.013850
2,1-13,00:00:00.023450
3,1-12,00:00:00.025050
4,1-12,00:00:00.031950
5,1-12,00:00:00.040550
6,1-12,00:00:00.046850
7,1-1,00:00:00.050350
8,1-12,00:00:00.055200
9,1-13,00:00:00.072700


In [158]:
# TODO: match location data with spike data. seems weird that if both are sampled at the same frequency, which the
# processing flowchart claims they are (https://crcns.org/files/data/hc3/crcns-hc3-processing-flowchart.pdf)
# that the location data only has ~30k vectors (which would amount to 1s of video)

# https://crcns.org/forum/using-datasets/62983963#926694452 - sampling rate of .whl is 39.06 Hz

In [159]:
def bucket_spikes(spike_train, bucket_size=1):
#     freq = pd.timedelta_range(0, max(data['data/ec013.40/ec013.719/'][1]['time']), freq="1S")
#     spike_train = spike_train.append([pd.Timedelta(0, unit="sec"), "-1"], ignore_index=True)
#     spike_train.iloc[0]
    spike_train.loc[len(spike_train)-1] = [float('nan'), pd.Timedelta(0, unit="sec")]
    return spike_train
    binned = spike_train.groupby(pd.Grouper(key='time', freq='{}S'.format(bucket_size), base=spike_train['time'][0].seconds))
    binned = binned['spikes'].value_counts()
    binned = pd.DataFrame(binned).unstack(fill_value=0)
    binned.columns = binned.columns.droplevel()
    binned = binned.reindex(sorted(binned.columns), axis=1)
    return binned

In [160]:
a = bucket_spikes(data['data/ec013.40/ec013.719/'][1])

In [161]:
a.iloc[0]

spikes                       1-4
time      0 days 00:00:00.010550
Name: 0, dtype: object

In [167]:
len(a)

66457

In [172]:
a

Unnamed: 0,spikes,time
0,1-4,00:00:00.010550
1,1-12,00:00:00.013850
2,1-13,00:00:00.023450
3,1-12,00:00:00.025050
4,1-12,00:00:00.031950
5,1-12,00:00:00.040550
6,1-12,00:00:00.046850
7,1-1,00:00:00.050350
8,1-12,00:00:00.055200
9,1-13,00:00:00.072700


In [171]:
a.loc[len(a)-1] = [float('nan'), pd.Timedelta(0, unit="sec")]

In [144]:
data['data/ec013.40/ec013.719/'][1]

Unnamed: 0,spikes,time
0,1-4,0 days 00:00:00.010550
1,1-12,0 days 00:00:00.013850
2,1-13,0 days 00:00:00.023450
3,1-12,0 days 00:00:00.025050
4,1-12,0 days 00:00:00.031950
5,1-12,0 days 00:00:00.040550
6,1-12,0 days 00:00:00.046850
7,1-1,0 days 00:00:00.050350
8,1-12,0 days 00:00:00.055200
9,1-13,0 days 00:00:00.072700


In [99]:
data['data/ec013.40/ec013.719/'][1].set_index('time')[0].seconds

0

In [108]:
data['data/ec013.40/ec013.719/'][1].set_index('time').resample('S').sum()

Unnamed: 0_level_0,spikes
time,Unnamed: 1_level_1
00:00:00.010550,1-41-121-131-121-121-121-121-11-121-131-11-41-...
00:00:01.010550,1-131-121-11-121-121-121-121-121-121-121-121-1...
00:00:02.010550,1-121-121-121-121-131-41-121-131-121-41-121-12...
00:00:03.010550,1-121-11-121-121-121-121-11-121-11-121-121-121...
00:00:04.010550,1-71-71-41-121-121-121-11-121-121-61-121-121-1...
00:00:05.010550,1-121-121-121-121-121-121-121-121-121-121-81-1...
00:00:06.010550,1-131-121-121-11-121-121-121-121-121-121-121-8...
00:00:07.010550,1-131-121-131-121-121-11-121-131-41-121-121-12...
00:00:08.010550,1-121-121-111-11-121-121-121-11-121-121-41-121...
00:00:09.010550,1-131-121-121-121-121-41-11-121-11-121-121-11-...


In [109]:
b = data['data/ec013.40/ec013.719/'][1].set_index('time')

In [117]:
b.loc[pd.Timedelta(0, unit="sec")] = float('nan')

In [118]:
b

Unnamed: 0_level_0,spikes
time,Unnamed: 1_level_1
0 days 00:00:00.010550,1-4
0 days 00:00:00.013850,1-12
0 days 00:00:00.023450,1-13
0 days 00:00:00.025050,1-12
0 days 00:00:00.031950,1-12
0 days 00:00:00.040550,1-12
0 days 00:00:00.046850,1-12
0 days 00:00:00.050350,1-1
0 days 00:00:00.055200,1-12
0 days 00:00:00.072700,1-13
