# CRCNS - Collaborative Research in Computational Neuroscience

## Installing requirements

## Available data sets

In [2]:
%%html
<iframe src="https://crcns.org/data-sets" width="100%" height="600"></iframe>

## Setting the environment variables

In [3]:
%%writefile .env
CRCNS_USERNAME=ZeTargino
CRCNS_PASSWORD=s3nh4data

Overwriting .env


In [4]:
import dotenv

In [5]:
dotenv.load_dotenv('.env')

True

## Downloading data

### Auxiliary functions

In [18]:
def download_crcns_datafile(filepath):
    """
    Download a CRCNS datafile from the web.

    The file is requested from the CRCNS download portal using the
    credentials held in the ``CRCNS_USERNAME`` and ``CRCNS_PASSWORD``
    environment variables. It is saved under the local ``data`` directory,
    mirroring ``filepath``, and the local path is printed when the download
    has finished.

    Parameters
    ----------
    filepath : str
        The path to the file on the CRCNS website, with ``/`` as separator.

    Raises
    ------
    RuntimeError
        If either credential environment variable is unset or empty.
    requests.HTTPError
        If the portal answers with an HTTP error status.

    Examples
    --------
    >>> download_crcns_datafile('hc-3/filelist.txt')  # doctest: +SKIP
    data/hc-3/filelist.txt
    """
    import os

    import requests

    CRCNS_URL = 'https://portal.nersc.gov/project/crcns/download/index.php'

    username = os.getenv('CRCNS_USERNAME')
    password = os.getenv('CRCNS_PASSWORD')
    if not username or not password:
        # Fail early: without credentials the form would be posted incomplete.
        raise RuntimeError(
            'CRCNS_USERNAME and CRCNS_PASSWORD must be set in the environment'
        )

    request_data = dict(
        username=username,
        password=password,
        fn=filepath,
        submit='Login',
    )

    # Mirror the remote layout below 'data', creating intermediate folders
    # (e.g. 'data/hc-3') so that opening the target file cannot fail on them.
    local_filename = os.path.join('data', os.path.join(*filepath.split('/')))
    os.makedirs(os.path.dirname(local_filename), exist_ok=True)

    with requests.Session() as session:
        with session.post(CRCNS_URL, data=request_data, stream=True) as response:
            # Check the status before touching the disk so an HTTP error
            # is never saved as if it were the requested file.
            response.raise_for_status()
            with open(local_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        file.write(chunk)
    print(local_filename)


### Building the files dataframe from the file list

In [7]:
# Identifier of the CRCNS dataset this notebook works on; it is also used as
# the name of the dataset folder under 'data'.
CRCNS_DATASET = 'hc-3'

In [8]:
import os

In [9]:
# Local folder that holds everything belonging to the selected dataset.
dataset_directory = os.path.join('data',CRCNS_DATASET)

In [19]:
# Fetch the dataset's file listing; the remote path is derived from
# CRCNS_DATASET so that switching datasets only needs one change.
download_crcns_datafile(f'{CRCNS_DATASET}/filelist.txt')

data/hc-3/filelist.txt


In [10]:
# Location of the dataset's 'filelist.txt' inside the local dataset folder.
dataset_filelist_path = os.path.join(dataset_directory,'filelist.txt')

In [10]:
# Load the listing and break it into lines without their line terminators.
with open(dataset_filelist_path, mode='r') as listing:
    listing_text = listing.read()
filelist = listing_text.splitlines()

In [11]:
# Make sure every line starts with '#' or a space: lines with neither get a
# leading space so that all entries share the same '<marker> <path>' layout.
filelist = [
    line if line.startswith(('#', ' ')) else ' ' + line
    for line in filelist
]

In [12]:
# File entries follow the five header lines; a tab separates the (marked)
# path from the size field.
entry_fields = (line.split('\t') for line in filelist[5:])
marked_files, size = zip(*entry_fields)

In [13]:
# The first whitespace-separated token of each size field is the byte count.
size_bytes = [int(size_field.split(maxsplit=1)[0]) for size_field in size]

In [14]:
import re

In [15]:
# Take the human-readable size (e.g. '34.8 KB') from between the parentheses
# of each size field. The pattern is a raw string so that the regex escapes
# are not parsed as (invalid) string escape sequences.
size_pattern = re.compile(r'\(([0-9]*\.?[0-9]* [a-zA-Z]+)\)')
size_human = [size_pattern.findall(s)[0] for s in size]

In [16]:
# Every entry reads '<marker> <path>', where the marker is '' (line starts
# with a space) or '#'. Split on the first space only, so a path that itself
# contains spaces stays in one piece.
mark, file_path = zip(*[mfile.split(' ', 1) for mfile in marked_files])
# True for entries that are not commented out.
mark = [marker == '' for marker in mark]

In [17]:
import pathlib as pl

In [18]:
# Break every listed path into parent folder, base name and extension.
file_path = [pl.Path(f) for f in file_path]
file_parents = [f.parent for f in file_path]

file_names = []
file_extension = []
for f in file_path:
    # The extension is the trailing run of non-numeric suffixes, so that
    # 'ec013.544.mpg.tar.gz' yields 'ec013.544' + '.mpg.tar.gz'. Stopping at
    # the first numeric suffix (seen from the end) guarantees that
    # name + extension always rebuilds the original file name.
    trailing_suffixes = []
    for suffix in reversed(f.suffixes):
        if suffix[1:].isdecimal():
            break
        trailing_suffixes.append(suffix)
    extension = ''.join(reversed(trailing_suffixes))
    file_extension.append(extension)
    # Strip the extension from the end only (never from inside the name).
    file_names.append(f.name[:len(f.name) - len(extension)])

In [19]:
# An entry counts as downloaded when either its bare name or its full file
# name (name + extension) exists inside the local dataset folder.
entry_bases = [
    os.path.join('./data', CRCNS_DATASET, parent, name)
    for parent, name in zip(file_parents, file_names)
]
file_downloaded = [
    any(map(os.path.exists, (base, base + extension)))
    for base, extension in zip(entry_bases, file_extension)
]
        

In [20]:
import pandas as pd

In [21]:
# Gather the per-file columns into one table, ordered from the smallest to
# the largest file.
file_columns = dict(
    downloaded=file_downloaded,
    marked=mark,
    parent=file_parents,
    name=file_names,
    extension=file_extension,
    size=size_human,
    bytes=size_bytes,
)
file_dataframe = pd.DataFrame(file_columns).sort_values(by='bytes')

In [22]:
file_dataframe

Unnamed: 0,downloaded,marked,parent,name,extension,size,bytes
0,False,True,.,checksums,.md5,34.8 KB,35709
1,False,True,docs,crcns-hc3-channelorder,.zip,51.7 KB,52968
2,True,False,docs,crcns-hc3-data-description,.pdf,876.1 KB,897194
3,False,True,docs,crcns-hc3-metadata-tables,.zip,1.0 MB,1149865
4,False,True,docs,crcns-hc3-original-docs,.zip,1.7 MB,1844997
...,...,...,...,...,...,...,...
565,False,False,ec014.28,ec014.440,.tar.gz,5.0 GB,5423925950
566,False,False,ec014.n329,2007-3-29_20-14-02,.tar.gz,5.1 GB,5546934699
567,False,False,ec014.21,ec014.277,.tar.gz,5.6 GB,6111795124
568,False,False,ec014.n329,2007-3-29_17-42-25,.tar.gz,6.8 GB,7355408814


In [23]:
file_dataframe.sample(3).T

Unnamed: 0,106,73,105
downloaded,False,False,False
marked,False,True,False
parent,ec012ec.26,ec013.29,ec016.56
name,ec012ec.532,ec013.435,ec016.965
extension,.tar.gz,.mpg.tar.gz,.tar.gz
size,218.0 MB,179.0 MB,217.2 MB
bytes,228657031,187755986,227821770


In [24]:
import numpy as np

In [25]:
# Running total of the file sizes, converted from bytes to GiB.
file_size = file_dataframe['bytes'].cumsum() / 1024**3

In [26]:
file_size.values[-1]

401.8134672753513

In [27]:
# Mark a row for as long as the running total (in GiB) stays below 10.
within_budget = file_size.to_numpy() < 10
file_dataframe['marked'] = within_budget

In [28]:
file_dataframe

Unnamed: 0,downloaded,marked,parent,name,extension,size,bytes
0,False,True,.,checksums,.md5,34.8 KB,35709
1,False,True,docs,crcns-hc3-channelorder,.zip,51.7 KB,52968
2,True,True,docs,crcns-hc3-data-description,.pdf,876.1 KB,897194
3,False,True,docs,crcns-hc3-metadata-tables,.zip,1.0 MB,1149865
4,False,True,docs,crcns-hc3-original-docs,.zip,1.7 MB,1844997
...,...,...,...,...,...,...,...
565,False,False,ec014.28,ec014.440,.tar.gz,5.0 GB,5423925950
566,False,False,ec014.n329,2007-3-29_20-14-02,.tar.gz,5.1 GB,5546934699
567,False,False,ec014.21,ec014.277,.tar.gz,5.6 GB,6111795124
568,False,False,ec014.n329,2007-3-29_17-42-25,.tar.gz,6.8 GB,7355408814


In [29]:
# Rewrite the file list: the five header lines first, then one line per file.
# A file stays active (leading space) only when it is marked and not yet
# downloaded; every other file is commented out with '# '.
with open(dataset_filelist_path, 'w') as file:
    file.write('\n'.join(filelist[:5]))
    for entry in file_dataframe.itertuples(index=False):
        prefix = ' ' if entry.marked and not entry.downloaded else '# '
        relative_path = os.path.join(entry.parent, entry.name + entry.extension)
        # Same layout as the original listing: '<path>\t<bytes> (<size>)'.
        file.write(f'\n{prefix}{relative_path}\t{entry.bytes} ({entry.size})')
    

In [30]:
filelist

["# CRCNS.org 'hc-3' dataset files",
 '# To use this for fetching files, comment out files that are',
 "# not needed by putting a '#' as the first character on the line (default mode)",
 "# or put a '+' as the first character to specifiy files to be fetched (+ mode).",
 "# mode='default' (change to '+' for + mode).",
 ' ./checksums.md5\t35709 (34.8 KB)',
 ' docs/crcns-hc3-channelorder.zip\t52968 (51.7 KB)',
 '# docs/crcns-hc3-data-description.pdf\t897194 (876.1 KB)',
 ' docs/crcns-hc3-metadata-tables.zip\t1149865 (1.0 MB)',
 ' docs/crcns-hc3-original-docs.zip\t1844997 (1.7 MB)',
 '# docs/crcns-hc3-data-description.docx\t3324528 (3.1 MB)',
 ' ec013.33/ec013.544.mpg.tar.gz\t3708367 (3.5 MB)',
 ' ec013.33/ec013.544.tar.gz\t13084214 (12.4 MB)',
 ' ec012ec.20/ec012ec.409.tar.gz\t20406661 (19.4 MB)',
 ' ec013.33/ec013.543.mpg.tar.gz\t22781426 (21.7 MB)',
 ' docs/additional_whl_files.tar.gz\t23309790 (22.2 MB)',
 ' ec013.33/ec013.541.mpg.tar.gz\t24620611 (23.4 MB)',
 ' ec016.57/ec016.986.mpg.

In [31]:
# Keep only the rows flagged as downloaded, then display them.
is_downloaded = file_dataframe['downloaded']
file_downloaded = file_dataframe.loc[is_downloaded]
file_downloaded

Unnamed: 0,downloaded,marked,parent,name,extension,size,bytes
2,True,True,docs,crcns-hc3-data-description,.pdf,876.1 KB,897194
5,True,True,docs,crcns-hc3-data-description,.docx,3.1 MB,3324528
22,True,True,ec012ec.16,ec012ec.311,.tar.gz,84.9 MB,89064580
67,True,True,ec013.15,ec013.157,.tar.gz,173.5 MB,182005826
149,True,False,ec012ec.11,ec012ec.189,.tar.gz,271.2 MB,284447276
317,True,False,ec013.18,ec013.198,.tar.gz,470.9 MB,493779544


## Extracting files

In [33]:
import zipfile

In [34]:
import tarfile

In [35]:
# When True, each archive is deleted from disk right after it is extracted.
extract_and_remove = False

In [36]:
# Unpack every downloaded archive into the dataset directory.
# NOTE(review): archives are extracted into the dataset root, not into the
# entry's parent folder - confirm this matches the layout inside the archives.

# Use tarfile's 'data' extraction filter when this Python version offers it,
# so that archive members cannot be written outside the target directory.
tar_extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

for entry in file_downloaded.itertuples(index=False):
    compact_filepath = os.path.join(dataset_directory, entry.parent, entry.name + entry.extension)
    if not os.path.isfile(compact_filepath):
        continue
    # Match on the end of the extension so compound ones such as
    # '.mpg.tar.gz' are handled as well.
    if entry.extension.endswith('.zip'):
        with zipfile.ZipFile(compact_filepath, 'r') as zip_ref:
            zip_ref.extractall(dataset_directory)
    elif entry.extension.endswith('.tar.gz'):
        with tarfile.open(compact_filepath, 'r:gz') as tar_ref:
            tar_ref.extractall(dataset_directory, **tar_extract_options)
    else:
        # Not an archive type handled here (e.g. '.pdf'): leave it alone.
        continue
    if extract_and_remove:
        os.remove(compact_filepath)
            