# Building a Pandas data frame from Matlab experiment files in the Nauhaus lab.

This code builds a large table in which each row is a different experiment and each column is an experimental variable. The information is extracted from the .analyzer files, typically contained in the folder 'AnalyzerFiles'. A given 'AnalyzerFiles' folder can hold the experimental information for potentially every experiment ever performed in the Nauhaus Lab. At a minimum, the information related to each experiment is what is entered in the GUIs on the stimulus controller: e.g. animal name, screen distance, visual stimulus looping parameters, parameter lists, display type, recording method, etc.

The table that gets built is in the form of a Pandas data frame. Pandas is an analysis tool built on Python. This allows for a ton of flexibility for filtering out different experiments. Examples are given at the end.


In [177]:
import os
import numpy as np
import pandas as pd
from scipy import io
from organizemat import organizemat #Local. Taken from S.O.
from mat2list import mat2list #My function. Interprets MATLAB vector into a Python list.

In [179]:
#Set this to the folder that contains this notebook; it becomes the working directory.
wdir = '/Users/in2293/Desktop/nlab_experiment_finder'
os.chdir(wdir)

#Folder holding the Analyzer files. It may be somewhere else on the computer.
fileloc = '/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles'

#Each sub-directory of fileloc is a separate animal; keep its full path.
subfolders = []
for entry in os.scandir(fileloc):
    if entry.is_dir():
        subfolders.append(entry.path)

print('no animals = ', len(subfolders))

no animals =  373


In [180]:
#This takes ~10 min to run. Alternatively: pd.read_csv('MPLdf')

#Collect one single-row data frame per experiment for each of the 3 GUIs.
#They are concatenated once after the loop: DataFrame.append was removed in
#pandas 2.0, and appending row by row re-copies the whole table every time.
Mrows = []
Prows = []
Lrows = []

unloaded_experiments = []  #paths of the files that could not be loaded

exp_count = 0  #NOTE(review): never updated in this cell; kept in case a later cell reads it

for f in subfolders[0:100]:  #loop each animal (only the first 100 folders)

    print(f)

    for anafile in os.scandir(f):  #loop each experiment from the given animal

        #Other things I tried for loading MATLAB .mat file and the reasons they didn't work:
        #Analyzer = loadmat(anafile.path)  #Could use this instead of the below, but it takes too long.
        #Analyzer = io.loadmat(anafile.path,simplify_cells = True) #Throws error for files containing video object
        try:
            data = io.loadmat(anafile.path, struct_as_record=False, squeeze_me=True)
        except Exception:
            #Sometimes there are irrelevant files saved in each folder.
            #I want to make sure they are irrelevant by storing their name.
            #Catching Exception (not a bare except) keeps Ctrl-C / kernel
            #interrupt working during this long run.
            print('error loading ', anafile.path)
            unloaded_experiments.append(anafile.path) #I want to know what files were not loaded
            continue

        #Remove unnecessary "syncInfo": drop every variable whose name starts with 'sync'.
        #Otherwise it takes forever to parse into a dict below.
        data_copy = {k: v for k, v in data.items() if not k.startswith('sync')}

        #Organize loaded .mat file into something readable.  Ref: StackOverflow.
        Analyzer = organizemat(data_copy)

        #Move header into the M dict
        header = Analyzer['__header__']
        Analyzer = Analyzer['Analyzer']
        Analyzer['M']['header'] = header

        #Experimental parameters are in 3 GUI windows at stimulus-controller:
        M = Analyzer['M'] #Parameters in the "MW" GUI
        L = Analyzer['L'] #Parameters in the "Looper" GUI
        P = Analyzer['P'] #Parameters in the "paramList" GUI

        #With a single looping parameter, L['param'] is that one entry rather than
        #a list of entries, so its first element is the symbol string.
        #Embed it to make it consistent with the N>1 parameter case.
        #The length check avoids indexing into an empty looper.
        if len(L['param']) > 0 and isinstance(L['param'][0], str):
            L['param'] = [L['param']]

        #mat2list converts a matlab vector creation (e.g. '0:45:315', or [0 45]), into a complete Python list.
        #Expanding the vector into a Python list allows for easier querying of the data frame.
        for i, loop_param in enumerate(L['param'], start=1):
            L['paramSymbol' + str(i)] = loop_param[0]
            L['paramValues' + str(i)] = str(mat2list(loop_param[1]))
            L['paramValuesMatlabStr' + str(i)] = loop_param[1]

        L.pop('param') #No longer needed. Redundant.

        #json_normalize turns it into a df, and helps to unpack some fields
        Mseries = pd.json_normalize(M)
        Lseries = pd.json_normalize(L)
        Pseries = pd.json_normalize(P)

        #Pseries needs reformatted for readability:
        #Create a data frame with one row: columns are the parameter symbols, entries are the values.
        #In the parameter table, column 0 is read as the symbol and column 2 as the value.
        param_table = pd.DataFrame(Pseries['param'][0])
        columns = ['module'] + list(param_table.iloc[:, 0])
        values = [Pseries['type'][0]] + list(param_table.iloc[:, 2])  #front entry is the module: e.g. 'PG' for periodic grater
        Pseries = pd.DataFrame(columns=columns)
        Pseries.loc[0] = values

        #Store this experiment:
        Mrows.append(Mseries)
        Lrows.append(Lseries)
        Prows.append(Pseries)

#Build the 3 data frames, one row per loaded experiment, indexed 0..n-1.
#pd.concat raises on an empty list, so fall back to an empty frame.
Mdf = pd.concat(Mrows, ignore_index=True) if Mrows else pd.DataFrame()
Ldf = pd.concat(Lrows, ignore_index=True) if Lrows else pd.DataFrame()
Pdf = pd.concat(Prows, ignore_index=True) if Prows else pd.DataFrame()

        


/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/nr4
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/rf6
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/ri2
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/dr4
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/rj4
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/nq5
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/rj3
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/re7
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/dr3
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/rf8
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/rl8
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/nx3
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/rc2
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/nt8
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFiles/ws5
/Users/in2293/Desktop/nlab_experiment_finder/AnalyzerFi

# Clean up and save the table as a .csv

In [184]:
#Put the M, L and P columns side by side: one wide row per experiment.
frames = [Mdf, Ldf, Pdf]
df = pd.concat(frames, axis=1)

#Copies of a file often exist, so drop rows that repeat the same (anim, expt, WF).
duplicate_key = ['anim', 'expt', 'WF']
df = df.drop_duplicates(subset=duplicate_key)

#Relative file name, so this is written to the working dir.
df.to_csv('all_experiments')

n_saved = df.shape[0]
print(f'saved table with {n_saved} experiments to ', wdir)

saved table with 1332 experiments to  /Users/in2293/Desktop/nlab_experiment_finder


# Create a table of all widefield Kalatsky retinotopy experiments.

Kalatsky experiments have some unique features:
1) The looper only has ori = [0 90 180 270]. \
2) The temporal period is really long, e.g. > 800 frames. \
These two criteria should narrow it down, but I include several other criteria as well, just to make sure.



In [196]:
#Find all experiments where orientation was the only looping variable

#Make sure ori is the only looping parameter.
#A looping parameter that an experiment did not use is NaN in its paramSymbolN
#column, so require paramSymbol1 == 'ori' and every other paramSymbolN to be NaN.
#(Comparing paramSymbol2 != paramSymbol3 etc. does not test this: once
#paramSymbol2 is NaN, that comparison is True for every row.)
extra_symbols = [c for c in df.columns
                 if str(c).startswith('paramSymbol') and c != 'paramSymbol1']
only_ori = (df['paramSymbol1'] == 'ori') & df[extra_symbols].isna().all(axis=1)
Kdf = df[only_ori]

#ori loops through four cardinal directions:
Kdf = Kdf.query("paramValues1 == '[0.0, 90.0, 180.0, 270.0]'")

print(f'n = {Kdf.shape[0]} experiments where ori is only looping variable: [0 90 180 270]')

#The bar is changing slowly
Kdf = Kdf.query('t_period > 800')
print(f'n = {Kdf.shape[0]}')

#The bar is drifting
Kdf = Kdf.query('separable == 0')
print(f'n = {Kdf.shape[0]}')

#The bar drifts over a large part of the screen
Kdf = Kdf.query('x_size > 100').query('y_size > 100')
print(f'n = {Kdf.shape[0]}')

#There is only one bar on the screen
Kdf = Kdf.query('s_freq < 1/80')
print(f'n = {Kdf.shape[0]}')

#Its a narrow bar and not a sinewave
Kdf = Kdf.query("st_profile == 'square'").query("s_duty < 0.4")
print(f'n = {Kdf.shape[0]} Kalatsky experiments')


Kdf = Kdf.query("WF == 1")  #Change this to 'twoP' if you want two-photon Kalatsky
print(f'n = {Kdf.shape[0]} widefield Kalatsky experiments')

#NOTE(review): from here on Kdf holds only the first matching experiment of each
#animal, not every widefield Kalatsky experiment - confirm this is what should be saved.
Kdf = Kdf.drop_duplicates(subset = ['anim'])
print(f'n = {Kdf.shape[0]} animals in which widefield Kalatsky was run')


n = 345 experiments where ori is only looping variable: [0 90 180 270]
n = 251
n = 251
n = 230
n = 227
n = 227 Kalatsky experiments
n = 174 widefield Kalatsky experiments
n = 75 animals in which widefield Kalatsky was run


# Save table of Kalatsky experiments

In [191]:
#Write the Kalatsky table next to the notebook (relative name -> working dir).
n_saved = Kdf.shape[0]
Kdf.to_csv('Kalatsky table')

print(f'saved table with {n_saved} experiments to ', wdir)


saved table with 75 experiments to  /Users/in2293/Desktop/nlab_experiment_finder
