# Find New TXTs

This notebook finds the TXT files in the `rawtxts` folder that have not been processed yet, prints their names, and moves the TXT files into their own individual folders.

In [1]:
# Save the notebook as a .py, and run on HPC
# Step 1: convert findnewtxt.ipynb to a script written into ./pys, leaving out
#         every cell that carries the "note" tag.
!jupyter nbconvert --to script findnewtxt.ipynb --TagRemovePreprocessor.remove_cell_tags='{"note"}' --output-dir ./pys
# Step 2: hand FindNewTXT.sh to the scheduler with a 20G memory request.
# NOTE(review): presumably FindNewTXT.sh runs the script exported in step 1 - confirm.
!sbatch --mem=20G /mnt/isilon/marsh_single_unit/MarshMountainSort/FindNewTXT.sh
# Printed once the two commands above have returned.
!echo "COMPLETE"

Submitted batch job 23417168
COMPLETE


In [11]:
# ---------------------------------------------------------------------------
# Run switches. Each flag gates one stage further down in this notebook.
# ---------------------------------------------------------------------------

# True -> create a folder per raw .TXT and move each loose .TXT into its folder
PUTINDIRECTORY = True

# True -> also list the files that are already converted.
# Leave False when there are too many converted/outputted files to read through.
SHOWCONVERTED = False

# True -> also list the files that already have figures generated.
# Leave False when there are too many files to read through.
SHOWOUTPUTTED = False

# True -> compress every .BIN file that is present
COMPRESSBINS = False

# True -> move output files into subdirectories (this can also be done by hand)
MOVEOUTPUT = False

# True -> create a new Turn Depth .XLSX
NEWTURNDEPTHS = False

In [12]:
import glob
import shutil
import os
import gzip
import numpy as np
from pathlib import Path
import pandas as pd
from datetime import date

## Define Constants

In [13]:
# Root of the project tree; every other location below is built from it
base_folder_path = Path('/mnt/isilon/marsh_single_unit/MarshMountainSort')

# Sub-directory names for raw text files, binaries and outputs
raw_txts_folder, binary_folder, output_folder = 'rawtxts', 'bins', 'output'

# Tetrode region names
tetrode_names = ['ca3', 'ca1s', 'ca1o']
# One inner list of aliases per genotype; the first alias is the label that
# the turn-depth cell writes into its 'Genotype' column
genotypes = [['WT', 'Bl6'], ['Exp', 'EXP']]
# Workbook file name, resolved against base_folder_path when it is written
turndepth_xlsx = 'fileturndepths.xlsx'

txt_folder_path = base_folder_path.joinpath(raw_txts_folder)
bin_folder_path = base_folder_path.joinpath(binary_folder)
output_folder_path = base_folder_path.joinpath(output_folder)

# Echo the resolved locations, one per line
print(txt_folder_path, bin_folder_path, output_folder_path, sep='\n')

/mnt/isilon/marsh_single_unit/MarshMountainSort/rawtxts
/mnt/isilon/marsh_single_unit/MarshMountainSort/bins
/mnt/isilon/marsh_single_unit/MarshMountainSort/output


## Find Unconverted .TXTs

In [14]:
# Survey the raw .TXT files and report the ones that have no converted
# counterpart (.bin or .npy.gz) in the binaries folder.

# Find all files
txt_file_list = glob.glob(f'{txt_folder_path}/*.txt')          # loose TXTs
txt_in_folder_list = glob.glob(f'{txt_folder_path}/*/*.txt')   # TXTs already in a folder
bin_file_list = glob.glob(f'{bin_folder_path}/*/*.bin')
gzip_file_list = glob.glob(f'{bin_folder_path}/*/*.npy.gz')

# Reduce every path to the bare recording name.
# Directory names are taken with .name (not .stem) and the '.npy.gz' suffix is
# sliced off explicitly, so a recording name that itself contains a '.' is not
# truncated and still matches the stem of its .bin file.
gzip_suffix = '.npy.gz'
txt_file_names = [Path(e).stem for e in txt_file_list]
txt_in_folder_names = [Path(e).parent.name for e in txt_in_folder_list]
bin_file_names = [Path(e).stem for e in bin_file_list]
gzip_file_names = [Path(e).name[:-len(gzip_suffix)] for e in gzip_file_list]

# Built once, so each membership test below is a set lookup instead of a fresh
# list concatenation followed by a linear scan
converted_names = set(bin_file_names) | set(gzip_file_names)


def _report_unconverted(names, converted):
    """Print each name that is missing from `converted`.

    Names that are present in `converted` are printed with a [CONVERTED] tag,
    but only when SHOWCONVERTED is set.
    """
    for name in names:
        if name not in converted:
            print(f'"{name}",')
        elif SHOWCONVERTED:
            print(f'\t\t[CONVERTED] {name}')


# Print out files
print('===UNCONVERTED TXT FILES===')

print('NOT IN FOLDER:')
_report_unconverted(txt_file_names, converted_names)
print()
print('IN FOLDER:')
_report_unconverted(txt_in_folder_names, converted_names)

# 1185_Exp_4 half turns + 3-8th of a full turn_after turning_1-27-17

===UNCONVERTED TXT FILES===
NOT IN FOLDER:

IN FOLDER:


## Move .TXTs into Folders

In [15]:
# Put exposed TXT files into individual directories:
#   rawtxts/<name>.txt  ->  rawtxts/<name>/<name>.txt
if PUTINDIRECTORY:
    for e in txt_file_names:
        opath = txt_folder_path / f'{e}.txt'        # current (loose) location
        fpath = txt_folder_path / e / f'{e}.txt'    # target inside its own folder

        # txt_file_names was collected by an earlier cell, so on a re-run the
        # file may already have been moved; skip it instead of raising.
        if not opath.is_file():
            continue

        # Never replace a TXT that already sits in the target folder.
        if fpath.exists():
            print(f'\tCONFLICT : {fpath} already exists, not moving {opath.name}')
            continue

        fpath.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(opath, fpath)

## Compress .BINs

In [16]:
# Write a gzip-compressed .npy copy next to every .bin file that lacks one.
if COMPRESSBINS:
    for e in bin_file_list:
        fcomp_name = Path(e).parent / f"{Path(e).stem}.npy.gz"

        # If the compressed file exists, skip
        if fcomp_name.is_file():
            continue

        # The .bin content is read as a flat float32 array
        bin_arr = np.fromfile(e, dtype=np.float32)

        # Write to a temporary name and rename only after the archive is
        # complete. An interrupted run therefore never leaves a truncated
        # .npy.gz behind that the "exists -> skip" check above would accept.
        ftmp_name = fcomp_name.with_name(fcomp_name.name + '.tmp')
        try:
            with gzip.GzipFile(ftmp_name, "w") as fcomp:
                np.save(file=fcomp, arr=bin_arr)
            os.replace(ftmp_name, fcomp_name)
        finally:
            # Only still present when the write or the rename failed
            if ftmp_name.exists():
                ftmp_name.unlink()

        print(f"Compressed: {fcomp_name.name}")


## Find Unoutputted .BINs

In [17]:
# Find unoutputted binaries and print them out: every entry under the binaries
# folder that has no entry of the same name under the output folder.
# Entries are compared by their full name (.name); .stem would cut everything
# after the last '.' from a directory name.
bin_folder_list = glob.glob(f'{bin_folder_path}/*')
bin_folder_names = [Path(e).name for e in bin_folder_list]

conv_folder_parent_list = glob.glob(f'{output_folder_path}/*')
conv_folder_parent_names = [Path(e).name for e in conv_folder_parent_list]

# Set for constant-time membership tests in the loop below
outputted_names = set(conv_folder_parent_names)

print('===UNOUTPUTTED BINARY FILES===')
for e in bin_folder_names:
    if e not in outputted_names:
        print(f'"{e}", ')


===UNOUTPUTTED BINARY FILES===
"1238_Exp_4 half turn after turning_5_4_17", 
"1244_Bl6_Final Protocol_6th half turn_recorded after turning_Day1 morning 7-03-2017", 
"1238_Expanded_2 half turns_after turning_05-02-17", 
"1269_EXP_Final Protocol 6th half turn Morning_recorded after recovery_8-2-2017", 
"1269_EXP_5th half turn_recorded after recovery_8-1-2017", 
"766_WT_Final Protocol Night 6th half turn+2_4_recorded after turning_7-31-2017", 
"766_WT_5th half turn_recorded after turning_7-26-2017", 
"1238_Exp_3 half turn after turning_5_3_17", 
"766_WT_Final Protocol Night 6th half turn +1_4 _recorded after turning_7-28-2017", 
"1236_Exp_6 half turn after turning_Day1 night 5_5_17", 
"766_WT_3rd half turn_recorded after turning_7-24-2017", 
"1236_Exp_4 half turn after turning_5_4_17", 
"766_WT_Final Protocol Morning 6th half turn_recorded after turning_7-27-2017", 
"1236_Exp_3 half turn after turning_5_3_17", 
"1244_Bl6_3rd half turn_recorded after turning_ 6-30-2017", 
"1269_EXP_2nd hal

## Move Outputs into Folders

In [8]:
# Sort loose per-region output files into <output>/<recording>/<region>/.
# For each recording and each tetrode region:
#   - region folder exists, is not empty, and loose region files are present
#       -> report a conflict and leave everything where it is
#   - region folder is missing and loose region files are present
#       -> create the folder
#   - then move every loose region file into the region folder

if MOVEOUTPUT:
    print('===OUTPUT FOLDERS===')

    for recording in conv_folder_parent_names:
        print(recording)
        recording_dir = output_folder_path / recording

        for idx, region in enumerate(tetrode_names):
            region_folder_path = recording_dir / region

            # Loose files sitting next to the region folder: "<region>*.*"
            region_files = glob.glob(f'{recording_dir}/{region}*.*')
            region_present = bool(region_files)

            if not os.path.exists(region_folder_path):
                if region_present:
                    os.makedirs(region_folder_path)
            elif len(os.listdir(region_folder_path)) != 0 and region_present:
                print(f'\tCONFLICT {idx} : {region} folder not empty')
                continue

            for loose_file in region_files:
                destination = region_folder_path / Path(loose_file).name
                print(f'\tMOVED FILE : {loose_file}')
                shutil.move(loose_file, destination)
                
            

## Find Unoutputted Tetrode .BINs

In [9]:
# Report which tetrodes have not been processed, based on presence of /ca3 /ca1s /ca1o subfolders
# (a "<region>_NO_UNITS" subfolder also counts as processed).
conv_folder_list = glob.glob(f'{output_folder_path}/*/*')
# .name keeps the full recording folder name; .stem would drop everything after
# the last '.', which disagrees with the full-path checks made below.
conv_folder_names = [Path(e).parent.name for e in conv_folder_list]

# Lookup sets built once, instead of concatenating and scanning lists on every
# loop iteration
known_txt_names = set(txt_file_names) | set(txt_in_folder_names)
outputted_names = set(conv_folder_names)
conv_folder_set = set(conv_folder_list)

print('===UNPROCESSED TETRODE BINARY FILES===')
for e in bin_file_names:
    # Only binaries whose name matches a known TXT file are considered
    if e not in known_txt_names:
        continue

    # Nothing at all under output/<name>/ -> just print the name
    if e not in outputted_names:
        print(e)
        continue

    hasall = True
    msg = ''
    for i, region in enumerate(tetrode_names):
        region_dir = f'{output_folder_path}/{e}/{region}'
        if region_dir in conv_folder_set or f'{region_dir}_NO_UNITS' in conv_folder_set:
            if SHOWOUTPUTTED:
                msg += f'\t\tOK {i} : {region}\n'
        else:
            msg += f'\t\tMISSING OUTPUT {i} : {region}\n'
            hasall = False

    # Prepend the headline; complete recordings are shown only on request
    if not hasall:
        msg = f'\t[INCOMPLETE] {e}\n' + msg
    elif SHOWOUTPUTTED:
        msg = f'\t[COMPLETE] {e}\n' + msg

    if msg != '':
        print(msg)

===UNPROCESSED TETRODE BINARY FILES===


## Make Turn Depth .XLSX

In [10]:
# Build a table (Filename, ID, Genotype, Halfturns) from the compressed binary
# names and write it to a sheet named after today's date in the turn-depth
# workbook.
if NEWTURNDEPTHS:
    # sorted() returns a copy, so the list shared with the earlier cells is not
    # reordered as a side effect of running this cell.
    gzip_file_names_sorted = sorted(gzip_file_names)
    # Drop every name that contains a tetrode region name
    gzip_file_names_noreg = [file for file in gzip_file_names_sorted if not any(reg in file for reg in tetrode_names)]

    gzip_file_ids = []
    gzip_file_genotypes = []

    for file in gzip_file_names_noreg:
        gzip_file_ids.append(file.split('_')[0])  # Number before first '_' = ID
        for geno_aliases in genotypes:
            if any(alias in file for alias in geno_aliases):
                # The first alias of the group is used as the genotype label
                gzip_file_genotypes.append(geno_aliases[0])
                break
        else:
            # No alias of any genotype occurs in the file name
            raise ValueError(f"No valid genotype label for {file}")

    df = pd.DataFrame({
        'Filename': gzip_file_names_noreg,
        'ID': gzip_file_ids,
        'Genotype': gzip_file_genotypes,
        'Halfturns': np.nan,  # scalar is broadcast: the column is NaN in every row
    })
    # sort_values returns a new frame; without the assignment the sort is lost.
    # 'ID' holds strings, so the order is lexicographic ('1132' before '766').
    df = df.sort_values(['ID', 'Genotype'])
    print(df)

    xlsx_path = base_folder_path / turndepth_xlsx
    print(xlsx_path)

    today = date.today()
    today = today.strftime("%m-%d-%Y")

    # Append a new sheet when the workbook exists; otherwise create it.
    # mode='a' raises on a missing file, and if_sheet_exists is only accepted
    # in append mode. Appending needs openpyxl, so the engine is named
    # explicitly for both cases.
    if xlsx_path.is_file():
        writer_options = dict(mode='a', if_sheet_exists='new')
    else:
        writer_options = dict(mode='w')

    with pd.ExcelWriter(xlsx_path, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, index=False, sheet_name=today)
    


    

    # reg_file_list = [f for f in gzip_file_list if f'{region}.npy.gz' in f]
    # gzip_folder_names = [f for f in gzip_file_names]
    # reg_file_names = [Path(e).name.partition('.')[0] for e in reg_file_list]
    

                                              Filename    ID Genotype  \
0      1132_WT Bl6_7 half turns_after turning_01-23-17  1132       WT   
1    1133_Exp_5 half turns + 1-4th of full turn_aft...  1133      Exp   
2    1133_Exp_5 half turns + 1-4th of full turn_aft...  1133      Exp   
3    1133_Exp_5 half turns + 1-4th of full turn_aft...  1133      Exp   
4    1133_Exp_5 half turns + 1-4th of full turn_rig...  1133      Exp   
..                                                 ...   ...      ...   
191  766_WT_Final Protocol Morning 6th half turn+3_...   766       WT   
192  766_WT_Final Protocol Morning 6th half turn_re...   766       WT   
193  766_WT_Final Protocol Night 6th half turn +1_4...   766       WT   
194  766_WT_Final Protocol Night 6th half turn+2_4_...   766       WT   
195  766_WT_Final Protocol Night 6th half turn_reco...   766       WT   

     Halfturns  
0          NaN  
1          NaN  
2          NaN  
3          NaN  
4          NaN  
..         ...  
191 