# Two automated ways to organize files: folder organizer after a big project 

Have you ever finished coding a project and been left with 50 or more files in a single folder that you need to organize?  It can be extremely time-consuming to check which files are necessary for the project, which files are not referenced by other files, and whether any Git errors exist.  If you use the Git command line and do not commit the files correctly, Git will write conflict markers (such as `<<<<<<< HEAD`) into your code files so that you can correct the conflicting sections.  Sometimes it can be difficult to identify which file or files are the problem from the Git interface.

<img src="toolbox.png" alt="Drawing" style="width: 200px;"/>

## 1) Get a list of file names and folder names

In [7]:
import os
import numpy as np

import nltk
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

path = "C:\\Users"

# Get file names: keep only Python source files, because the reference scan
# further down re-opens every collected name as "<name>.py".
files = os.listdir(path)
all_fn = []
for f in files:
    # splitext removes only the final extension, so "a.b.py" stays "a.b"
    # and a dot-file such as ".gitignore" has no extension at all.
    fn, ext = os.path.splitext(f)

    # Skip everything that is not a .py file (folders, data files, dot-files)
    # and dunder modules such as __init__.py.
    if ext == '.py' and not fn.startswith('__'):
        all_fn.append(fn)
print('FILE NAMES: ', all_fn)


# Get folder names
dir_names = []
root_out = None  # top-level folder, i.e. the first root reported by os.walk
for root, directories, _files in os.walk(path):
    # os.walk yields the starting folder first; remember it exactly once.
    # (The original counter used "c =+ 1", which assigns +1 instead of
    # incrementing; a sentinel avoids the counter altogether.)
    if root_out is None:
        root_out = root

    # Collect sub-folder names at every depth, skipping dunder folders
    # such as __pycache__.
    for name in directories:
        if not name.startswith('__'):
            dir_names.append(name)
print('FOLDER NAMES: ', dir_names)

FILE NAMES:  ['check_axes_assignmentPLOT', 'compare_condi', 'confidence_interval', 'create_labels_and_initial_feature', 'cut_initial_trials', 'datadriven_FRT_vs_expmatFRT', 'detectFB_LR_trials', 'detectUD_trials', 'detectUD_trials_cabindetect', 'detect_bad_trials_rot', 'detect_bad_trials_trans', 'detect_jumps_in_data', 'detect_jumps_in_index_vector', 'detect_jumps_in_index_vector_simple', 'detect_nonconsecutive_values', 'detect_sig_change_wrt_baseline', 'detect_vertically_short_FBLR', 'equalized_signal_len', 'explode_without_colnames1', 'explode_without_colnames2', 'filter_sig3axes', 'findall', 'find_signchange_w_window', 'freqresp_functions', 'freq_from_sig_freqresp', 'freq_from_sig_timecounting', 'full_sig_2_cell', 'generate_joy_move_sign', 'index_vector', 'interpretation_of_kstest', 'main_preprocessing_steps', 'make_a_properlist', 'make_new_vec_w_index_vec', 'my_dropna_python', 'normal_distribution_feature_data', 'numderiv', 'pad_data_2makeclasses_equivalent', 'plot_count2subplot', 

## 2) Load subfunctions that we will use

In [8]:
def make_a_properlist(vec):
    """Flatten a sequence of scalars, lists or arrays into one flat list.

    Parameters
    ----------
    vec : iterable
        Items to flatten. Each item is passed through ``np.ravel``, so it
        may itself be a scalar, a (nested) list or an array.

    Returns
    -------
    list
        All elements in their original order, converted to plain Python
        objects with ``ndarray.tolist``. An empty ``vec`` returns ``[]``.
    """
    import numpy as np

    pieces = [np.ravel(item) for item in vec]
    if not pieces:
        # np.concatenate raises ValueError when given no arrays at all.
        return []

    return np.concatenate(pieces).ravel().tolist()

In [9]:
def preprocessing(text):
    """Turn raw text into a filtered list of word tokens.

    Parameters
    ----------
    text : iterable of str
        Pieces of text (for example the lines returned by ``readlines``).

    Returns
    -------
    list
        The tokens produced by ``nltk.word_tokenize`` after they have been
        filtered by ``remove_stopwords``.
    """
    # 1. Merge all pieces into one string, dropping apostrophes on the way
    joined = "".join(piece.replace("'", '') for piece in text)

    # 2. Split the merged string into individual tokens
    tokens = nltk.word_tokenize(joined)

    # 3. Drop the tokens listed in the custom stop list
    return remove_stopwords(tokens)

In [10]:
def remove_stopwords(wordtokens):
    """Filter unwanted tokens out of a token list.

    A token is dropped when it appears in the custom stop list below or
    when it is shorter than two characters.

    Parameters
    ----------
    wordtokens : sequence of str
        Tokens to filter.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    # Custom stop list: add any words (a name, a location, etc.) that
    # should not be processed.
    list_to_remove = ["\n"]

    return [
        token
        for token in wordtokens
        if token not in list_to_remove and len(token) > 1
    ]

In [11]:
def get_word_count_uniquewords(word_tokens):
    """Count how often each word occurs and rank the words by frequency.

    The tokens are counted with scikit-learn's ``CountVectorizer`` using its
    default settings, so the vectorizer's own tokenization applies to every
    entry of ``word_tokens``. Words of four characters or fewer are
    discarded before ranking.

    Parameters
    ----------
    word_tokens : list of str
        Tokens to count; each entry is handed to the vectorizer as one
        document.

    Returns
    -------
    wc : list
        Occurrence counts, sorted from most to least frequent.
    keywords : list
        The words matching ``wc``, in the same order.
    mat_sort : numpy.ndarray
        String array of shape ``(len(keywords), 2)`` holding
        ``[count, word]`` rows in the same order. It is also printed.
    """
    vectorizer = CountVectorizer()

    # -------------------------------------
    # 1. Build the document-term matrix. This has to happen first because
    #    fitting is what creates the vectorizer's vocabulary.
    X = vectorizer.fit_transform(word_tokens)
    word_count0 = np.ravel(np.sum(X, axis=0))  # column sums: total count per word

    # 2. Vocabulary, in the same column order as the counts.
    #    get_feature_names() was removed from newer scikit-learn releases in
    #    favour of get_feature_names_out(); fall back for old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        unique_words0 = np.ravel(vectorizer.get_feature_names_out())
    else:
        unique_words0 = np.ravel(vectorizer.get_feature_names())

    # -------------------------------------
    # 3. Remove undesirable words and keep the count list aligned.
    #    Only words longer than 4 characters are kept.
    list_to_remove = [""]

    unique_words = []
    word_count = []
    for word, count in zip(unique_words0, word_count0):
        if word not in list_to_remove and len(word) > 4:
            unique_words.append(word)
            word_count.append(count)

    print('There are ' + str(len(word_tokens)) + ' word tokens, but ' + str(len(unique_words)) + ' words are unique.')

    # -------------------------------------
    # 4. Rank from most to least frequent: argsort is ascending, so reverse it
    order = np.argsort(word_count)[::-1]

    keywords = [unique_words[k] for k in order]
    wc = [word_count[k] for k in order]
    top_words = len(keywords)

    # Matrix of [count, word] rows. The counts are converted to strings
    # explicitly so both columns share one string dtype.
    mat_sort = np.concatenate(
        [
            np.reshape(np.array(wc, dtype=str), (top_words, 1)),
            np.reshape(np.array(keywords, dtype=str), (top_words, 1)),
        ],
        axis=1,
    )
    print(mat_sort)
    # -------------------------------------

    return wc, keywords, mat_sort

##  3) Figure out which files reference other files
Open each file, tokenize its text, and scan the first 50 word tokens (roughly the import section at the top of the file) to obtain Python reference information.

In [29]:
# NOTE: despite its name, this limit is applied to word tokens, not to lines:
# only the first `num_of_lines2read` tokens of every file are inspected.
num_of_lines2read = 50

ref_filelist = []   # one list of referenced names per scanned file
HEAD_filelist = []  # files whose inspected tokens contain 'HEAD'

for i in all_fn:
    # os.path.join picks the right separator for the current platform
    # instead of a hard-coded backslash.
    filename = os.path.join(root_out, i + '.py')
    with open(filename, encoding='utf8', errors="surrogateescape") as cvinfo:
        cvtext = cvinfo.readlines()

    word_tokensCV = preprocessing(cvtext)

    # ------------------------------
    # find files that should not be in the folder
    # Physical files that are not referenced - are either main files OR not needed
    # ------------------------------
    # Collect every token that sits between a 'from' and the next 'import',
    # i.e. the module part of "from <module> import <names>".
    d = 0  # 1 while we are between 'from' and 'import'
    ref_filename = []
    for j in word_tokensCV[:num_of_lines2read]:
        if d == 1:
            if j == 'import':
                # end of the module name
                d = 0
            else:
                ref_filename.append(j)
        elif j == 'from':
            # start of a module name: reference to handmade functions
            d = 1
        elif j == 'HEAD':
            # 'HEAD' is part of the conflict marker Git writes into a file;
            # record each affected file only once.
            if i not in HEAD_filelist:
                HEAD_filelist.append(i)

    ref_filelist.append(ref_filename)

print('HEAD_filelist: ', HEAD_filelist)

HEAD_filelist:  ['detect_jumps_in_data']


### Solution 1: 
Look at the HEAD_filelist to know which files need to be corrected using git commandline!

## 4) Tokenize the reference list
Parse the reference list further so that we can compare it with the file name list and determine which files in the folder have been referenced.  If a file has been referenced, it is a necessary file for the project.  If a file has not been referenced, it is either a MAIN file or an UNNEEDED file, and you need to decide manually which one it is.

In [31]:
# Flatten the per-file reference lists into one flat list of strings
out = make_a_properlist(ref_filelist)

# Dotted names such as "package.module" are split into their parts:
# turn every dot into a space, then split each string on whitespace
out2 = [entry.replace(".", ' ') for entry in out]
out3 = list(map(str.split, out2))

# Flatten again, because splitting produced one list per string
out4 = make_a_properlist(out3)

wc, keywords, mat_sort = get_word_count_uniquewords(out4)

There are 285 word tokens, but 62 words are unique.
[['75' 'subfunctions']
 ['21' 'make_a_properlist']
 ['14' 'plotly']
 ['14' 'subplots']
 ['9' 'scipy']
 ['8' 'statistics']
 ['7' 'findall']
 ['6' 'detect_sig_change_wrt_baseline']
 ['3' 'signal']
 ['3' 'detect_jumps_in_data']
 ['3' 'itertools']
 ['3' 'numderiv']
 ['3' 'preprocessing']
 ['3' 'joystick']
 ['3' 'standarization_check_if_joy_moved']
 ['3' 'compare_condi']
 ['3' 'two_sample_stats']
 ['3' 'vector']
 ['2' 'detect_jumps_in_index_vector']
 ['2' 'confidence_interval']
 ['2' 'collections']
 ['2' 'first']
 ['2' 'cabin']
 ['2' 'full_sig_2_cell']
 ['2' 'index']
 ['2' 'indices']
 ['2' 'alone']
 ['2' 'interpretation_of_kstest']
 ['2' 'almost']
 ['2' 'semi_automated_gen']
 ['2' 'vertshift_segments_of_data_wrt_prevsegment']
 ['2' 'standarization_plotting']
 ['2' 'standarization_fill_in_matrix']
 ['2' 'sklearn']
 ['1' 'mlxtend']
 ['1' 'textwrap']
 ['1' 'certain']
 ['1' 'check_axes_assignmentplot']
 ['1' 'stimuli']
 ['1' 'constant']
 ['1' 

In [28]:
# File names are lower-cased before they are compared with the keywords
all_fn_lower = [name.lower() for name in all_fn]

# Set lookup instead of scanning the keyword list once per file name
referenced_words = set(keywords)

# reff maps each file name to 1 if it appears among the keywords, else 0
reff = {name: int(name in referenced_words) for name in all_fn_lower}

# List the non-referenced files.
# Iterate over the dictionary itself: indexing all_fn_lower with
# range(len(reff)) skips trailing names whenever two file names collapse
# to the same lower-case key.
non_reff_files = [name for name, hits in reff.items() if hits == 0]
print('Number of non-referenced files :', len(non_reff_files))
print(non_reff_files)

Number of non-referenced files : 31
['create_labels_and_initial_feature', 'datadriven_frt_vs_expmatfrt', 'detectfb_lr_trials', 'detectud_trials', 'detectud_trials_cabindetect', 'detect_jumps_in_index_vector_simple', 'equalized_signal_len', 'explode_without_colnames1', 'explode_without_colnames2', 'filter_sig3axes', 'freq_from_sig_freqresp', 'freq_from_sig_timecounting', 'generate_joy_move_sign', 'index_vector', 'main_preprocessing_steps', 'make_new_vec_w_index_vec', 'my_dropna_python', 'normal_distribution_feature_data', 'pad_data_2makeclasses_equivalent', 'plot_count2subplot', 'plot_count2subplot2', 'plot_tr2subplot', 'process_index_for_fblr_trials_timedetect', 'savessq', 'scale_feature_data', 'scikit_functions', 'scikit_functions_binaryclass', 'size', 'standarization_cabanaornexttr', 'standarization_notebadtrials', 'value_counts_vector']


### Solution 2:
So there are 31 files that are not referenced by other files in the common folder.  Thus, these files are either main functions or they have nothing to do with the project and can be removed.