In [1]:
import pandas as pd, numpy as np, matplotlib as plt
import re, os, time, datetime, io

pd.set_option('display.max_colwidth', 255)

In [2]:
# initialize pandas dataframe 'projects_df' for the data

projects_df = pd.DataFrame(columns=["name", "ext", "project", "code", "year", "month", "type"])

In [3]:
# set root_dir.  should be able to orient all cwds by adding folder names to this string
root_dir = 'C:\\Users\\jhoward\\Facility Engineering Associates\\\
Facility Engineering Associates Team Site - FEA Projects'

In [4]:
# generate an initial folder list at the / level (year names)
rootfolder_list = os.listdir(root_dir)

In [5]:
# removing hidden folder
rootfolder_list = rootfolder_list[1::]

# strip out the odd folder out
#rootfolder_list.remove('NPS FY19 PSAs')

In [6]:
rootfolder_list

['2017', '2018', '2019', '2020', '2021', '2022']

In [7]:
# and for now, stripping it down to 2017-2023
#rootfolder_list = rootfolder_list[11::]

In [8]:
# sort the list
rootfolder_list.sort()

In [9]:
# initialize dataframes for new folders to be added
# initial entries for the root level folders
new_folders = pd.DataFrame({'name': rootfolder_list, 'ext': "n/a", 'project': 'n/a', 'code': "n/a",\
                            'year': rootfolder_list, 'month': "n/a", 'type': 'folder', 'path': '/'})

# join initial folder list into the main dataframe
projects_df = pd.concat([projects_df, new_folders], axis=0, join='outer')

### Functions:
<ol type="1">
    <li><strong>build_filestructure</strong> - recursively applies the previous three functions at each level until it cannot go deeper, then merges dataframes</li>
    <ol style="list-style-type: lower-alpha;">
      <li>takes one parameter:</li>
        <ol style="list-style-type: lower-roman;">
            <li>- the root-level path you wish to use (root_dir, or root_dir\\year)</li>
            <br />  
        </ol>
    </ol>    
    <li><strong>get_subfolders</strong> - outputs a list of short dictionaries representing the subfolders inside a particular folder path</li>
    <ol style="list-style-type: lower-alpha;">
      <li>takes two parameters</li>
        <ol style="list-style-type: lower-roman;">
            <li><em>folderlist</em> (a list of folder names to check for subfolders)</li>
            <li><em>cwd</em> (the path to the folder containing those subfolders, as a string)</li>
        </ol>
    </ol>
    <br />
    <li><strong>add_folders</strong> - takes a list (generated by <em>get_subfolders</em>) and outputs a pandas df containing rows for folders, ready to be joined to the main projects_df</li>
        <ol style="list-style-type: lower-alpha;">
          <li>takes one parameter:</li>
            <ol style="list-style-type: lower-roman;">
                <li>a list (generated by <em>get_subfolders</em>) of dictionaries</li>
                <li>each dict should contain path, year, and subfolders keys.</li>
                <li>expects path & year values as strings, subfolder values as lists</li>
            </ol>
        </ol>
    <br />
    <li><strong>add_files</strong> - takes a list (generated by <em>get_subfolders</em>) and outputs a pandas df containing rows for files, ready to be joined to the main <em>projects_df</em></li>
    <ol style="list-style-type: lower-alpha;">
        <li>takes one parameter:</li>
            <ol style="list-style-type: lower-roman;">
                <li>a list (the same <em>new_folders</em> list generated by get_subfolders)</li>
            </ol>
    </ol>    
</ol>

In [10]:
# blank list to hold folder/path/type dicts
filestructure_list = []

def get_filestructure(listname, cwd):
    # blank lists to hold the dict's values under different keys
    filenames = []
    paths = []
    types = []
    
    # list of root level folders (years)
    folders = os.listdir(cwd)
    for folder in folders:
        # record the basedir to reset the cwd to after the loop
        base_dir = cwd
        cwd = cwd + '\\' + folder
        subfolder_dir = cwd

        if os.path.isdir(cwd) == True:
            #print(f"{cwd} is a folder")
            get_filestructure(listname, cwd)
            cwd = subfolder_dir
            filenames.append(folder)
            path = '\\' + cwd.strip('C:\\Users\\jhoward\\Facility Engineering Associates\\\
            Facility Engineering Associates Team Site - FEA Projects')
            paths.append(path)
            types.append("folder")
        else:
            #print(f"{cwd} is a folder")
            cwd = subfolder_dir
            filenames.append(folder)
            path = '\\' + cwd.strip('C:\\Users\\jhoward\\Facility Engineering Associates\\\
            Facility Engineering Associates Team Site - FEA Projects')
            path = path.strip(folder)
            paths.append(path)
            types.append("file")
            
        cwd = base_dir
        for i in range(0, len(filenames)):
            temp_dict = {}
            temp_dict['name'] = filenames[i]
            temp_dict['path'] = paths[i]
            temp_dict['type'] = types[i]
            listname.append(temp_dict)

    return listname

In [11]:
cwd = root_dir

In [12]:
filestructure_list = []
for folder in rootfolder_list:
    cwd = root_dir + '\\' + folder
    filestructure = get_filestructure(filestructure_list, cwd)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
projects_df = pd.DataFrame(filestructure)

In [14]:
projects_df.sort_values(by='path', inplace=True)

In [34]:
projects_df.head()

Unnamed: 0,name,path,type
200,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
142,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
63387,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
66632,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
67780,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder


In [15]:
duplicate = projects_df[projects_df.duplicated()]
 
print("Duplicate Rows :")
duplicate

Duplicate Rows :


Unnamed: 0,name,path,type
142,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
103409,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
103820,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
103999,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
200,01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),\2017\01.2017.008896 (Bethesda Row Jaleo Restaurant Structural Consulting),folder
...,...,...,...
3018306,Final Proposal,\2023\08.2023.000538 (King County Rural Library District FCA and Long Range Plan)\Final Prop,folder
3018305,Proposal Prep Materials,\2023\08.2023.000538 (King County Rural Library District FCA and Long Range Plan)\Proposal Prep M,folder
3018331,Final Proposal,\2023\11.2023.000024 (Wyoming SCD Professional Qualifications)\Final Prop,folder
3018330,Final Proposal,\2023\11.2023.000024 (Wyoming SCD Professional Qualifications)\Final Prop,folder


In [16]:
projects_df.shape

(3018358, 3)

In [None]:
## CURRENT ISSUES

# folders being added more than once?
#
# looks like (maybe) folders are being added once for every file in them.
# Not entirely clear.
#
# Try counting dupes, seeing if the number varies / matches anything, 
# and see if there are any duped files or if it is just folders.
#
# Also check that the folders really are dupes, using the unmodified
# (will need to remove some of the transformations from function possibly)
# fullpath.  There are certainly going to be duplicate folder names at different locations.  
#
# Re-check function, esp. indent levels.  Make sure it's not adding 
# folders in a part of the loop where it will get repeated.




In [24]:
# testname = projects_df.iloc[14]['name']
# testname = "testfile.na.hello"
testname = "testfolder"

if testname.count('.') == 1:
    print(testname.split('.'))
elif testname.count('.') > 1:
    print(f'Error: "{testname}" contains too many periods.')
else:
    print(f'{testname} is not a file.')


testfolder is not a file.


In [27]:
for i in range(0, 6):
               #len(projects_df)):
    filename = projects_df.iloc[i]['name']
    if filename.count('.') > 1:
        print("error, too many periods in filename")
    elif filename.count('.') == 1:
        print("we got one!")
        #[ filename, ext ] = filename.split('.')
    else:
        print("nope")
#         ext = "n/a"
#         projects_df.iloc[i]['name'] = filename
#         projects_df.iloc[i]['ext'] = ext
    

we got one!
we got one!
we got one!
we got one!
we got one!
we got one!


In [16]:
projects_df.sample(6)

Unnamed: 0,name,path,type
341571,IMG_1432.JPG,\2018\01.2018.009280 (Montgomery County Public Schools Facility Condition Assessment)\2018 Original\Phase 3 - Facility Condition Survey\02 Andrew\569.Strawberry Knoll ES\Pics\,file
2414639,IMG_4080.JPG,\2021\01.2021.009736 (University of Maryland Eastern Shore (UMES) Facility Condition Assessment)\Photos\Student Res Cluster #3\SRC 3 - 3000\,file
369018,IMG_9175.JPG,\2018\01.2018.009280 (Montgomery County Public Schools Facility Condition Assessment)\2018 Original\Phase 3 - Facility Condition Survey\03 CK\302.Burtonsville ES\Photos\,file
1411697,A5.03.pdf,\2020\01.2020.009579 (Pike & Rose Block 11 Curtainwall Consulting)\Provided Documents\PR11_Architectural\,file
2529981,IMG_1076.JPG,\2021\01.2021.009766 (ABM-Bedrock RE FCA & Energy Review)\Photos\1702 W. Fort St\,file
1266270,IMG_9734.JPG,\2019\01.2019.009553 (Salisbury University Facility Condition Assessment)\JPC Building Photos\EEF Building Photos\SV (Severn Hall)\,file


In [None]:
# add tail end of name (from end of name back to .) as 'ext' col if row typw is "file" 
# and remove ".[ext]" from name

projects_df.loc[projects_df["type"] == "file", "ext"] = "testval"

# does not work
# for i in range(0, len(projects_df)):
#     year = str(projects_df.iloc[i]['path'])[1:5]
#     projects_df.iloc[i]['year'] = year

In [None]:
templist = projects_df['name'][0:4].tolist()

templist[0].split('.')

In [15]:
projects_df.iloc[0]['name']

'IMG_9471.JPG'

### Older drafts of functions, for reference

In [None]:
# used to generate a list of folders for the add_ functions to iterate through
# the folderlist param should be a list of foldernames
# cwd should be the root folder containing those folders
# ie: [FEA Projects] as cwd / [list containing '2016'-'2023'] will provide a 
# list of all the sub-folders under each year

def get_subfolders(folderlist, cwd):
    
    # init blank list to hold extracted column values
    new_folders = []
    
    # loop through the folder list making adding a line to new_folders for each entry
    for x in (range(0, len(folderlist))):
        subfolder_dict = {}
        active_folder = cwd + '\\' + folderlist[x]
        
        # the path and year keys are a single string value
        subfolder_dict['path'] = cwd + '\\' + folderlist[x]
        # the full path up to but not including the subfolder
        curr_year = subfolder_dict['path'].strip(root_dir)[0:4]
        # the year folder within that path
        subfolder_dict['year'] = curr_year
        
        # init the subfolders key value as a blank list
        subfolder_dict['subfolders'] = []
        # make a list of active_folder dir contents
        subfolder_list = os.listdir(active_folder)
        # the subfolders_list is a list of file and folder names at that path
        
        # iterate through the list and add folders only to the list in the dict
        for i in range(0, len(subfolder_list)):
            if os.path.isdir(active_folder + '\\' + subfolder_list[i]) == True:
                subfolder_dict['subfolders'].append(subfolder_list[i])
            else:
                pass
        
        # now the subfolders key-value is a list of only the folders at that path
        
        new_folders.append(subfolder_dict)
        # add the row with those three values to the df, then repeat loop for next folder
        
    # returns the final list
    return new_folders

In [None]:
def add_folders(newfolder_list):
    # lists to hold extracted column values
    foldernames, exts, projects, codes, years, months, types, paths = ([] for i in range(8))

    for i in range(0, len(newfolder_list)):
        cwd = newfolder_list[i]['path']
        curr_year = newfolder_list[i]['year']

        for x in range(0, len(newfolder_list[i]['subfolders'])):
            active_folder = newfolder_list[i]['subfolders'][x]

            if active_folder[0].isnumeric() and active_folder[2] == ".":
                months.append(active_folder[0:2])
    #             years.append(active_folder[3:7])
                codes.append(active_folder[8:14])
                foldername = re.sub("-|\(|\)", "", active_folder[15::])
                projects.append(active_folder[0:14])
            else:
                months.append("unspecified")
    #             years.append(curr_year)
                codes.append("unknown")
                projects.append(active_folder)
                foldername = active_folder

            while foldername[0] == [" ", "-"]:
                foldername = foldername[1::]
            else:
                pass
            
            foldernames.append(foldername)
            years.append(folder_list[i]['year'])
            exts.append("n/a")
            paths.append(cwd + "\\" + active_folder)
            types.append("folder")

    new_folders = pd.DataFrame(
        {'name': foldernames,
         'ext': exts,
         'project': projects,
         'code': codes,
         'year': years,     
         'month': months,
         'type': types,
         'path': paths
        })

    return new_folders

In [None]:
def get_filestructure(cwd):
    # dict to hold folders and paths
    filestructure_dict = {}
    # list of root level folders (years)
    start_folders = os.listdir(cwd)
    # drop the weird hidden folder
    for folder in start_folders:
        cwd = cwd + '//' + folder
        
        projects = os.listdir(cwd)
        for subfolder in projects:
            subfolder_cwd = cwd + '\\' + subfolder
            if os.path.isdir(subfolder_cwd) == True:
                sub_subfolders = os.listdir(subfolder_cwd)
                for sub_subfolder in sub_subfolders:
                    sub_subfolder_cwd = subfolder_cwd + '\\' + sub_subfolder
                    if os.path.isdir(subfolder_cwd) == True:
                        print(sub_subfolder_cwd)
                
                sub_subfolder_cwd = sub_subfolder_cwd.strip('\\' + sub_subfolder)
                
            subfolder_cwd = subfolder_cwd.strip('\\' + subfolder)
                

                    
            #for sub_subfolder in sub_subfolders:
#                     print(sub_subfolder)

#         folder_cwd = base_dir + '\\' + folder
        # loop to subfolders
        cwd = cwd.strip('//' + folder)

        
    
#     for folder in base_folders:
#         subfolders = os.listdir(cwd)
#         folder_cwd = base_dir + '\\' + folder
#         for subfolder in subfolders:
#             subfolder_cwd = cwd + '\\' + subfolder
#             print(subfolder)
#         while os.path.isdir(cwd) == True:
#             for subfolder in subfolders:
#                 print(subfolder)
#                 sub_subfolders = os.listdir(cwd)
#                 cwd = cwd + '\\' + subfolder
#                 while os.path.isdir(cwd) == True:
#                     print(cwd)
#                 else:
#                     print("sub-sub breakpoint")
#                     pass
#             else:
#                 print("sub breakpoint")
#                 pass
                
                
            
            # go deeper and repeat
#             for i in range(0, len(subfolders_list)):
#                 print(subfolders_list[i])
#             else:
#                 print("pass")
                

In [None]:
# add those folders and paths to the filestructure dict
        foldername = folder        
        folderpath = cwd + '//' + folder
        # if folder contains subfolders
        
    
    
    while folder 
        subfolder_list = # the subfolders
        
        if os.path.isdir(cwd + '\\' + subfolders[i]) == True:
            subfolder_dict['subfolders'].append(subfolder_list[i])
        else:
            pass
    return subfolders
    #for i in range(0, len(folder_list)):


In [None]:
### draft of get_filestructure

for i in range(0, len(folder_list)):
    # get the base path (folder that contains subfolders)
    basepath = folder_list[i]['path']
    year = folder_list[i]['year']
#    subfolders = len(folder_list[i]['subfolders'])
    new_folders = []    
    
    for x in range(0, len(folder_list[i]['subfolders'])):
        # blank list to hold subfolder names
        subfolder_list = []
        
        # get the full (path to subfolder root) path
        # these will be the same for each subfolder in a given set of subfolders.
        # essentially copying path and 
        subfolder = folder_list[i]['subfolders'][x]
        fullpath = f'{basepath}\\{subfolder}'
        # append each sub-subfolder name to list
        new_folders.append(subfolder)
        print(fullpath)
           
    print(new_folders)

#         # name of subfolder
#         #active_folder = folder_list[i]
#         # ['subfolders'][x]
#         # full path of subfolder
        
#     print(f'subfolder: {subfolder}, basepath: {basepath}, fullpath: {fullpath}')
        
#         # add the list of dicts (per get_subfolders) to subfolderlist
#         subfolderlist.append(get_subfolders(test_list[i]['subfolders'], basepath))
        # subfolderlist is now a list of lists that can be run through add_folders
    
    
        # iterate through each subfolder and add_folders
        #for f in range(0, len(subfolderlist)):
         #   add_folders(subfolderlist)
            
        

        
        #new_folders = add_folders(folder_list)
        # join new folder list into the main dataframe
        #projects_df = pd.concat([proj_filenames, new_folders], axis=0, join='outer')
        
#         new_folders.append(os.listdir(cwd))

In [None]:
cwd = root_dir
# newfolder_list = get_folders(rootfolder_list, cwd)
folder_list = get_subfolders(rootfolder_list, cwd)
new_folders = add_folders(folder_list)
# join new folder list into the main dataframe
projects_df = pd.concat([projects_df, new_folders], axis=0, join='outer')

In [None]:
projects_df.tail()

In [None]:
projects_df.sample(5)

In [None]:
test_list = folder_list[0]['subfolders']

In [None]:
test_list

In [None]:
folder_list[0]['year']

In [None]:
cwd = cwd + '\\' + folder_list[0]['year']

In [None]:
os.listdir(root_dir + '/2022')

In [None]:
cwd = root_dir
# newfolder_list = get_folders(rootfolder_list, cwd)
folder_list = get_subfolders(rootfolder_list, cwd)
new_folders = add_folders(folder_list)
# join new folder list into the main dataframe
projects_df = pd.concat([projects_df, new_folders], axis=0, join='outer')

In [None]:
new_folders = add_folders(newfolder_list, cwd)

In [None]:
projects_df = pd.concat([projects_df, new_folders], axis=0, join='outer')

In [None]:
projects_df['path'].sample(15)

In [None]:
for i in range(len(rootfolder_list)):
    subdir = rootfolder_list[i]
    cwd = (f'{root_dir}/{subdir}')
    new_folders = get_folders(rootfolder_list, cwd)
    
    
    print(new_folders)

In [None]:
subdir = rootfolder_list[0]
cwd = (f'{root_dir}/{subdir}')

In [None]:
get_folders(rootfolder_list,cwd)

In [None]:
# makes a list of subfolders, currently 1 level deep under root folders (years)
newfolder_list = get_folders(rootfolder_list, cwd)

In [None]:
rootfolder_list[0]

In [None]:
new_folders

In [None]:
projects_df