In [71]:
from pathlib import Path
import pandas as pd
import csv

# Naming of variables follows terminology of a tree - root, level 1, level 2, ...
# A folder's children = files and sub-folders in the folder.

In [72]:
# Registry of project folders: each key is a short folder name and each
# value is the path of that folder.
proj_dir = dict()
proj_dir

{}

In [73]:
# Manual: write down root
# Every other entry added to proj_dir in this notebook is built from this path.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is where this folder exists.
proj_dir["root"] = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call")

In [74]:
# Snapshot the paths registered so far (only the root at this point) and
# confirm that each one is present on disk.
root_set = {*proj_dir.values()}
[registered_path.exists() for registered_path in root_set]

[True]

In [75]:
# Show the root folder path(s) as a list
list(root_set)

[WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call')]

In [76]:
# Manual: register the l1 folders, i.e. the folders directly under the root
proj_dir.update(
    {
        "l1_code": proj_dir["root"] / "code",
        "l1_documentation": proj_dir["root"] / "documentation",
        "l1_output": proj_dir["root"] / "output",
    }
)

In [77]:
# Check if l1 folder paths exist.
# l1_set = every registered path minus the root path(s). Set difference (-) is
# used rather than symmetric difference (^): with ^, a path that is still in
# root_set but no longer in proj_dir (e.g. after the root cell was edited and
# re-run) would be included in l1_set by mistake.
l1_set = set(proj_dir.values()) - root_set
[l1_folder.exists() for l1_folder in l1_set]

[True, True, True]

In [78]:
# Show the final path component (folder name) of every l1 folder
[folder.name for folder in l1_set]

['code', 'output', 'documentation']

In [79]:
# For each l1 folder, print its name followed by the names of the files and
# sub-folders it contains, then a blank separator line.
for l1_folder in l1_set:
    print(f"Parent: {l1_folder.name}")
    for child in l1_folder.iterdir():
        print(child.name)
    print()

Parent: code
.git
.idea
.ipynb_checkpoints
00_create_proj_dir
01_download_data
02_process_cc_pdf_to_csv
03_firm_identification
04_keyword_identification
05_extract_desc_in_front_page
10_common_tasks
11_download_specified_reports
misc

Parent: output
01_download_cc
02_process_cc
03_firm_identification
04_keyword_identification
05_extract_front_page_desc
BadList.csv
misc

Parent: documentation
conferencecall_stage1-3_documentation.pdf
conferencecall_stage1-3_documentation.tex
conferencecall_stage4_documentation.pdf
figures



In [80]:
# Manual: register the l2 folders (children of the l1 folders)

# Children of l1_code
l1_code = proj_dir["l1_code"]
proj_dir["l2_create_proj_dir"] = l1_code / "00_create_proj_dir"

# Children of l1_output
l1_output = proj_dir["l1_output"]
proj_dir.update(
    {
        "l2_download_cc": l1_output / "01_download_cc",
        "l2_process_cc": l1_output / "02_process_cc",
        "l2_firm_identification": l1_output / "03_firm_identification",
        "l2_keyword_identification": l1_output / "04_keyword_identification",
        "l2_extract_front_page_desc": l1_output / "05_extract_front_page_desc",
        "l2_misc": l1_output / "misc",
    }
)

In [81]:
# Check if l2 folder paths exist.
# l2_set = every registered path minus the l1 and root paths. Set difference
# (-) is used rather than symmetric difference (^): with ^, a path that is
# still in l1_set or root_set but no longer in proj_dir would be included in
# l2_set by mistake.
l2_set = set(proj_dir.values()) - l1_set - root_set
[l2_folder.exists() for l2_folder in l2_set]

[True, True, True, True, True, True, True]

In [82]:
# Show the final path component (folder name) of every l2 folder
[folder.name for folder in l2_set]

['03_firm_identification',
 '00_create_proj_dir',
 '04_keyword_identification',
 '01_download_cc',
 '02_process_cc',
 '05_extract_front_page_desc',
 'misc']

In [83]:
# For each l2 folder, print its name followed by the names of the files and
# sub-folders it contains, then a blank separator line.
for l2_folder in l2_set:
    print(f"Parent: {l2_folder.name}")
    for child in l2_folder.iterdir():
        print(child.name)
    print()

Parent: 03_firm_identification
03.1_compustat_sas7bdat
03.2_hassan_raw
03.3_compustat_processed
03.3_hassan_processed
03.4_cc_list_2
03.4_cc_list_sixun
misc
prev_20210924
readme.docx

Parent: 00_create_proj_dir
.ipynb_checkpoints
create_proj_dir.ipynb
proj_dir.csv

Parent: 04_keyword_identification
04.1_empty_set_of_50_group_folders
04.2_reference_files
04.3_filled_set_of_50_group_folders
04.4_totalcircnew
04.5_cric1_newtotal
04.6_paragraph_record
04.7_entry_files
04.8_entry_files_combined
misc
Test Set of 50 Group Folders
uploaded_on_shared_drive_with_kilian

Parent: 01_download_cc
01.1_pdf_2
01.1_xls_2
checks
currentvsprev_foldernames.txt

Parent: 02_process_cc
02.1_txt_2
02.2_csv_2
checks
misc

Parent: 05_extract_front_page_desc
05.1_converted_on_mercury
05.1_converted_on_windows_jason
05.2_converted_manually
05.3_xls_combined_withfrontpagedescriptions
05.4_entryfiles_combined
misc

Parent: misc
breakupreportidsintochunksof25
Checkreportidstallywithtitles
DataClean
extractremainingp

In [84]:
# Manual: register the l3 folders (children of the l2 folders)

# Children of l2_download_cc
l2_download_cc = proj_dir["l2_download_cc"]
proj_dir.update(
    {
        "l3_pdf_2": l2_download_cc / "01.1_pdf_2",
        "l3_xls_2": l2_download_cc / "01.1_xls_2",
        "l3_download_cc_checks": l2_download_cc / "checks",
    }
)

# Children of l2_process_cc
l2_process_cc = proj_dir["l2_process_cc"]
proj_dir.update(
    {
        "l3_txt_2": l2_process_cc / "02.1_txt_2",
        "l3_csv_2": l2_process_cc / "02.2_csv_2",
        "l3_process_cc_checks": l2_process_cc / "checks",
    }
)

# Children of l2_firm_identification
l2_firm_identification = proj_dir["l2_firm_identification"]
proj_dir.update(
    {
        "l3_compustat_sas7bdat": l2_firm_identification / "03.1_compustat_sas7bdat",
        "l3_hassan_raw": l2_firm_identification / "03.2_hassan_raw",
        "l3_compustat_processed": l2_firm_identification / "03.3_compustat_processed",
        "l3_hassan_processed": l2_firm_identification / "03.3_hassan_processed",
        "l3_cc_list_2": l2_firm_identification / "03.4_cc_list_2",
        "l3_cc_list_sixun": l2_firm_identification / "03.4_cc_list_sixun",
    }
)

# Children of l2_keyword_identification
l2_keyword_identification = proj_dir["l2_keyword_identification"]
proj_dir.update(
    {
        "l3_empty_set_of_folders": l2_keyword_identification / "04.1_empty_set_of_50_group_folders",
        "l3_reference_files": l2_keyword_identification / "04.2_reference_files",
        "l3_filled_set_of_folders": l2_keyword_identification / "04.3_filled_set_of_50_group_folders",
        "l3_totalcircnew": l2_keyword_identification / "04.4_totalcircnew",
        "l3_cric1_newtotal": l2_keyword_identification / "04.5_cric1_newtotal",
        "l3_paragraph_record": l2_keyword_identification / "04.6_paragraph_record",
        "l3_entry_files": l2_keyword_identification / "04.7_entry_files",
        "l3_entry_files_combined": l2_keyword_identification / "04.8_entry_files_combined",
    }
)

# Children of l2_extract_front_page_desc
l2_extract_front_page_desc = proj_dir["l2_extract_front_page_desc"]
proj_dir.update(
    {
        "l3_extract_mercury": l2_extract_front_page_desc / "05.1_converted_on_mercury",
        "l3_extract_windows": l2_extract_front_page_desc / "05.1_converted_on_windows_jason",
        "l3_extract_manual": l2_extract_front_page_desc / "05.2_converted_manually",
        "l3_extract_xlscombined": l2_extract_front_page_desc / "05.3_xls_combined_withfrontpagedescriptions",
        "l3_extract_entry_files_combined": l2_extract_front_page_desc / "05.4_entryfiles_combined",
    }
)

In [85]:
# Check if l3 folder paths exist.
# l3_set = every registered path minus the l2, l1 and root paths. Set
# difference (-) is used rather than symmetric difference (^): with ^, a path
# that is still in one of the earlier sets but no longer in proj_dir would be
# included in l3_set by mistake.
l3_set = set(proj_dir.values()) - l2_set - l1_set - root_set
[l3_folder.exists() for l3_folder in l3_set]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [86]:
# Show the final path component (folder name) of every l3 folder
[folder.name for folder in l3_set]

['03.3_compustat_processed',
 '02.2_csv_2',
 '03.4_cc_list_sixun',
 '04.8_entry_files_combined',
 '04.1_empty_set_of_50_group_folders',
 '03.3_hassan_processed',
 '03.4_cc_list_2',
 '05.1_converted_on_windows_jason',
 '01.1_pdf_2',
 '04.6_paragraph_record',
 '03.2_hassan_raw',
 '05.3_xls_combined_withfrontpagedescriptions',
 '05.1_converted_on_mercury',
 'checks',
 '05.2_converted_manually',
 '04.7_entry_files',
 '03.1_compustat_sas7bdat',
 '04.4_totalcircnew',
 '05.4_entryfiles_combined',
 '02.1_txt_2',
 'checks',
 '04.2_reference_files',
 '01.1_xls_2',
 '04.3_filled_set_of_50_group_folders',
 '04.5_cric1_newtotal']

In [87]:
# Display the complete name -> folder path mapping built in the cells above
proj_dir

{'root': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call'),
 'l1_code': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/code'),
 'l1_documentation': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/documentation'),
 'l1_output': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/output'),
 'l2_create_proj_dir': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/code/00_create_proj_dir'),
 'l2_download_cc': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/output/01_download_cc'),
 'l2_process_cc': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/output/02_process_cc'),
 'l2_firm_identification': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/output/03_firm_identification'),
 'l2_keyword_identification': WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call/output/04_keyword_identification'),
 'l2_extract_front_page_desc': WindowsPath('C:/Users/jasonjia/Dropbo

In [88]:
# Write proj_dir to a two-column csv file (name, folder path) inside the
# l2_create_proj_dir folder. Folder paths are saved as strings, not Path objects.
outputfile = "proj_dir.csv"
outputpath = proj_dir["l2_create_proj_dir"] / outputfile

# newline="" is required when handing a file to csv.writer; without it Windows
# turns each row terminator into \r\r\n, which reads back as a blank row after
# every record. The encoding is set explicitly so the file matches the UTF-8
# default that pandas.read_csv uses when the file is loaded again.
with open(outputpath, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for key, value in proj_dir.items():
        writer.writerow([key, str(value)])

In [102]:
# Import the .csv file and convert it back to a dictionary (name -> path string).
# The file has no header row; column 0 becomes the index (the dictionary keys).
# read_csv's squeeze=True keyword was deprecated in pandas 1.4 and removed in
# 2.0, so the single remaining column is turned into a Series with
# DataFrame.squeeze("columns") instead.
# NOTE(review): this literal path duplicates the location the csv was written
# to in the previous cell - confirm they are meant to stay in sync.
test = (
    pd.read_csv(
        r"C:\Users\jasonjia\Dropbox\Projects\conference_call\code\00_create_proj_dir\proj_dir.csv",
        header=None,
        index_col=0,
    )
    .squeeze("columns")
    .to_dict()
)

In [103]:
# Display the dictionary loaded back from the csv file (values are plain strings)
test

{'root': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call',
 'l1_code': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\code',
 'l1_documentation': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\documentation',
 'l1_output': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\output',
 'l2_create_proj_dir': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\code\\00_create_proj_dir',
 'l2_download_cc': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\output\\01_download_cc',
 'l2_process_cc': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\output\\02_process_cc',
 'l2_firm_identification': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\output\\03_firm_identification',
 'l2_keyword_identification': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\output\\04_keyword_identification',
 'l2_extract_front_page_desc': 'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call\\output\\05_extract_front_page_desc',
 

In [104]:
# Look up a single entry of the reloaded dictionary: the root folder path string
test["root"]

'C:\\Users\\jasonjia\\Dropbox\\Projects\\conference_call'

In [107]:
# Convert the reloaded path string back into a Path object and display it
test_root = Path(test["root"])
test_root

WindowsPath('C:/Users/jasonjia/Dropbox/Projects/conference_call')

In [None]:
# NOTE(review): this function has the same name as the `proj_dir` dictionary
# built earlier in this notebook, so running this cell rebinds that name.
def proj_dir(csv_path=r"C:\Users\jasonjia\Dropbox\Projects\conference_call\code\00_create_proj_dir\proj_dir.csv"):
    """Load the project directory csv file and return it as a dictionary.

    The csv file is expected to have no header row and two columns: the
    folder name (used as the dictionary key) and the folder path.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the csv file. Defaults to the proj_dir.csv path that was
        hard-coded in the original version of this function.

    Returns
    -------
    dict
        Maps each folder name to its folder path as a plain string (not a
        Path object).
    """
    # header=None: the file has no header row; index_col=0: names become keys.
    # squeeze("columns") replaces read_csv's squeeze=True keyword, which was
    # removed in pandas 2.0; it collapses the single data column to a Series.
    name_to_path = (
        pd.read_csv(csv_path, header=None, index_col=0)
        .squeeze("columns")
        .to_dict()
    )
    return name_to_path