# Part 1: Combine fragments of the same root file

In [175]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress

In [176]:
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

In [177]:
folder = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_filtered_by_columns")
outputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined")

In [179]:
# List the files in a directory (no recursive walk)
files = os.listdir(folder)
files.sort()
files

['Europe - Board Summary_filtered_cols.xlsx',
 'Europe - Company Details_filtered_cols.xlsx',
 'Europe - Director Profile - Characteristics_cols.xlsx',
 'Europe - Director Profile - Education & Achievements_sheet0_cols.xlsx',
 'Europe - Director Profile - Education & Achievements_sheet1_cols.xlsx',
 'Europe - SMDEs Org Summary - 1_filtered_cols.xlsx',
 'Europe - SMDEs Org Summary - 2_filtered_cols.xlsx',
 'Europe - SMDEs Profile - Characteristics - 1_cols.xlsx',
 'Europe - SMDEs Profile - Characteristics - 2_cols.xlsx',
 'Europe - SMDEs Profile - Education & Achievements_sheet0_cols.xlsx',
 'Europe - SMDEs Profile - Education & Achievements_sheet1_cols.xlsx',
 'NA - Board Summary - 1_filtered_cols.xlsx',
 'NA - Board Summary - 2_filtered_cols.xlsx',
 'NA - Board Summary - 3_filtered_cols.xlsx',
 'NA - Company Details_filtered_cols.xlsx',
 'NA - Director Profile - Characteristics - 1_cols.xlsx',
 'NA - Director Profile - Characteristics - 2_cols.xlsx',
 'NA - Director Profile - Educatio

In [180]:
# Initialize a list of unique root names
root_names = []

for file in files:
    filepath = Path(folder / file)
    print("File Name:", filepath.stem)
    
    # Replace _ with - for sheet, since we want sheet0, sheet1 etc. to be in the root name
    filename = file.replace("_sheet", " - sheet")
    # Get the string before the first underscore - thus keeping sheet0, sheet1, etc.
    string_before_first_underscore = re.search(r"(^.+ - [^_]+)_", filename).group(1)
    # print(string_before_first_underscore)
    
    # Get root name to compare - i.e. remove the fragment numbers, e.g. [root name] - 1, [root name] - 2, etc.
    string_root = string_before_first_underscore.strip()
    string_root = re.sub(" - \d$", "", string_root)
    print("Root Name:", string_root)
    print("------")
    
    # Check if current root name is in list of unique root names
    if not(string_root in root_names):
        # This is a new root string. If root_names is empty, do nothing. If root_names is not empty, save the previous df.
        print("This is the 1st fragment of a new root excel file.")
        if len(root_names) == 0:
            print("This is the first root df.")
        else:
            # Save df
            print("Saving previous root df at:", prev_root_df_output_path)
            df.to_csv(prev_root_df_output_path, index=False)
            print("Saved!")
            print("------")
            
        # Save output file path of new root df
        prev_root_df_output_path = Path(outputfolder / Path(str(string_root) + '.csv'))
        
        # Import the 1st instance of the new df
        print("Importing 1st fragment of new root df")
        df = pd.read_excel(filepath)
        
        # Append the root name to the list of unique root names
        root_names.append(string_root)
    else:
        # Root name is already in the list of unique root names. Append to the current df.
        print("This is the 2nd/3rd/... fragment of the root excel file.")
        print("Appending to current root df")
        df_append = pd.read_excel(filepath)
        df = pd.concat([df, df_append])
        del df_append
    print("-----------------------------")
    
# Save the last df
print("Saving previous root df")
df.to_csv(prev_root_df_output_path, index=False)
print("-----------------------------")
print("Done!")

File Name: Europe - Board Summary_filtered_cols
Root Name: Europe - Board Summary
------
This is the 1st fragment of a new root excel file.
This is the first root df.
Importing 1st fragment of new root df
-----------------------------
File Name: Europe - Company Details_filtered_cols
Root Name: Europe - Company Details
------
This is the 1st fragment of a new root excel file.
Saving previous root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined\Europe - Board Summary.csv
Saved!
------
Importing 1st fragment of new root df
-----------------------------
File Name: Europe - Director Profile - Characteristics_cols
Root Name: Europe - Director Profile - Characteristics
------
This is the 1st fragment of a new root excel file.
Saving previous root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined\Europe - Company Details.csv
Saved!
------
Importin

-----------------------------
File Name: NA - SMDEs Profile - Characteristics - 1_cols
Root Name: NA - SMDEs Profile - Characteristics
------
This is the 1st fragment of a new root excel file.
Saving previous root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined\NA - SMDEs Org Summary.csv
Saved!
------
Importing 1st fragment of new root df
-----------------------------
File Name: NA - SMDEs Profile - Characteristics - 2_cols
Root Name: NA - SMDEs Profile - Characteristics
------
This is the 2nd/3rd/... fragment of the root excel file.
Appending to current root df
-----------------------------
File Name: NA - SMDEs Profile - Characteristics - 3_cols
Root Name: NA - SMDEs Profile - Characteristics
------
This is the 2nd/3rd/... fragment of the root excel file.
Appending to current root df
-----------------------------
File Name: NA - SMDEs Profile - Characteristics - 4_cols
Root Name: NA - SMDEs Profile - Character

Saved!
------
Importing 1st fragment of new root df
-----------------------------
File Name: UK - Director Profile - Education & Achievements_sheet1_cols
Root Name: UK - Director Profile - Education & Achievements - sheet1
------
This is the 1st fragment of a new root excel file.
Saving previous root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined\UK - Director Profile - Education & Achievements - sheet0.csv
Saved!
------
Importing 1st fragment of new root df
-----------------------------
File Name: UK - SMDEs Org Summary_filtered_cols
Root Name: UK - SMDEs Org Summary
------
This is the 1st fragment of a new root excel file.
Saving previous root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined\UK - Director Profile - Education & Achievements - sheet1.csv
Saved!
------
Importing 1st fragment of new root df
-----------------------------
Fil

# Part 2: Combine the same root file from different regions

In [181]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress

In [182]:
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

In [183]:
regions = ['Europe', 'NA', 'ROW', 'UK']
inputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined")
outputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_and_country")

In [184]:
# List the files in a directory (no recursive walk)
files = os.listdir(inputfolder)
files.sort()
files

['Europe - Board Summary.csv',
 'Europe - Company Details.csv',
 'Europe - Director Profile - Characteristics.csv',
 'Europe - Director Profile - Education & Achievements - sheet0.csv',
 'Europe - Director Profile - Education & Achievements - sheet1.csv',
 'Europe - SMDEs Org Summary.csv',
 'Europe - SMDEs Profile - Characteristics - 1 - sheet0.csv',
 'Europe - SMDEs Profile - Characteristics.csv',
 'Europe - SMDEs Profile - Education & Achievements - sheet0.csv',
 'Europe - SMDEs Profile - Education & Achievements - sheet1.csv',
 'NA - Board Summary.csv',
 'NA - Company Details.csv',
 'NA - Director Profile - Characteristics.csv',
 'NA - Director Profile - Education & Achievements - sheet0.csv',
 'NA - Director Profile - Education & Achievements - sheet1.csv',
 'NA - SMDEs Org Summary.csv',
 'NA - SMDEs Profile - Characteristics - 2 - sheet0.csv',
 'NA - SMDEs Profile - Characteristics.csv',
 'NA - SMDEs Profile - Education & Achievements - sheet0.csv',
 'NA - SMDEs Profile - Educatio

In [185]:
# Filter out the files that start with Europe.
europefiles_mask = ['Europe' in string for string in files]
europefiles = list(compress(files, europefiles_mask))
europefiles

['Europe - Board Summary.csv',
 'Europe - Company Details.csv',
 'Europe - Director Profile - Characteristics.csv',
 'Europe - Director Profile - Education & Achievements - sheet0.csv',
 'Europe - Director Profile - Education & Achievements - sheet1.csv',
 'Europe - SMDEs Org Summary.csv',
 'Europe - SMDEs Profile - Characteristics - 1 - sheet0.csv',
 'Europe - SMDEs Profile - Characteristics.csv',
 'Europe - SMDEs Profile - Education & Achievements - sheet0.csv',
 'Europe - SMDEs Profile - Education & Achievements - sheet1.csv']

In [186]:
# Then get the root name, and add the region prefix.
# Import and append the dfs from the other 3 regions.
# Save.

# Iterate through each of the Europe files.
for file in europefiles:
    filepath = Path(inputfolder / file)
    
    # Get root name (i.e. without the region)
    root = file.replace("Europe - ", "")
    print("Root Name:", root)
    print("------")
    
    # Iterate through each region
    for region in regions:
        
        # Add the region prefix to get the input filename and filepath
        inputfilename = region + " - " + root
        inputfilepath = Path(inputfolder / inputfilename)
        print("File Name:", inputfilename)
        
        # Check that file exists
        if os.path.exists(inputfilepath):
            print("File exists")
        else:
            print("Warning! File does not exist")
            continue
        
        # Import file
        print("Importing df for region:", region)
        df_current = pd.read_csv(inputfilepath, low_memory=False)
        df_current['Region'] = region
        
        # Set df = df_current if it's the first region, 'Europe', else append to existing df
        if region == 'Europe':
            print("1st region, new root df")
            df = df_current
        else:
            print("2nd/3rd/4th region, append to current root df")
            df = pd.concat([df, df_current])
        print("------")
        
    # Once we have iterated through all regions of a root file, save the root file
    outputfilepath = Path(outputfolder / root)
    print("Saving current root df at:", outputfilepath)
    df.to_csv(outputfilepath, index=False)
    print("Saved!")
    print("-----------------------------")

print("Done!")

Root Name: Board Summary.csv
------
File Name: Europe - Board Summary.csv
File exists
Importing df for region: Europe
1st region, new root df
------
File Name: NA - Board Summary.csv
File exists
Importing df for region: NA
2nd/3rd/4th region, append to current root df
------
File Name: ROW - Board Summary.csv
File exists
Importing df for region: ROW
2nd/3rd/4th region, append to current root df
------
File Name: UK - Board Summary.csv
File exists
Importing df for region: UK
2nd/3rd/4th region, append to current root df
------
Saving current root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_and_country\Board Summary.csv
Saved!
-----------------------------
Root Name: Company Details.csv
------
File Name: Europe - Company Details.csv
File exists
Importing df for region: Europe
1st region, new root df
------
File Name: NA - Company Details.csv
File exists
Importing df for region: NA
2nd/3rd/4th region, 

  exec(code_obj, self.user_global_ns, self.user_ns)


1st region, new root df
------
File Name: NA - SMDEs Profile - Characteristics - 1 - sheet0.csv
File Name: ROW - SMDEs Profile - Characteristics - 1 - sheet0.csv
File Name: UK - SMDEs Profile - Characteristics - 1 - sheet0.csv
Saving current root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_and_country\SMDEs Profile - Characteristics - 1 - sheet0.csv
Saved!
-----------------------------
Root Name: SMDEs Profile - Characteristics.csv
------
File Name: Europe - SMDEs Profile - Characteristics.csv
File exists
Importing df for region: Europe
1st region, new root df
------
File Name: NA - SMDEs Profile - Characteristics.csv
File exists
Importing df for region: NA
2nd/3rd/4th region, append to current root df
------
File Name: ROW - SMDEs Profile - Characteristics.csv
File exists
Importing df for region: ROW
2nd/3rd/4th region, append to current root df
------
File Name: UK - SMDEs Profile - Characteristic

## Part 2.5: Manual Check - Adjust the columns for SMDE dfs that are not identical to that of Director dfs

In [197]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress

In [198]:
inputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_and_country")

In [199]:
# List the files in a directory (no recursive walk)
files = os.listdir(inputfolder)
files.sort()
files

['Board Summary.csv',
 'Company Details.csv',
 'Director Profile - Characteristics.csv',
 'Director Profile - Education & Achievements - sheet0.csv',
 'Director Profile - Education & Achievements - sheet1.csv',
 'SMDEs Org Summary.csv',
 'SMDEs Profile - Characteristics.csv',
 'SMDEs Profile - Education & Achievements - sheet0.csv',
 'SMDEs Profile - Education & Achievements - sheet1.csv']

### Check 1

In [200]:
# Check Board Summary vs SMDEs Org Summary
df1 = pd.read_csv(Path(inputfolder / files[0]), low_memory=False)
df2 = pd.read_csv(Path(inputfolder / files[5]), low_memory=False)
print("df1 columns:", df1.columns)
print("df2 columns:", df2.columns)
print("df1 columns == df2 columns:", all(df1.columns == df2.columns))

df1 columns: Index(['Annual Report Year', 'Country', 'CompanyID*', 'Company Name', 'ISIN',
       'Sector', 'Director Type', 'DirectorID*', 'Individual Name',
       'Individual Role', 'Time in Role', 'Time on Board', 'Time in Company',
       'Total Number of Quoted Boards to Date', 'Age (Yrs)',
       'Number of Qualifications', 'Gender', 'Nationality Mix',
       'Bonus/ (Bonus&Salary)', 'Equity Linked/ Total', 'Performance/ Total',
       'Total Directors on the Board', 'Salary', 'Total Annual Compensation',
       'Region'],
      dtype='object')
df2 columns: Index(['Annual Report Year', 'Country', 'CompanyID*', 'Company Name', 'ISIN',
       'Sector', 'Director Type', 'DirectorID*', 'Individual Name',
       'Individual Role', 'Time in Role', 'Time on Board', 'Time in Company',
       'Total Number of Quoted Boards to Date', 'Age (Yrs)',
       'Number of Qualifications', 'Gender', 'Nationality Mix',
       'Bonus/ (Bonus&Salary)', 'Equity Linked/ Total', 'Performance/ Total',
  

There is an error here. The Directors df has an extra variable, 'Total Directors on the Board'. We insert that into the SMDE df as NaNs and reorder the columns in the same way as the Directors df.

In [201]:
# Import Board Summary and SMDEs Org Summary
df1 = pd.read_csv(Path(inputfolder / files[0]), low_memory=False)
df2 = pd.read_csv(Path(inputfolder / files[5]), low_memory=False)
print("df1 columns:", df1.columns)
print("df2 columns:", df2.columns)

df1 columns: Index(['Annual Report Year', 'Country', 'CompanyID*', 'Company Name', 'ISIN',
       'Sector', 'Director Type', 'DirectorID*', 'Individual Name',
       'Individual Role', 'Time in Role', 'Time on Board', 'Time in Company',
       'Total Number of Quoted Boards to Date', 'Age (Yrs)',
       'Number of Qualifications', 'Gender', 'Nationality Mix',
       'Bonus/ (Bonus&Salary)', 'Equity Linked/ Total', 'Performance/ Total',
       'Total Directors on the Board', 'Salary', 'Total Annual Compensation',
       'Region'],
      dtype='object')
df2 columns: Index(['Annual Report Year', 'Country', 'CompanyID*', 'Company Name', 'ISIN',
       'Sector', 'Director Type', 'DirectorID*', 'Individual Name',
       'Individual Role', 'Time in Role', 'Time on Board', 'Time in Company',
       'Total Number of Quoted Boards to Date', 'Age (Yrs)',
       'Number of Qualifications', 'Gender', 'Nationality Mix',
       'Bonus/ (Bonus&Salary)', 'Equity Linked/ Total', 'Performance/ Total',
  

In [202]:
df2['Total Directors on the Board'] = np.NaN
df2 = df2.reindex(columns=df1.columns)
print("df1 columns == df2 columns:", all(df1.columns == df2.columns))

df1 columns == df2 columns: True


Now the columns in the two dfs are identical. We save the modified SMDE df under the same file name.

In [203]:
# Save the modified SMDE df
df2.to_csv(Path(inputfolder / files[5]), index=False)

### Check 2

In [204]:
# Check Director Profile - Characteristics vs SMDEs Profile - Characteristics
df1 = pd.read_csv(Path(inputfolder / files[2]), low_memory=False)
df2 = pd.read_csv(Path(inputfolder / files[6]), low_memory=False)
print("df1 columns:", df1.columns)
print("df2 columns:", df2.columns)
print("df1 columns == df2 columns:", all(df1.columns == df2.columns))

df1 columns: Index(['DirectorID*', 'Age', 'DOB', 'Nationality', 'Region'], dtype='object')
df2 columns: Index(['DirectorID*', 'Age', 'DOB', 'Nationality', 'Region'], dtype='object')
df1 columns == df2 columns: True


### Check 3

In [205]:
# Check Director Profile - Education & Achievements - sheet0 vs SMDEs Profile - Education & Achievements - sheet0
df1 = pd.read_csv(Path(inputfolder / files[3]), low_memory=False)
df2 = pd.read_csv(Path(inputfolder / files[7]), low_memory=False)
print("df1 columns:", df1.columns)
print("df2 columns:", df2.columns)
print("df1 columns == df2 columns:", all(df1.columns == df2.columns))

df1 columns: Index(['Country', 'DirectorID*', 'Director Name', 'InstitutionID*',
       'Institution Name', 'Company Type', 'Qualification',
       'Qualification Description', 'Qualification Date', 'Region'],
      dtype='object')
df2 columns: Index(['Country', 'DirectorID*', 'Director Name', 'InstitutionID*',
       'Institution Name', 'Company Type', 'Qualification',
       'Qualification Description', 'Qualification Date', 'Region'],
      dtype='object')
df1 columns == df2 columns: True


### Check 4

In [206]:
# Check Director Profile - Education & Achievements - sheet1 vs SMDEs Profile - Education & Achievements - sheet1
df1 = pd.read_csv(Path(inputfolder / files[4]), low_memory=False)
df2 = pd.read_csv(Path(inputfolder / files[8]), low_memory=False)
print("df1 columns:", df1.columns)
print("df2 columns:", df2.columns)
print("df1 columns == df2 columns:", all(df1.columns == df2.columns))

df1 columns: Index(['Country', 'DirectorID*', 'Director Name', 'OrganisationID*',
       'OrganisationName', 'Achievement Date', 'Award/Achievement', 'Region'],
      dtype='object')
df2 columns: Index(['Country', 'DirectorID*', 'Director Name', 'OrganisationID*',
       'OrganisationName', 'Achievement Date', 'Award/Achievement', 'Region'],
      dtype='object')
df1 columns == df2 columns: True


The other 3 pairs are all ok.

# Part 3: Combine the same root file for Directors and SMDEs

In [207]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress

In [208]:
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

In [209]:
director_smde_map_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\boardex_director_smde_map\director_smde_map.xlsx")
inputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_and_country")
outputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_country_and_managementtype")

In [210]:
director_smde_map = pd.read_excel(director_smde_map_filepath)
director_smde_map

Unnamed: 0,Director,SMDE
0,Board Summary,SMDEs Org Summary
1,Board Summary,
2,Committee Details,SMDEs Committee Details
3,Company Announcements,SMDEs Company Announcements
4,Company Details,
5,Company Network,
6,Compensation,SMDEs Compensation
7,Director Details Mapping File,SMDEs Mapping File
8,Director Network,SMDEs Network
9,Director Profile - Characteristics,SMDEs Profile - Characteristics


In [211]:
# List the files in a directory (no recursive walk)
files = os.listdir(inputfolder)
files.sort()
files

['Board Summary.csv',
 'Company Details.csv',
 'Director Profile - Characteristics.csv',
 'Director Profile - Education & Achievements - sheet0.csv',
 'Director Profile - Education & Achievements - sheet1.csv',
 'SMDEs Org Summary.csv',
 'SMDEs Profile - Characteristics.csv',
 'SMDEs Profile - Education & Achievements - sheet0.csv',
 'SMDEs Profile - Education & Achievements - sheet1.csv']

In [212]:
# Remove the sheet name and .csv so that we can compare with the director names
files_root = [re.sub(" - sheet.+$", "", file) for file in files]
files_root = [re.sub(".csv$", "", file) for file in files_root]
files_root

['Board Summary',
 'Company Details',
 'Director Profile - Characteristics',
 'Director Profile - Education & Achievements',
 'Director Profile - Education & Achievements',
 'SMDEs Org Summary',
 'SMDEs Profile - Characteristics',
 'SMDEs Profile - Education & Achievements',
 'SMDEs Profile - Education & Achievements']

In [213]:
# Filter out the files that belong to the 'Director' column in the df director_smde_map, i.e. get only the director files.
directorfiles_mask = [any([string == file for string in director_smde_map['Director']]) for file in files_root]
directorfiles = list(compress(files, directorfiles_mask))
directorfiles

['Board Summary.csv',
 'Company Details.csv',
 'Director Profile - Characteristics.csv',
 'Director Profile - Education & Achievements - sheet0.csv',
 'Director Profile - Education & Achievements - sheet1.csv']

In [214]:
# Get the sheet number, if one exists
directorfiles_sheetnumber = [re.search(r"( - sheet\d)", file) for file in directorfiles]
directorfiles_sheetnumber = [matchobject.group(1) if matchobject != None else '' for matchobject in directorfiles_sheetnumber]
directorfiles_sheetnumber

['', '', '', ' - sheet0', ' - sheet1']

In [215]:
directorfiles_root = list(compress(files_root, directorfiles_mask))
directorfiles_root

['Board Summary',
 'Company Details',
 'Director Profile - Characteristics',
 'Director Profile - Education & Achievements',
 'Director Profile - Education & Achievements']

In [216]:
files_map = pd.DataFrame()
files_map['Director Csv Name in Input Folder'] = directorfiles
files_map['Director Root'] = directorfiles_root
files_map['Director Sheet Number'] = directorfiles_sheetnumber
files_map

Unnamed: 0,Director Csv Name in Input Folder,Director Root,Director Sheet Number
0,Board Summary.csv,Board Summary,
1,Company Details.csv,Company Details,
2,Director Profile - Characteristics.csv,Director Profile - Characteristics,
3,Director Profile - Education & Achievements - ...,Director Profile - Education & Achievements,- sheet0
4,Director Profile - Education & Achievements - ...,Director Profile - Education & Achievements,- sheet1


In [217]:
files_map = files_map.merge(director_smde_map, how='left', left_on='Director Root', right_on='Director')
files_map = files_map.drop(['Director'], axis=1)
files_map = files_map.rename(columns = {'SMDE': "SMDE Root"})
files_map = files_map.drop_duplicates(subset=['Director Csv Name in Input Folder'], ignore_index=True)
files_map

Unnamed: 0,Director Csv Name in Input Folder,Director Root,Director Sheet Number,SMDE Root
0,Board Summary.csv,Board Summary,,SMDEs Org Summary
1,Company Details.csv,Company Details,,
2,Director Profile - Characteristics.csv,Director Profile - Characteristics,,SMDEs Profile - Characteristics
3,Director Profile - Education & Achievements - ...,Director Profile - Education & Achievements,- sheet0,SMDEs Profile - Education & Achievements
4,Director Profile - Education & Achievements - ...,Director Profile - Education & Achievements,- sheet1,SMDEs Profile - Education & Achievements


In [219]:
# Iterate through each of the director files.
for index, row in files_map.iterrows():
    director_file = row['Director Csv Name in Input Folder']
    director_root = row['Director Root']
    sheet_number = row['Director Sheet Number']
    smde_root = row['SMDE Root']
    director_filepath = Path(inputfolder / director_file)
    director_stem = filepath.stem
    if sheet_number != '':
        outputfilename = director_root + sheet_number + '_director_and_smde.csv'
    else:
        outputfilename = director_root + '_director_and_smde.csv'
    
    outputfilepath = Path(outputfolder / outputfilename)
    
    print("Director File Name:", director_file)
    print("Importing Director root df")
    df = pd.read_csv(director_filepath, low_memory=False)
    df['Mgmt_Type'] = 'Director'
    
    # Find the SMDE equivalent, if one exists
    if smde_root != smde_root:
        print("SMDE Root is NaN, SMDE version of this file does not exist.")
        print("SMDE File Name: N/A")
        print("------")
    else:
        print("SMDE Root is not NaN, SMDE version of this file exists.")
        if sheet_number != '':
            smde_file = smde_root + sheet_number + '.csv'
        else:
            smde_file = smde_root + '.csv'
        smde_filepath = Path(inputfolder / smde_file)
        print("SMDE File Name:", smde_file)
        print("------")
        
        print("Importing SMDE root df")
        df_smde = pd.read_csv(smde_filepath, low_memory=False)
        df_smde['Mgmt_Type'] = 'SMDE'
        
        print("Appending SMDE root df to Director root df")
        df = pd.concat([df, df_smde])
        
    # Save the root df
    print("Saving Director and SMDE root df at:", outputfilepath)
    df.to_csv(outputfilepath, index=False)
    print("Saved!")
    print("-----------------------------")

print("Done!")

Director File Name: Board Summary.csv
Importing Director root df
SMDE Root is not NaN, SMDE version of this file exists.
SMDE File Name: SMDEs Org Summary.csv
------
Importing SMDE root df
Appending SMDE root df to Director root df
Saving Director and SMDE root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_country_and_managementtype\Board Summary_director_and_smde.csv
Saved!
-----------------------------
Director File Name: Company Details.csv
Importing Director root df
SMDE Root is NaN, SMDE version of this file does not exist.
SMDE File Name: N/A
------
Saving Director and SMDE root df at: C:\Users\jasonjia\Dropbox\Projects\boardex_and_execucomp\output\filter_boardex_by_cols\output_from_mercury_combined_fragment_country_and_managementtype\Company Details_director_and_smde.csv
Saved!
-----------------------------
Director File Name: Director Profile - Characteristics.csv
Importing Director root df
SM