This notebook explores an alternative approach to producing the workbook map data (the relationships between spreadsheets).
It assumes the `Workbook_map.output_formulae` subroutine has been run in the target workbook (e.g. Workbook_map_EXAMPLE.xlsm) to produce a file with all the workbook's formulas (e.g. Workbook_map_EXAMPLE_formulas.tsv).

In [1]:
import pandas as pd
from pathlib import Path
import re

In [2]:
from itertools import chain
import collections

In [3]:
# TSV of formulas exported from the target workbook (the input to this notebook)
sfp = Path('Workbook_map_EXAMPLE_formulas.tsv')

In [4]:
# Load the tab-separated formula export. keep_default_na=False stops pandas
# from converting strings such as "NA" or "NULL" into NaN, so a sheet name or
# formula that happens to read like a missing-value marker stays a string.
data = pd.read_csv(sfp, sep='\t', keep_default_na=False)

In [5]:
# Amend formulas to make it easier to find the sheet names: remove only the
# leading "'=" prefix. str.strip("'=") treats its argument as a character set
# and strips both ends, so a formula that starts with a quoted sheet name
# (e.g. '='My Sheet'!A1) would also lose the sheet name's opening quote.
data['cell_formula_fixed'] = data['cell_formula'].str.replace(
    r"^'?=", '', regex=True
)

In [6]:
# Find sheet names before an exclamation mark, either enclosed in single
# quotes, or not containing certain characters.
# Inside the quoted form only non-quote characters or a doubled quote ('') are
# allowed, so a match cannot start at a stray apostrophe (e.g. inside a string
# literal such as "it's") and run on to the closing quote of a later sheet name.
sheetname_pat = r"""(?:'(?:[^']|'')+'|\b[^:,/\*\-+ =&<>\[\]"\(\)!]+?)(?=!)"""

In [7]:
# Compile the pattern once; it is applied to every formula further down
sheetname_re = re.compile(sheetname_pat)

In [8]:
# Preview the first rows of the loaded formulas
data.head()

Unnamed: 0,sheet_name,cell_address,cell_formula,cell_formula_fixed
0,CALC_A_1,C3,'=INPUT1!C3,INPUT1!C3
1,CALC_A_1,C4,'=INPUT1!C4,INPUT1!C4
2,CALC_A_1,C5,'=INPUT1!C5,INPUT1!C5
3,CALC_A_1,C6,'=INPUT1!C6,INPUT1!C6
4,CALC_A_1,C7,'=INPUT1!C7,INPUT1!C7


In [9]:
# For every formula, collect the list of sheet names it refers to
precedent_sheets = data['cell_formula_fixed'].apply(sheetname_re.findall)

In [10]:
# Preview the sheet names found in the first few formulas
precedent_sheets.head()

0    [INPUT1]
1    [INPUT1]
2    [INPUT1]
3    [INPUT1]
4    [INPUT1]
Name: cell_formula_fixed, dtype: object

In [11]:
# Count the matched sheet names for each sheet.
# The (sheet, precedent) pairs are tallied explicitly instead of returning a
# Counter from groupby().apply(): pandas expands dict-like results through an
# intermediate NaN-padded frame, which is why the counts came out as floats.
# NOTE(review): that implicit expansion is not a documented contract and may
# differ between pandas versions — confirm before relying on it elsewhere.
pair_counts = collections.Counter(
    (sheet, precedent)
    for sheet, found in zip(data['sheet_name'], precedent_sheets)
    if pd.notna(sheet)  # groupby ignored rows whose sheet name is missing
    for precedent in found
)
# Sort so the result is ordered by sheet, then by precedent sheet
pairs = sorted(pair_counts)
precedent_sheets_count = pd.Series(
    [pair_counts[pair] for pair in pairs],
    # Built from two arrays so an empty result still has a two-level index
    index=pd.MultiIndex.from_arrays(
        [[sheet for sheet, _ in pairs], [precedent for _, precedent in pairs]]
    ),
    dtype='float64',  # keep the float counts the previous implementation produced
    name=precedent_sheets.name,
)

In [12]:
# Label both index levels, then turn the counts into a one-column frame
precedent_sheets_count = (
    precedent_sheets_count
    .rename_axis(['sheet_name', 'sheet_name_precedent'])
    .to_frame(name='count')
)

In [13]:
# Preview the per-sheet precedent counts
precedent_sheets_count.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
sheet_name,sheet_name_precedent,Unnamed: 2_level_1
CALC_A_1,INPUT1,10.0
CALC_A_2,INPUT1,10.0
CALC_A_2,INPUT2,10.0
CALC_A_3,CALC_A_1,10.0
CALC_A_3,CALC_A_2,10.0


In [14]:
# Write the counts next to the source file, as "<source stem>_count.csv"
ofp = sfp.with_name(sfp.stem + '_count.csv')
precedent_sheets_count.to_csv(ofp)