# Example outputs from the `preprocessing.py` module

In [1]:
import sys
# Make the scripts directory importable. The path is relative, so this
# assumes the kernel's working directory is the notebook's own folder.
sys.path.append('../scripts')

# NOTE(review): the star import hides where names come from. The functions
# called in the cells below (define_company_dictionary, select_pages, run_nlp,
# pattern_definition, define_matcher, create_match_dataframe) are not defined
# in this notebook, so they are presumably provided by this module.
from preprocessing import *

We first define the path to an example report: Amcor's 2020 annual report.

In [2]:
# Location of the example PDF, built from a path template and the company name.
report_template = "../data/raw/annual_reports/2020/{0}_Annual_Report_2020.pdf"
example_path = report_template.format('Amcor')

## Example output for `define_company_dictionary`

In [3]:
# Build the dictionary describing the example report. The displayed result has
# the keys 'name', 'filename', 'type', 'year' and 'clean_text'; 'clean_text'
# maps integer keys (presumably page numbers) to lists of text lines.
company_dict = define_company_dictionary(example_path)
company_dict

{'name': 'Amcor',
 'filename': 'Amcor_Annual_Report_2020.pdf',
 'type': 'Annual Report',
 'year': '2020',
 'clean_text': {1: ['What we make matters ',
   '2020 Annual Report ',
   'Amcor Annual Report 2020 '],
  2: ['Our cover stories ',
   '2 ',
   'Annual Report 2020 ',
   'Cover stories ',
   'Annual Report 2020 ',
   'Differentiated primary packaging that brings value to consumers. ',
   'Increased exposure to high-value segments such as Healthcare and Protein. ',
   'Innovative, more sustainable products that our customers want, and consumers expect. ',
   'Resilient business that serves consumer staple end markets. ',
   'Amcor people s care and focus contributed to the supply of essential products. ',
   'We enable our customers to provide solutions for global trends including e-commerce. '],
  3: ['Contents ',
   'Message from the Chairman of the Board and the CEO 4 ',
   'Amcor at a glance 6 ',
   'Our strategy 8 ',
   'Sustainability and innovation 10 ',
   'Amcor fiscal 2020

### Example of page selection using `select_pages` for number of employees

In [4]:
# Take the per-page text from company_dict and pass it to select_pages for the
# 'n_employees' metric. The result has the same shape (page key -> list of
# text lines); how pages are chosen is defined in preprocessing.py, not here.
page_dict = company_dict['clean_text']
select_dict = select_pages(page_dict, 'n_employees')
select_dict

{4: ['Dear Shareholder, ',
  'Fiscal year 2020 (FY20) was a milestone year for ',
  'Amcor and a year with many firsts: the first year listed ',
  'on the New York Stock Exchange, the first after the ',
  'transformational acquisition of Bemis and, of course, the ',
  'first time operating through a pandemic that significantly ',
  'impacted economies around the world. Faced with ',
  'unprecedented circumstances, we delivered strong ',
  'performance and advanced our strategic agenda. ',
  'Safe, healthy and resilient ',
  'At Amcor, safety is our most important value. We take ',
  'care of ourselves and each other so that everyone can ',
  'return home safely every day. Over the years we have built ',
  'a track record of consistently improving safety indicators ',
  'in the businesses we acquire. As we integrated the largest ',
  'acquisition in our history, we experienced 10% fewer ',
  'injuries than the prior year and 52% of our sites worldwide ',
  'operated injury-free for the 

## Example output for `run_nlp`

In [5]:
# Run the NLP step over the selected pages. The output is again keyed by page;
# its entries display without quotes, so they are presumably spaCy objects
# rather than plain strings -- confirm against run_nlp in preprocessing.py.
nlp_dict = run_nlp(select_dict)
nlp_dict

{4: [Dear Shareholder, ,
  Fiscal year 2020 (FY20) was a milestone year for ,
  Amcor and a year with many firsts: the first year listed ,
  on the New York Stock Exchange, the first after the ,
  transformational acquisition of Bemis and, of course, the ,
  first time operating through a pandemic that significantly ,
  impacted economies around the world. Faced with ,
  unprecedented circumstances, we delivered strong ,
  performance and advanced our strategic agenda. ,
  Safe, healthy and resilient ,
  At Amcor, safety is our most important value. We take ,
  care of ourselves and each other so that everyone can ,
  return home safely every day. Over the years we have built ,
  a track record of consistently improving safety indicators ,
  in the businesses we acquire. As we integrated the largest ,
  acquisition in our history, we experienced 10% fewer ,
  injuries than the prior year and 52% of our sites worldwide ,
  operated injury-free for the whole year. ,
  Such an improvement

## Example output for `pattern_definition` for `n_employees` metric

In [6]:
# Fetch the match patterns for the 'n_employees' metric. The displayed result
# is a list of patterns, each a list of per-token attribute dicts using keys
# such as 'LEMMA', 'POS', 'ENT_TYPE' and 'OP'.
patterns = pattern_definition('n_employees')
patterns

[[{'LEMMA': {'IN': ['total', 'average']}},
  {'OP': '?'},
  {'OP': '?'},
  {'LEMMA': 'number', 'OP': '?'},
  {'LEMMA': 'of', 'OP': '+'},
  {'LEMMA': {'IN': ['employee',
     'people',
     'person',
     'colleague',
     'team',
     'staff',
     'full-time',
     'fte']}}],
 [{'LEMMA': {'IN': ['employ', 'hire']}},
  {'OP': '?'},
  {'OP': '?'},
  {'OP': '?'},
  {'OP': '?'},
  {'ENT_TYPE': 'CARDINAL', 'OP': '+'},
  {'LEMMA': {'IN': ['employee',
     'people',
     'person',
     'colleague',
     'team',
     'staff',
     'full-time',
     'fte']}},
  {'POS': 'NOUN', 'OP': '?'}],
 [{'LEMMA': {'IN': ['the', 'our', 'with', 'have']}},
  {'POS': 'ADP', 'OP': '?'},
  {'POS': 'ADP', 'OP': '?'},
  {'ENT_TYPE': 'CARDINAL', 'OP': '+'},
  {'LEMMA': {'IN': ['employee',
     'people',
     'person',
     'colleague',
     'team',
     'staff',
     'full-time',
     'fte',
     'headcount',
     'workforce']}}],
 [{'LEMMA': {'IN': ['total', 'average']}},
  {'LEMMA': {'IN': ['employee',
     'peo

## Example output of `define_matcher`

In [7]:
# Build a matcher from the patterns defined in the previous cell; the displayed
# repr shows that the returned object is a spacy.matcher.matcher.Matcher.
matcher = define_matcher(patterns)
matcher

<spacy.matcher.matcher.Matcher at 0x7f2faf443b40>

## Example output for `create_match_dataframe` for `n_employees`

In [8]:
# Build the table of 'n_employees' matches directly from the report path.
# Each displayed row carries the report metadata (filename, name, year), the
# text, the matched string, and the page, start and end values of the match.
df = create_match_dataframe(example_path, metric='n_employees')
df

Unnamed: 0,filename,name,year,text,match_string,page,start,end
0,Amcor_Annual_Report_2020.pdf,Amcor,2020,The Flexibles Segment develops and supplies f...,"With approximately 40,000 employees",23,11,15
1,Amcor_Annual_Report_2020.pdf,Amcor,2020,The Rigid Packaging Segment manufacturers rig...,"employed approximately 6,000 employees",23,27,31
2,Amcor_Annual_Report_2020.pdf,Amcor,2020,"As of June 30, 2020, we employed approximatel...","employed approximately 47,000 people",24,9,13
