# Example usage of cleaning.py

In [1]:
import sys
sys.path.append('../scripts')

import os
from tqdm import tqdm
import cleaning

In [2]:
directory = r'../data/raw/annual_reports/2020'

# Smoke test: parse and clean every PDF in the directory to check that the
# cleaning pipeline runs end to end without raising. The results are not kept;
# only successful completion matters here.
# sorted() makes the processing order deterministic (os.listdir returns
# entries in arbitrary order), so a failure is reproducible across runs.
for filename in tqdm(sorted(os.listdir(directory))):
    if filename.endswith('.pdf'):
        # Build the path portably instead of hard-coding the '/' separator.
        path = os.path.join(directory, filename)
        parsed_file = cleaning.parse_file(path)
        clean_file = cleaning.toy_clean(parsed_file, table_contents=True)

print('All cleaning ok!')

100%|███████████████████████████████████████████| 80/80 [02:20<00:00,  1.76s/it]

All cleaning ok!





## Example with Microsoft

In [3]:
# Relative path to the Microsoft 2020 annual report PDF used in this example.
Microsoft_path = "../data/raw/annual_reports/2020/Microsoft_Annual_Report_2020.pdf"

### Example of parsed output

In [4]:
# Parse the report PDF. Indexing the result at 34 displays a list of raw text
# strings that still contain the '\n' line breaks from extraction.
# NOTE(review): each index appears to be one page (this one includes the page
# number '33') — confirm against cleaning.parse_file.
Microsoft = cleaning.parse_file(Microsoft_path)
Microsoft[34]

[' \n',
 '33 \n',
 ' \n',
 '• \nGaming revenue increased $1.0 billion or 10%, driven by Xbox software and services growth of 19%, primarily \ndue to third-party title strength and subscriptions growth, offset in part by a decline in Xbox hardware of 13% \nprimarily due to a decrease in volume of consoles sold.  \n',
 '• \nSearch advertising revenue increased $616 million or 9%. Search advertising revenue, excluding traffic \nacquisition costs, increased 13%, driven by higher revenue per search.  \n',
 'Operating income increased $2.2 billion or 21%, including an unfavorable foreign currency impact of 2%.  \n',
 '• \nGross margin increased $2.0 billion or 9%, driven by growth in Windows, Gaming, and Search. Gross margin \npercentage increased slightly, due to sales mix shift to higher gross margin businesses in Windows and  \nGaming.  \n',
 '• \nOperating expenses decreased $172 million or 1%.  \n',
 'OPERATING EXPENSES  \n',
 'Research and Development  \n  \n',
 '(In millions, except p

### Example of clean output without tables

In [5]:
# Clean the parsed report with toy_clean's default arguments. In the displayed
# element the '\n' line breaks have been replaced by spaces.
Microsoft_clean = cleaning.toy_clean(Microsoft) 
Microsoft_clean[34]

[' ',
 '33 ',
 ' ',
 '• Gaming revenue increased $1.0 billion or 10%, driven by Xbox software and services growth of 19%, primarily due to third-party title strength and subscriptions growth, offset in part by a decline in Xbox hardware of 13% primarily due to a decrease in volume of consoles sold. ',
 '• Search advertising revenue increased $616 million or 9%. Search advertising revenue, excluding traffic acquisition costs, increased 13%, driven by higher revenue per search. ',
 'Operating income increased $2.2 billion or 21%, including an unfavorable foreign currency impact of 2%. ',
 '• Gross margin increased $2.0 billion or 9%, driven by growth in Windows, Gaming, and Search. Gross margin percentage increased slightly, due to sales mix shift to higher gross margin businesses in Windows and Gaming. ',
 '• Operating expenses decreased $172 million or 1%. ',
 'OPERATING EXPENSES ',
 'Research and Development ',
 'Percentage ',
 'Change 2020 Versus 2019 ',
 'Percentage ',
 'Change 2019

### Example of clean output with tables

In [6]:
# Clean again, this time with table_contents=True: the displayed element now
# contains a '<TABLE 16: >' marker entry right after 'Research and Development '.
Microsoft_clean_with_tables = cleaning.toy_clean(Microsoft, table_contents=True)
Microsoft_clean_with_tables[34]

[' ',
 '33 ',
 ' ',
 '• Gaming revenue increased $1.0 billion or 10%, driven by Xbox software and services growth of 19%, primarily due to third-party title strength and subscriptions growth, offset in part by a decline in Xbox hardware of 13% primarily due to a decrease in volume of consoles sold. ',
 '• Search advertising revenue increased $616 million or 9%. Search advertising revenue, excluding traffic acquisition costs, increased 13%, driven by higher revenue per search. ',
 'Operating income increased $2.2 billion or 21%, including an unfavorable foreign currency impact of 2%. ',
 '• Gross margin increased $2.0 billion or 9%, driven by growth in Windows, Gaming, and Search. Gross margin percentage increased slightly, due to sales mix shift to higher gross margin businesses in Windows and Gaming. ',
 '• Operating expenses decreased $172 million or 1%. ',
 'OPERATING EXPENSES ',
 'Research and Development ',
 '<TABLE 16: >',
 'Percentage ',
 'Change 2020 Versus 2019 ',
 'Percentage

## Example with Just Eat

In [7]:
# Relative path to the Just Eat 2020 annual report PDF used in this example.
Just_Eat_path = "../data/raw/annual_reports/2020/Just_Eat_Annual_Report_2020.pdf"

### Example of parsed output

In [8]:
# Parse the Just Eat report PDF with cleaning.parse_file.
Just_Eat = cleaning.parse_file(Just_Eat_path)

In [9]:
# Display element 107 of the parsed report: a list of raw text strings that
# still contain '\n' line breaks.
Just_Eat[107]

['Share ownership\n',
 'Share ownership members of the Management Board\n',
 'As at 31 December 2020, the Managing Directors held shares in the \n',
 'Company as set out below. \n',
 'Numbers of shares held\n',
 'J. Groen \nCEO1\nB. Wissink \nCFO\n',
 'J. Gerbig \nCOO1\n',
 'Numbers of shares held as at 31 December 2020 15,318,766\n115,581\n310,000\n',
 '1 \nShares are hold indirectly through personal holding \n',
 'Share ownership members of the Supervisory Board\n',
 'As at 31 December 2020, none of the Supervisory Board members held \n',
 'shares in the Company.\n',
 'Severance arrangements\n',
 'Contractual severance arrangements of the Managing Directors provide for \n',
 'compensation for the loss of income resulting from a non-voluntary \n',
 'termination of employment. In that situation, the severance package is \n',
 'equal to the sum of the six-month gross fixed base fee of the respective \n',
 'Managing Director. The contractual severance arrangements are compliant \n',
 'wi

### Example of clean output without tables (note that table detection is imperfect, so some table content may remain)

In [10]:
# Clean the parsed report with toy_clean's default arguments.
Just_Eat_clean = cleaning.toy_clean(Just_Eat)

In [11]:
# Display element 107 of the cleaned report; in the output the '\n' line
# breaks have been replaced by spaces.
Just_Eat_clean[107]

['Share ownership ',
 'Share ownership members of the Management Board ',
 'As at 31 December 2020, the Managing Directors held shares in the ',
 'Company as set out below. ',
 'Numbers of shares held ',
 'J. Groen CEO1 B. Wissink CFO ',
 'J. Gerbig COO1 ',
 'Numbers of shares held as at 31 December 2020 15,318,766 115,581 310,000 ',
 '1 Shares are hold indirectly through personal holding ',
 'Share ownership members of the Supervisory Board ',
 'As at 31 December 2020, none of the Supervisory Board members held ',
 'shares in the Company. ',
 'Severance arrangements ',
 'Contractual severance arrangements of the Managing Directors provide for ',
 'compensation for the loss of income resulting from a non-voluntary ',
 'termination of employment. In that situation, the severance package is ',
 'equal to the sum of the six-month gross fixed base fee of the respective ',
 'Managing Director. The contractual severance arrangements are compliant ',
 'with the Dutch Corporate Governance Code

### Example of clean output with tables

In [12]:
# Clean with table_contents=True. The visible part of the displayed element
# contains no '<TABLE ...>' marker and matches the output obtained without
# table_contents.
Just_Eat_clean_with_tables= cleaning.toy_clean(Just_Eat, table_contents=True)
Just_Eat_clean_with_tables[107]

['Share ownership ',
 'Share ownership members of the Management Board ',
 'As at 31 December 2020, the Managing Directors held shares in the ',
 'Company as set out below. ',
 'Numbers of shares held ',
 'J. Groen CEO1 B. Wissink CFO ',
 'J. Gerbig COO1 ',
 'Numbers of shares held as at 31 December 2020 15,318,766 115,581 310,000 ',
 '1 Shares are hold indirectly through personal holding ',
 'Share ownership members of the Supervisory Board ',
 'As at 31 December 2020, none of the Supervisory Board members held ',
 'shares in the Company. ',
 'Severance arrangements ',
 'Contractual severance arrangements of the Managing Directors provide for ',
 'compensation for the loss of income resulting from a non-voluntary ',
 'termination of employment. In that situation, the severance package is ',
 'equal to the sum of the six-month gross fixed base fee of the respective ',
 'Managing Director. The contractual severance arrangements are compliant ',
 'with the Dutch Corporate Governance Code

## Example with Aggreko

In [13]:
# Relative path to the Aggreko 2020 annual report PDF used in this example.
Aggreko_path = "../data/raw/annual_reports/2020/Aggreko_Annual_Report_2020.pdf"

### Example of parsed output

In [14]:
# Parse the Aggreko report PDF and display element 76: a list of raw text
# strings that still contain '\n' line breaks, including the base-salary
# table text ('Executive Director\nPosition\n...').
Aggreko = cleaning.parse_file(Aggreko_path)
Aggreko[76]

['Implementation of remuneration policy in 2021 \n',
 'Base salaries \nChris Weston and Heath Drewett’s base salaries were reviewed by the Committee in December 2020; their salaries will remain \nunchanged for 2021. The Committee intends to next review the salaries of both these Executive Directors in December 2021 to take \neffect from January 2022. \n',
 'The base salaries for Executive Directors as at 1 January 2021 and 1 January 2020 were as follows:\n',
 'Executive Director\nPosition\n1 January 2021 \n£\nIncrease*  \n%\n1 January 2020 \n£\nChris Weston \nChief Executive Officer \n765,000\n0\n765,000\nHeath Drewett\nChief Financial Officer\n469,200\n0\n469,200\n',
 '* The average increase across the Group for 2020 was 1.6%. The majority of employees received no increase. \n',
 'Pensions and benefits\nPension allowances will reduce to a rate (when expressed as a percentage of salary) no higher than the rate available to the majority \nof colleagues either in the UK or in the jurisdi

### Example of clean output without tables

In [15]:
# Clean the parsed report with toy_clean's default arguments. In the displayed
# element the entry holding the base-salary table text is no longer present.
Aggreko_clean = cleaning.toy_clean(Aggreko)
Aggreko_clean[76]

['Implementation of remuneration policy in 2021 ',
 'Base salaries Chris Weston and Heath Drewett’s base salaries were reviewed by the Committee in December 2020; their salaries will remain unchanged for 2021. The Committee intends to next review the salaries of both these Executive Directors in December 2021 to take effect from January 2022. ',
 'The base salaries for Executive Directors as at 1 January 2021 and 1 January 2020 were as follows: ',
 '* The average increase across the Group for 2020 was 1.6%. The majority of employees received no increase. ',
 'Pensions and benefits Pension allowances will reduce to a rate (when expressed as a percentage of salary) no higher than the rate available to the majority of colleagues either in the UK or in the jurisdiction in which the Executive resides – currently 9% for the majority of the UK workforce – with effect from 1 January 2023. This transition for the incumbent Chief Executive Officer began in 2020 with Chris Weston’s pension reduci

### Example of clean output with tables

In [16]:
# Clean with table_contents=True: the displayed element has a
# '<TABLE 37: ...>' marker entry in place of the base-salary table. The marker
# shown keeps some of the table's text labels; the figures do not appear in it.
Aggreko_clean_with_tables = cleaning.toy_clean(Aggreko, table_contents=True)
Aggreko_clean_with_tables[76]

['Implementation of remuneration policy in 2021 ',
 'Base salaries Chris Weston and Heath Drewett’s base salaries were reviewed by the Committee in December 2020; their salaries will remain unchanged for 2021. The Committee intends to next review the salaries of both these Executive Directors in December 2021 to take effect from January 2022. ',
 'The base salaries for Executive Directors as at 1 January 2021 and 1 January 2020 were as follows: ',
 '<TABLE 37: Executive Director Position Increase* Chris Weston Chief Executive Officer Heath Drewett Chief Financial Officer>',
 '* The average increase across the Group for 2020 was 1.6%. The majority of employees received no increase. ',
 'Pensions and benefits Pension allowances will reduce to a rate (when expressed as a percentage of salary) no higher than the rate available to the majority of colleagues either in the UK or in the jurisdiction in which the Executive resides – currently 9% for the majority of the UK workforce – with effec