<a href="https://colab.research.google.com/github/jonas-nothnagel/CBI-disaster-analysis/blob/main/Global_Compact_Collection_Analysis_0_0_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collecting UN Global Compact Communication of Progress (COP) Annual Reports through web-scrapping

For this analysis we download from [here](https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active) all COP reports submitted up to 2021. We only take into account the reports submitted in English.

Please specify below the focus year of this analysis. It will be consider as the end point of historical analyses, as well as the year for annual analyses.

In [None]:
focus_year = "2021"

Please select the focus language using one of the following values:
- en (English, default option)
- de (German)
- es (Spanish)
- fr (French)
- pt (Portuguese)

In [None]:
focus_language = 'en'

In [None]:
language_ref = { 'en' : { 'name' : 'English', 'min_coocurrence' : 10},
                 'de' : { 'name' : 'German', 'min_coocurrence' : 2},
                 'es' : { 'name' : 'Spanish', 'min_coocurrence' : 2},
                 'fr' : { 'name' : 'French', 'min_coocurrence' : 2},
                 'pt' : { 'name' : 'Portuguese', 'min_coocurrence' : 2},
               }

## 1. Gathering information about COP reports available from the UN Global Compact website
The [UN Global Compact website](https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active) contains entries for each COP report, describing the sector of the company submitting the report, country and year, as well as the language in which the repoort was written in and a link to a PDF file with the full report.

**The results in this section give a general view of the available COPs, it's not yet restricted by the focus_year and focus_language.**

In [None]:
import requests
import re
from bs4 import BeautifulSoup

gc_url = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page=1&per_page=10"
gc_base_url = "https://www.unglobalcompact.org"

gc_home = requests.get(gc_url)

soup = BeautifulSoup(gc_home.content, 'lxml')

header = soup.h2.string

total_num_cops = re.search(r'(?<=: )[0-9]+', header)[0]
print("Total number of COPs available: %s" % total_num_cops)

Total number of COPs available: 52346


In [None]:
full_gc_url_part1 = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page="
full_gc_url_part2 = "&per_page=10"

def get_link(page):
    
    r = requests.get(full_gc_url_part1+str(page)+full_gc_url_part2)
    soup = BeautifulSoup(r.content, 'lxml')
    #links = [td.find_all('a')[0]['href'] for td in soup.find_all('td', 'participant')]

    return soup

#print("Getting full list of reports ...")
gc_full_list_soup=BeautifulSoup()
for i in range(1,2): #change this back to 208 after demo
  gc_full_list_soup.append(get_link(i))

#gc_full_list_soup = BeautifulSoup(set_of_links.content, 'lxml')

In [None]:
#check the existing th class from the soup
#removing of th is required for gc_full_list_soup.find_all("tr") part of next cell
element = gc_full_list_soup.find_all('th')
print(element)

#remove th class
for th in gc_full_list_soup('th'):
  th.decompose()

#remove tags that have no content
for x in gc_full_list_soup.find_all():
  if len(x.get_text(strip=True)) ==0:
    x.extract()

#check the results
gc_full_list_soup

[<th class="sort participant">Participant</th>, <th class="sort sector">Sector</th>, <th class="sort country">Country</th>, <th class="sort year">Year</th>]


<!DOCTYPE html>
<html>
<head>



<title>Active COPs | UN Global Compact</title>











</head>
<body>
<header id="main-header">
<div class="wrapper">


<nav id="main-navigation-container">
<form accept-charset="UTF-8" action="/search" id="main-search" method="get">
<div id="main-search-filters">
<fieldset>
<label for="search_type_all">All</label>

<label for="search_type_participants">Participants</label>

</fieldset>
</div>
 <button class="search-button" type="submit"><span>Search</span></button>
</form>
<ul id="primary-navigation">
<li><a class="nav-link " href="/what-is-gc">Who We Are</a></li>
<li><a class="nav-link " href="/sdgs">The SDGs</a></li>
<li><a class="nav-link active" href="/participation">Participation</a></li>
<li><a class="nav-link " href="/take-action">Take Action</a></li>
<li><a class="nav-link " href="/engage-locally">Engage Locally</a></li>
<li><a class="nav-link " href="/library">Explore Our Library</a></li>
</ul>
<ul id="secondary-navigation">
<li><a class="n

In [None]:
def check_sdgs_3_13(profile): #checks for corresponding SDGs #check function names
    has_sdg1 = "no"
    has_sdg2 = "no"
    has_sdg3 = "no"
    has_sdg4 = "no"
    has_sdg5 = "no"
    has_sdg6 = "no"
    has_sdg7 = "no"
    has_sdg8 = "no"
    has_sdg9 = "no"
    has_sdg10 = "no"
    has_sdg11 = "no"
    has_sdg12 = "no"
    has_sdg13 = "no"
    has_sdg14 = "no"
    has_sdg15 = "no"
    has_sdg16 = "no"
    has_sdg17 = "no"

    questions = profile.find_all("ul", class_='questionnaire')
    if len(questions) == 2:
        sdgs = questions[0].find_all("li")
        if len(sdgs) != 18:  # the correct SDG questionnaire has 17 questions + header
            temp_sdgs = questions[1].find_all("li")
            if len(temp_sdgs) == 18:
                sdgs = temp_sdgs
            else:
                sdgs = []
        if 'selected_question' in sdgs[1].get('class'):
            has_sdg1 = "yes"
        if 'selected_question' in sdgs[2].get('class'):
            has_sdg2 = "yes"
        if 'selected_question' in sdgs[3].get('class'):
            has_sdg3 = "yes"
        if 'selected_question' in sdgs[4].get('class'):
            has_sdg4 = "yes"
        if 'selected_question' in sdgs[5].get('class'):
            has_sdg5 = "yes"
        if 'selected_question' in sdgs[6].get('class'):
            has_sdg6 = "yes"
        if 'selected_question' in sdgs[7].get('class'):
            has_sdg7 = "yes"
        if 'selected_question' in sdgs[8].get('class'):
            has_sdg8 = "yes"
        if 'selected_question' in sdgs[9].get('class'):
            has_sdg9 = "yes"
        if 'selected_question' in sdgs[10].get('class'):
            has_sdg10 = "yes"
        if 'selected_question' in sdgs[11].get('class'):
            has_sdg11 = "yes"
        if 'selected_question' in sdgs[12].get('class'):
            has_sdg12 = "yes"
        if 'selected_question' in sdgs[13].get('class'):
            has_sdg13 = "yes"
        if 'selected_question' in sdgs[14].get('class'):
            has_sdg14 = "yes"
        if 'selected_question' in sdgs[15].get('class'):
            has_sdg15 = "yes"
        if 'selected_question' in sdgs[16].get('class'):
            has_sdg16 = "yes"
        if 'selected_question' in sdgs[17].get('class'):
            has_sdg17 = "yes"

          
    return (has_sdg1, has_sdg2, has_sdg3, has_sdg4, has_sdg5, has_sdg6, has_sdg7, has_sdg8, has_sdg9, has_sdg10, has_sdg11, has_sdg12,
             has_sdg13, has_sdg14, has_sdg15, has_sdg16, has_sdg17)

participants = gc_full_list_soup.find_all("tr")
pdfs = {}

num_pdfs = 0
num_nonpdfs = 0
num_noreport = 0

langregex = re.compile(r'(?<=\()[^\)\(]+(?=\)$)')

print("Getting details of each report ...")
for participant in participants:
    cells = participant.find_all('td')
    sector = cells[1].get_text(strip=True)
    country = cells[2].get_text(strip=True)
    year = cells[3].get_text(strip=True)

    participant_entry_url = gc_base_url + cells[0].a.get('href')
    participant_profile = requests.get(participant_entry_url)
    participant_profile_soup = BeautifulSoup(participant_profile.content, 'lxml')

    (participant_sdgs_1, participant_sdgs_2, participant_sdgs_3, participant_sdgs_4, participant_sdgs_5, participant_sdgs_6, participant_sdgs_7, participant_sdgs_8,
     participant_sdgs_9, participant_sdgs_10, participant_sdgs_11, participant_sdgs_12, 
      participant_sdgs_13, participant_sdgs_14, participant_sdgs_15, participant_sdgs_16, participant_sdgs_17) = check_sdgs_3_13(participant_profile_soup)

    main_body = participant_profile_soup.find("section", class_='main-content-body')
    list_items = main_body.find_all("li")
    found_report = False
    for li in list_items:
        if li.a:
            link = li.a.get('href')
            if "/attachments/" in link:
                if ".pdf" in link:
                    link = link.split('?')[0]
                    num_pdfs += 1
                    language = langregex.search(li.get_text(strip=True))[0]
                    pdfs[link] = { "sector" : sector, "country" : country, "year" : year, "language" : language, "sdgs1" : participant_sdgs_1, "sdgs2" : participant_sdgs_2, "sdgs3" : participant_sdgs_3, "sdgs4" : participant_sdgs_4,
                                  "sdgs5" : participant_sdgs_5, "sdgs6" : participant_sdgs_6, "sdgs7" : participant_sdgs_7, "sdgs8" : participant_sdgs_8, "sdgs9" : participant_sdgs_9, "sdgs10" : participant_sdgs_10,
                                  "sdgs11" : participant_sdgs_11, "sdgs12" : participant_sdgs_12, "sdgs13" : participant_sdgs_13, "sdgs14" : participant_sdgs_14, "sdgs15" : participant_sdgs_15,
                                  "sdgs16" : participant_sdgs_16, "sdgs17" : participant_sdgs_17,}
                    print(".", end='')
                else:
                    num_nonpdfs += 1
                found_report = True
    if not found_report:
        num_noreport += 1
print(" done.")
print("PDFs: %d, non-PDFs: %d, no-report: %d" % (num_pdfs, num_nonpdfs, num_noreport))

Getting details of each report ...
............. done.
PDFs: 13, non-PDFs: 1, no-report: 3


**Saving index of reports so that it can be reused**

In [None]:
import pandas as pd

reports_index_csv_filename = "reports_index.csv"

df_pdfs = pd.DataFrame.from_dict(pdfs, orient='index')
df_pdfs.to_csv(reports_index_csv_filename, sep='\t', encoding='utf-8')

---
**Starting point with newly created reports_index.csv**: This can be used when an index file is available (has been saved previously). Only run this cell if starting from this point, otherwise skip it. 

In [None]:
import pandas as pd

reports_index_csv_filename = "reports_index.csv"

df_pdfs = pd.read_csv(reports_index_csv_filename, sep='\t', encoding='utf-8', index_col=0, dtype={'year': object})
pdfs = df_pdfs.to_dict(orient='index')

In [None]:
df_pdfs

Unnamed: 0,sector,country,year,language,sdgs1,sdgs2,sdgs3,sdgs4,sdgs5,sdgs6,sdgs7,sdgs8,sdgs9,sdgs10,sdgs11,sdgs12,sdgs13,sdgs14,sdgs15,sdgs16,sdgs17
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/497004/original/COP DBR Lafite 2021.pdf,Beverages,France,2021,French,yes,no,yes,yes,yes,yes,yes,yes,yes,no,no,yes,yes,no,yes,yes,no
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/497000/original/BooztABSustainabilityreport2020.pdf,General Retailers,Sweden,2021,English,no,no,no,no,no,no,no,no,yes,no,no,yes,no,no,no,no,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496999/original/CSR_rapport_2020_final.pdf,Financial Services,Denmark,2021,Danish,no,no,no,no,yes,no,no,no,no,no,no,no,yes,no,no,no,no
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496997/original/annual_report_2020_full_links1.pdf,Food & Drug Retailers,Netherlands,2021,English,no,yes,yes,no,yes,no,no,yes,no,yes,no,yes,yes,yes,yes,no,no
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496998/original/CCdP_UNI_global compact_COP_2020.pdf,Food Producers,Italy,2021,Italian,no,no,no,no,no,no,no,yes,no,no,no,no,no,no,yes,no,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496994/original/COP rapport 2020 til Global Compact.pdf,Electricity,Denmark,2021,Danish,no,no,yes,yes,no,no,yes,yes,no,no,no,yes,yes,no,no,no,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496985/original/Pacto Mundial Comunicacion del Progreso 2020.pdf,Pharmaceuticals & Biotechno...,Peru,2021,Spanish,yes,yes,no,yes,yes,no,no,no,no,yes,no,yes,no,no,no,yes,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496986/original/Notificacion Resolucion de Certificacion OEA ILENDER EXPORTADOR.pdf,Pharmaceuticals & Biotechno...,Peru,2021,Spanish,yes,yes,no,yes,yes,no,no,no,no,yes,no,yes,no,no,no,yes,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496987/original/Notificacion Resolucion de Certificacion OEA ILENDER IMPORTADOR.pdf,Pharmaceuticals & Biotechno...,Peru,2021,Spanish,yes,yes,no,yes,yes,no,no,no,no,yes,no,yes,no,no,no,yes,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496988/original/RD1112020fondecyt BM resolucion.pdf,Pharmaceuticals & Biotechno...,Peru,2021,Spanish,yes,yes,no,yes,yes,no,no,no,no,yes,no,yes,no,no,no,yes,yes


---

In [None]:
countries = {}
sectors = {}
years = {}
languages = {}
sdgs3 = 0
sdgs13 = 0
sdgs3_13 = 0

for pdf in pdfs.keys():
    language = pdfs[pdf]["language"]
    year = pdfs[pdf]["year"]
    country = pdfs[pdf]["country"]
    sector = pdfs[pdf]["sector"]
    sdg3 = pdfs[pdf]["sdgs3"] # Remove SDG3/13 
    sdg13 = pdfs[pdf]["sdgs13"]

    sectors[sector] = sectors.get(sector,0) + 1
    countries[country] = countries.get(country,0) + 1
    years[year] = years.get(year,0) + 1
    languages[language] = languages.get(language,0) + 1    
    if sdg3 == "yes":
        sdgs3 += 1
    if sdg13 == "yes":
        sdgs13 += 1
    if sdg3 == "yes" and sdg13 == "yes":
        sdgs3_13 += 1      

In [None]:
print("Number of reports that include SDG 3 or SDG 13 or both: %d, %d, %d respectively" % (sdgs3, sdgs13, sdgs3_13)) #Make a histogram for SDGs distribution; Interactive on years (check: Pre SDG years? Only to use data after 2015)

Number of reports that include SDG 3 or SDG 13 or both: 3, 4, 3 respectively


In [None]:
df_languages = pd.DataFrame(sorted(languages.items(), key=lambda k: k[1], reverse=True), columns=["Language", "Number of reports"])
df_languages

Unnamed: 0,Language,Number of reports
0,Spanish,7
1,English,2
2,Danish,2
3,French,1
4,Italian,1


In [None]:
df_countries = pd.DataFrame(sorted(countries.items(), key=lambda k: k[1], reverse=True), columns=["Country", "Number of reports"])
df_countries

Unnamed: 0,Country,Number of reports
0,Peru,7
1,Denmark,2
2,France,1
3,Sweden,1
4,Netherlands,1
5,Italy,1


In [None]:
df_sectors = pd.DataFrame(sorted(sectors.items(), key=lambda k: k[1], reverse=True), columns=["Sector", "Number of reports"])
df_sectors

Unnamed: 0,Sector,Number of reports
0,Pharmaceuticals & Biotechno...,7
1,Beverages,1
2,General Retailers,1
3,Financial Services,1
4,Food & Drug Retailers,1
5,Food Producers,1
6,Electricity,1


In [None]:
df_years = pd.DataFrame(sorted(years.items(), reverse=True), columns=["Year", "Number of reports"])
df_years

Unnamed: 0,Year,Number of reports
0,2021,13


## 2. Selecting COP reports that match required criteria (up to focus_year, written in focus_language)

In [None]:
selected_sectors = {}
selected_countries = {}
selected_years = {}
selected_countries_years = {}

selected_pdfs = {}

for pdf in pdfs.keys():
    language = pdfs[pdf]["language"]
    year = pdfs[pdf]["year"]
    country = pdfs[pdf]["country"]
    sector = pdfs[pdf]["sector"]

    if language == language_ref[focus_language]['name'] and int(year) <= int(focus_year):
        selected_pdfs[pdf] = pdfs[pdf]
        
        selected_sectors[sector] = selected_sectors.get(sector,0) + 1
        selected_countries[country] = selected_countries.get(country,0) + 1
        selected_years[year] = selected_years.get(year,0) + 1
        if country in selected_countries_years.keys():
            selected_countries_years[country][year] = selected_countries_years[country].get(year,0) + 1
        else:
            selected_countries_years[country] = {year : 1}

In [None]:
print("There are %d reports up to %s written in %s" % (len(selected_pdfs.keys()), focus_year, language_ref[focus_language]['name']))

There are 2 reports up to 2021 written in English


In [None]:
df_selected_countries = pd.DataFrame(sorted(selected_countries.items(), key=lambda k: k[1], reverse=True), columns=["Country", "Number of reports"])
df_selected_countries

Unnamed: 0,Country,Number of reports
0,Sweden,1
1,Netherlands,1


In [None]:
df_selected_sectors = pd.DataFrame(sorted(selected_sectors.items(), key=lambda k: k[1], reverse=True), columns=["Sector", "Number of reports"])
df_selected_sectors

Unnamed: 0,Sector,Number of reports
0,General Retailers,1
1,Food & Drug Retailers,1


In [None]:
df_selected_years = pd.DataFrame(sorted(selected_years.items(), reverse=True), columns=["Year", "Number of reports"])
df_selected_years

Unnamed: 0,Year,Number of reports
0,2021,2


## 3. Downloading PDF file for each COP report that matches required criteria
At this time we've only considered reports written in the focus language and submitted up to end of the focus year.

A folder should be specified as the location where PDFs will be downloaded to ('pdfs_folder' variable below).

If this process has been run before and some files are already available in the specified folder, they won't be downloaded again.

In [None]:
pdfs_folder = "/content/"

In [None]:
import re
filenameregex = re.compile(r'(?<=/)[^$/]+(?=$)')

In [None]:
!pip install --upgrade pip



In [None]:
!python -m pip install PyPDF2



In [None]:
!pip3 install --upgrade setuptools
!pip install requests



In [None]:
import PyPDF2
import shutil
import nltk
import os
import os.path
import requests
from urllib.request import urlretrieve


try:
    os.stat(pdfs_folder)
except:
    os.mkdir(pdfs_folder) 

for pdf in selected_pdfs.keys():
    filename = pdfs_folder + filenameregex.search(pdf)[0]

    if not os.path.isfile(filename):
        print("Saving %s" % (filename))
        file = requests.get(full_gc_url_part1 + pdf, stream=True)
        try:
            with open(filename, 'wb') as out_file: #file handler needed
                 shutil.copyfileobj(file.raw, out_file) #file name closed needed
            del file
        except:
            print("Could not save %s" % (filename))
            continue
    else:
        print("Skipping %s, PDF already available in folder" % (filename))

Saving /content/BooztABSustainabilityreport2020.pdf
Saving /content/annual_report_2020_full_links1.pdf


In [None]:
pdf

'//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/496997/original/annual_report_2020_full_links1.pdf'

## 4. Extracting text from the PDF file of each report

A folder should be specified as the location where text files will be saved at ('txts_folder' variable below).

This process may fail to extract the text from some PDF files.

If this process has been run before and some text files are already available in the specified folder, they won't be processed again.

In [None]:
txts_folder = "/content/"

In [None]:
try:
    os.stat(txts_folder)
except:
    os.mkdir(txts_folder) 

for pdf in selected_pdfs.keys():
    filename = pdfs_folder + filenameregex.search(pdf)[0]
    filenametxt = txts_folder + filenameregex.search(pdf)[0] + ".txt"
    if not os.path.isfile(filenametxt):
        print("Loading %s" % (filename))
        try:
            pdfFileObj = open(filename, 'rb')
            txtFileObj = open(filenametxt, 'w')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num_pages = pdfReader.numPages
        except:
            print("Couldn't load %s" % (filename))
            continue
    
        print("Extracting text from %s" % (filename))
        for num_page in range(0,num_pages):
            try:
                pageObj = pdfReader.getPage(num_page)
                txtFileObj.write(pageObj.extractText())
            except:
                print("Couldn't extract txt %s, page %d" % (filename, num_page))
                continue
        pdfFileObj.close()
        txtFileObj.close()
    else:
        print("Skipping %s, TXT already available in folder" % (filename)) 

Loading /content/BooztABSustainabilityreport2020.pdf
Couldn't load /content/BooztABSustainabilityreport2020.pdf
Loading /content/annual_report_2020_full_links1.pdf
Couldn't load /content/annual_report_2020_full_links1.pdf


-------

# Collecting News & Media PDF documents from Connecting Business Initiative through web-scrapping

Items have been derived from the following link: https://www.connectingbusiness.org/all-topics?field_topic_type_target_id=3&sector=All&country=All&region=All&theme=All&organization=All&sort_bef_combine=created%20DESC&created_op&login_op&sort_by=created&sort_order=DESC&page=0

In [None]:
import requests
import re
from bs4 import BeautifulSoup

cbi_url = "https://www.connectingbusiness.org/all-topics?field_topic_type_target_id=3&sector=All&country=All&region=All&theme=All&organization=All&sort_bef_combine=created%20DESC&created_op&login_op&sort_by=created&sort_order=DESC&page=0"
cbi_base_url = "https://www.connectingbusiness.org"

cbi_home = requests.get(cbi_url)

soup = BeautifulSoup(cbi_home.content, 'lxml')


In [None]:
soup

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<meta charset="utf-8"/>
<noscript><style>form.antibot * :not(.antibot-message) { display: none !important; }</style>
</noscript><script>(function(i,s,o,g,r,a,m){i["GoogleAnalyticsObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,"script","https://www.google-analytics.com/analytics.js","ga");ga("create", "UA-10576069-38", {"cookieDomain":"auto"});ga("set", "anonymizeIp", true);ga("send", "pageview");</script>
<link href="https://www.connectingbusiness.org/all-topics" rel="canonical"/>
<meta content="Drupal 8 (https://www.drupal.org)" na

In [None]:
cbi_news_url = "https://www.connectingbusiness.org/all-topics?field_topic_type_target_id=3&sector=All&country=All&region=All&theme=All&organization=All&sort_bef_combine=created%20DESC&created_op&login_op&sort_by=created&sort_order=DESC&page="

def get_link(page):
  
  r = requests.get(cbi_news_url+str(page))
  soup = BeautifulSoup(r.content, 'lxml')
  return soup

cbi_news_full_soup = BeautifulSoup()
for i in range (1, 10):
  cbi_news_full_soup.append(get_link(i))

In [None]:
cbi_news_full_soup

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<meta charset="utf-8"/>
<noscript><style>form.antibot * :not(.antibot-message) { display: none !important; }</style>
</noscript><script>(function(i,s,o,g,r,a,m){i["GoogleAnalyticsObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,"script","https://www.google-analytics.com/analytics.js","ga");ga("create", "UA-10576069-38", {"cookieDomain":"auto"});ga("set", "anonymizeIp", true);ga("send", "pageview");</script>
<link href="https://www.connectingbusiness.org/all-topics" rel="canonical"/>
<meta content="Drupal 8 (https://www.drupal.org)" na

In [None]:
news_items = cbi_news_full_soup.find_all("div", {"class":"card teaser teaser--tile brand-border-radius"})
pdfs = {}

num_pdfs = 0
num_nonpdfs = 0
num_noreport = 0

langregex = re.compile(r'(?<=\()[^\)\(]+(?=\)$)')

print("Getting details of each report...")
for news_item in news_items:
  cells = news_item.find_all('href')
  news_entry_url = cbi_base_url + news_item.a.get('href') 
  news_entry = requests.get(news_entry_url)
  news_entry_soup = BeautifulSoup(news_entry.content, 'lxml')

  main_body = news_entry_soup.find("main", {"id":"content"})
  li_items = main_body.find_all("li")
  found_report = False
  for li in li_items:
      if li.a:
        link = li.a.get('href')
        if ".pdf" in link:
            num_pdfs += 1
            pdfs[link] = {}
            print(".", end='')
        else:
            num_nonpdfs += 1
  if not found_report:
    num_noreport += 1
print("Done.")
print("PDFs: %d, non-PDFs: %d" % (num_pdfs, num_nonpdfs))


Getting details of each report...
................................Done.
PDFs: 32, non-PDFs: 7


In [None]:
#Save Index
import pandas as pd

reports_index_csv_filename = "reports_cbi_index.csv"

df_pdfs = pd.DataFrame.from_dict(pdfs)
df_pdfs.to_csv(reports_index_csv_filename, sep='\t', encoding='utf-8')

In [None]:
df_pdfs

Unnamed: 0,https://www.connectingbusiness.org/system/files/2021-02/PRESS%20RELEASE-19FEBRRUARY2021-PDRF%20launches%20SIKAP%20digital%20hub%20for%20MSME%20resilience.pdf,https://www.connectingbusiness.org/system/files/2021-02/Newsletter%20La%20Veille%20du%20Patronat%20N%C2%B043%20version%20valide%CC%81e%20ok.pdf,https://www.connectingbusiness.org/system/files/2021-01/Bulletin%20December%202020%20-%20January%202021%20.pdf,https://www.connectingbusiness.org/system/files/2020-12/CBi%20Newsletter%20-%20December%202020_compressed.pdf,https://www.connectingbusiness.org/system/files/2020-12/Bulletin%20October%20-%20November%20%281%29.pdf,https://www.connectingbusiness.org/system/files/2020-11/CBi%20Newsletter%20September-October%202020_0.pdf,https://www.connectingbusiness.org/system/files/2020-11/PRESS%20RELEASE%20-%2009%20NOV%202020%20-%20Digital%20transformation%20key%20to%20recovery%20of%20education%20and%20business%20sectors%20amid%20pandemic.pdf,https://www.connectingbusiness.org/system/files/2020-09/PRESS%20RELEASE%20-%2017%20SEPT%202020%20-%20PDRF%2C%20New%20Zealand%20launch%20partnership%20strengthening%20LGUs%E2%80%99%20COVID-19%20infection%20prevention%20and%20control.pdf,https://www.connectingbusiness.org/system/files/2020-08/CBi%20Newsletter%20-%20202008%20April%20to%20Aug_0.pdf,https://www.connectingbusiness.org/system/files/2020-08/Lebanon%20Flash%20Appeal%20FINAL%2014%20Aug%202020.pdf,https://www.connectingbusiness.org/system/files/2020-08/PSHP%20COVID-19%20Business%20Brief.pdf,https://www.connectingbusiness.org/system/files/2020-06/CBi%20Brochure.pdf,https://www.connectingbusiness.org/system/files/2020-06/Bulletin%20%20December%20January%20.pdf,https://www.connectingbusiness.org/system/files/2020-05/HPPP%20Newsletter%20May2020final.pdf,https://www.connectingbusiness.org/system/files/2020-05/PRESS%20RELEASE%20-%2022%20MAY%202020%20-%20Fundraising%20for%20healthcare%20frontliners%20raises%2092%20million%20pesos%2C%20provides%20200%2C000%20PPEs%20for%2070%20hospitals_0.pdf,https://www.connectingbusiness.org/system/files/2020-05/What%20Good%20Business%20Looks%20Like.pdf,https://www.connectingbusiness.org/system/files/2020-05/COVID-19%20UN%20Multilateral%20Response%20presentation.pdf,https://www.connectingbusiness.org/system/files/2020-05/CBi%20funding%20support%20COVID-19_1.pdf,https://www.connectingbusiness.org/system/files/2020-04/TC%20Harold%20-%20Relief%20Appeal_0.pdf,https://www.connectingbusiness.org/system/files/2020-04/COVID-19-Business-Brief-20032020-EN.pdf,https://www.connectingbusiness.org/system/files/2020-08/CBi%20Newsletter%20January-March.pdf,https://www.connectingbusiness.org/system/files/2020-04/TC%20Harold%20and%20COVID-19%20Vanuatu%202020_0.pdf,https://www.connectingbusiness.org/system/files/2020-04/Bulletin%20February%20-%20March%202020%20%283%29.pdf.pdf,https://www.connectingbusiness.org/system/files/2019-11/November%202019%20Newsletter%20HPPP.pdf,https://www.connectingbusiness.org/system/files/2019-08/East%20Africa%20Newsletter-July%202019.pdf,https://www.connectingbusiness.org/system/files/2019-03/HPPP%20March%202019%20Newsletter%20NEW%201.pdf,https://www.connectingbusiness.org/system/files/2019-01/62115_sasakawaawardnominationform2019.pdf,https://www.connectingbusiness.org/system/files/2018-11/CBi-Turkey-launch-PR17Nov.pdf,https://www.connectingbusiness.org/system/files/2018-09/PDRF%20helps%20school%20in%20Marawi.pdf,https://www.connectingbusiness.org/system/files/2018-09/2019%20Conrad%20N.%20Hilton%20Humanitarian%20Prize.pdf,https://www.connectingbusiness.org/system/files/2018-04/EIC_Prize_HT_for_humanitarian_aid_A0_poster_HR.pdf,https://www.connectingbusiness.org/system/files/2018-04/EICHorizonPrize-De%CC%81pliant_LD.pdf


In [None]:
#Load Index
import pandas as pd

reports_index_csv_filename = "reports_cbi_index.csv"

df_pdfs = pd.read_csv(reports_index_csv_filename, sep='\t', encoding='utf-8', index_col=0, dtype={'year': object})
pdfs = df_pdfs.to_dict(orient='index')

In [None]:
pdfs_folder = "/content/"

In [None]:
import PyPDF2
import shutil
import nltk
import os
import os.path
import requests


try:
    os.stat(pdfs_folder)
except:
    os.mkdir(pdfs_folder) 

for pdf in pdfs.keys():
    filename = pdfs_folder + filenameregex.search(pdf)[0]

    if not os.path.isfile(filename):
        print("Saving %s" % (filename))
        file = requests.get(pdf, stream=True)
        try:
            with open(filename, 'wb') as out_file:
                shutil.copyfileobj(file.raw, out_file)
            del file
        except:
            print("Could not save %s" % (filename))
            continue
    else:
        print("Skipping %s, PDF already available in folder" % (filename))

------
