In [1]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup
from html.parser import HTMLParser

In [2]:
doc_url = '''
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/UN Staff Rules and Regulations.docx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/AC_Accountability_Information Disclosure.docx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/AC_Accountability_Internal Control Framework_Operational Guide.docx
'''.split('\n')

pdf_url = '''
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/UNDP Financial Regulations and Rules 5.7.pdf
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/Engagement with NGOs under Country Based Pooled Funds (French).pdf
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/HR_Insurance Plans_MIP Cigna Refund Claim Form (AR).pdf
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/HR_Insurance Plans_MIP Cigna Prior Approval Form (AR).pdf
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/HR_Service Contracts (SC)_UN-Outsourcing and Impact on Staff (2005).pdf
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/HR_Standards of Conduct_Status Basic Rights and Duties of United Nations Staff Members (2002).pdf
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/FRM_Cost_Recovery_Full_Visual_Guide.pdf
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/SO_Framework of Accountability Explainer.pdf
'''.split('\n')

ppt_url = '''
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PPM_SOP_Project_Origination_and_Integrated_Programming.pptx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PPM_Governance arrangements GEF-GCF Project Documents.pptx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/Pipeline_Management_2014_06.pptx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/FRM_Financial Operations Management_Revenue Management Better Practice Guide070306.ppt
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/Atlas Contract Management Module User Guide.pptx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PPM_Project Management_Pipeline Management.pptx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/FRM_Resource Planning and Cost Recovery_Cost Recovery_GMSvsDPC.pptx
'''.split('\n')

xls_url = '''
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PPM_GEF-Funded Projects_Roles and Accountability_RACI Matrix.xlsx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PSU_Evaluation of Offers_Copy of Preliminary Examination Template - for RFPs and ITBs.xlsx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PSU_RFP-Evaluation-Tool_S.xlsx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/Admin Services_05-Funding-model-01D.xlsx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PSU_Evaluation of Offers_Copy of Preliminary Examination Template - for RFPs and ITBs_FR.xlsx
https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/FRM_Annex 1_DRAFT-2015_Modified Template CERTIFICATION of Common Services.xlsx
'''.split('\n')

In [3]:
extract_endpoint = 'https://docanalyze.azurewebsites.net/api/ExtractText'

In [20]:
def get_links(html):
    soup = BeautifulSoup(html)
    result = []
    links = soup.find_all('a')
    # print(links[:3])
    for link in links:
        meta_doc = {}
        url = link.get('href')
        if not url:
            continue
        link_str = link.string
        meta_doc = {'Link URL': url, 'Link Name': link_str}
        if 'popp.undp.org' in url:
            meta_doc.update({'Link Category': 'POPP'})
        elif 'intranet.undp.org' in url:
            meta_doc.update({'Link Category': 'Intranet'})
        elif 'content.undp.org' in url:
            meta_doc.update({'Link Category': 'UNDP System'})
        elif 'mailto:' in url:
            meta_doc.update({'Link Category': 'e-mail address'})
        else:
             meta_doc.update({'Link Category': 'Other'})

        result.append(meta_doc)
    return result

def process(url):
    r = requests.get(extract_endpoint, params={'url': url, 'htmlFlag': True})
    html = r.text
    print('extract http status: ', r.status_code)
    print('HTML: ', html[:300])
    result = get_links(html=html)
    # print(result[0])
    link_df = pd.DataFrame(result)
    fileName = url.split('/')[-1]
    link_df['FileName'] = fileName
    return link_df

# Try Word Document

In [21]:
url = 'https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/UN Staff Rules and Regulations.docx'
df = process(url)
df.head()

extract http status:  200
HTML:  <html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="date" content="2022-10-04T20:09:00Z" />
<meta name="Total-Time" content="2" />
<meta name="extended-properties:AppVersion" content="16.0000" />
<meta name="meta:paragraph-count" content="558" />
<meta name="custom:ODSRefJobNo" content="1


Unnamed: 0,Link URL,Link Name,Link Category,FileName
0,https://undocs.org/A/RES/72/254,72/254,Other,UN Staff Rules and Regulations.docx
1,https://undocs.org/A/RES/70/244,70/244,Other,UN Staff Rules and Regulations.docx
2,https://undocs.org/A/RES/70/244,70/244,Other,UN Staff Rules and Regulations.docx
3,https://undocs.org/A/RES/70/244,70/244,Other,UN Staff Rules and Regulations.docx
4,https://undocs.org/en/A/RES/73/273,73/273,Other,UN Staff Rules and Regulations.docx


In [22]:
df['Link Category'].value_counts().to_frame()

Unnamed: 0,Link Category
Other,88


# Try PDF

In [25]:
url = 'https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/FRM_Cost_Recovery_Full_Visual_Guide.pdf'
df = process(url)
df.head()

extract http status:  200
HTML:  <html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="date" content="2022-03-23T13:37:42Z" />
<meta name="pdf:PDFVersion" content="1.7" />
<meta name="xmp:CreatorTool" content="Adobe InDesign 17.1 (Windows)" />
<meta name="pdf:hasXFA" content="false" />
<meta name="access_permission:modify_


Unnamed: 0,Link URL,Link Name,Link Category,FileName
0,#nameddest=DESPlan,#nameddest=DESPlan,Other,FRM_Cost_Recovery_Full_Visual_Guide.pdf
1,#nameddest=DESImplement,#nameddest=DESImplement,Other,FRM_Cost_Recovery_Full_Visual_Guide.pdf
2,#nameddest=DESMonitor,#nameddest=DESMonitor,Other,FRM_Cost_Recovery_Full_Visual_Guide.pdf
3,#nameddest=GMSPlan,#nameddest=GMSPlan,Other,FRM_Cost_Recovery_Full_Visual_Guide.pdf
4,#nameddest=GMSImplement,#nameddest=GMSImplement,Other,FRM_Cost_Recovery_Full_Visual_Guide.pdf


In [27]:
df.tail()

Unnamed: 0,Link URL,Link Name,Link Category,FileName
56,https://finance.partneragencies.org/psp/UNDPP1...,https://finance.partneragencies.org/psp/UNDPP1...,Other,FRM_Cost_Recovery_Full_Visual_Guide.pdf
57,https://popp.undp.org/_layouts/15/WopiFrame.as...,https://popp.undp.org/_layouts/15/WopiFrame.as...,POPP,FRM_Cost_Recovery_Full_Visual_Guide.pdf
58,https://popp.undp.org/SitePages/POPPSubject.as...,https://popp.undp.org/SitePages/POPPSubject.as...,POPP,FRM_Cost_Recovery_Full_Visual_Guide.pdf
59,https://undp.sharepoint.com/sites/UNAgencies/S...,https://undp.sharepoint.com/sites/UNAgencies/S...,Other,FRM_Cost_Recovery_Full_Visual_Guide.pdf
60,https://popp.undp.org/_layouts/15/WopiFrame.as...,https://popp.undp.org/_layouts/15/WopiFrame.as...,POPP,FRM_Cost_Recovery_Full_Visual_Guide.pdf


In [26]:
df['Link Category'].value_counts().to_frame()

Unnamed: 0,Link Category
POPP,29
Other,17
Intranet,14
e-mail address,1


# Try PPT 

In [30]:
url = 'https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/FRM_Resource Planning and Cost Recovery_Cost Recovery_GMSvsDPC.pptx'
df = process(url)
df.head()

extract http status:  200
HTML:  <html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="date" content="2017-01-18T02:15:22Z" />
<meta name="cp:revision" content="1" />
<meta name="extended-properties:AppVersion" content="16.0000" />
<meta name="extended-properties:PresentationFormat" content="Letter Paper (8.5x11 in)" />
<m


Unnamed: 0,FileName


In [None]:
df['Link Category'].value_counts().to_frame()

# Try XLS

In [31]:
url = 'https://popp.undp.org/UNDP_POPP_DOCUMENT_LIBRARY/Public/PSU_RFP-Evaluation-Tool_S.xlsx'
df = process(url)
df.head()

extract http status:  200
HTML:  <html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="date" content="2012-03-01T21:39:38Z" />
<meta name="extended-properties:AppVersion" content="14.0300" />
<meta name="dc:creator" content="abdouramane.boureima" />
<meta name="meta:print-date" content="2011-08-09T16:19:51Z" />
<meta name=


Unnamed: 0,FileName
