# Demo notebook for the scraping pipeline

In [1]:
import pandas as pd
from scraping_lib import scrape_documents
from text_wrangling_utils import build_segments_df



In [2]:
# Source documents for the demo: one PDF guidance document followed by
# four gov.uk HTML pages from the Immigration Rules.
url_list = [
    "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/900030/2020-07-13_Tier_2_Policy_Guidance.pdf",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-a-attributes",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-c-maintenance-funds",
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-b-english-language",
]

In [3]:
# Scrape every URL in url_list and collect the results into a DataFrame
# (one row per document in the output below).
# NOTE(review): out_path is presumably the directory where downloaded files
# are stored -- a later cell reads the PDF back from "filedump/"; confirm
# against scraping_lib.
scraped_records = scrape_documents(url_list, out_path="filedump/")
scrape_df = pd.DataFrame(scraped_records)
scrape_df.head()

scraping 1/5: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/900030/2020-07-13_Tier_2_Policy_Guidance.pdf
scraping 2/5: https://www.gov.uk/guidance/immigration-rules/immigration-rules-part-6a-the-points-based-system
scraping 3/5: https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-a-attributes
scraping 4/5: https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-c-maintenance-funds
scraping 5/5: https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-b-english-language


Unnamed: 0,URL,media type,title,text_dump,text_segmented,hyperlinks_dump,timestamp
0,https://assets.publishing.service.gov.uk/gover...,application/pdf,2020-07-13_Tier_2_Policy_Guidance.pdf,\n\n \n \n \n \n \n \n \n \n \n\n \n\nTier ...,title subtitle subsubtitle text_...,[],2020-08-20T07:59:07+00:00
1,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules part 6A: the points-based sy...,\nImmigration Rules part 6A: the points-based ...,"[(text, Immigration Rules part 6A: the points-...",[],2020-08-20T08:00:56+00:00
2,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules Appendix A: attributes,\nImmigration Rules Appendix A: attributes\nPo...,"[(text, Immigration Rules Appendix A: attribut...",[http://www.oanda.com],2020-08-20T08:00:59+00:00
3,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules Appendix C: maintenance (funds),\nImmigration Rules Appendix C: maintenance (f...,"[(text, Immigration Rules Appendix C: maintena...",[],2020-08-20T08:01:02+00:00
4,https://www.gov.uk/guidance/immigration-rules/...,text/html,Immigration Rules Appendix B: English language,\nImmigration Rules Appendix B: English langua...,"[(text, Immigration Rules Appendix B: English ...",[],2020-08-20T08:01:04+00:00


Segmented text for the test PDF
---

In [4]:
# Row 0 is the PDF document; its "text_segmented" cell is displayed as-is.
pdf_segments = scrape_df.loc[0, "text_segmented"]
pdf_segments

Unnamed: 0,title,subtitle,subsubtitle,text_type,text
0,0.0,0.0,0.0,subtitle,\nTIER 2 \nVersion \n07/20 \n
1,1.0,0.0,0.0,title,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...
2,1.0,0.0,0.0,main_text,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...
3,1.0,0.0,0.0,subsubtext,Tier 2 Policy Guidance (Version 07/2020) \nPag...
4,1.0,0.0,0.0,subtext,\n \n \n 2 \n \n \n \n 3 \n \n \n \n 4 \nJanu...
5,1.0,0.0,0.0,subsubsubsubtext,\n \n \n ...
6,1.0,0.0,1.0,subsubtitle,\nEligibility \n
7,1.0,0.0,1.0,main_text,\n5. You must have a job offer and a Certifi...
8,1.0,0.0,1.0,subsubtext,\n \n \nTier 2 Policy Guidance (Version 07/20...
9,1.0,0.0,1.0,subtext,\n1 A ‘higher education provider’ is an educa...


Segmented text for the test HTML
---

In [5]:
# Row 1 is the first HTML document; its "text_segmented" cell is passed
# through build_segments_df() to get a tabular view.
html_segments = scrape_df.loc[1, "text_segmented"]
build_segments_df(html_segments)

Unnamed: 0,section,subsection,is table,string
0,0,0,False,Immigration Rules part 6A: the points-based sy...
1,1,0,False,245AAA.General requirements for indefinite lea...
2,1,0,False,The following rules apply to all requirements ...
3,2,0,False,245AA.Documents not submitted with applications
4,2,0,False,\n(a) Subject to sub-paragraph (b) and where o...
...,...,...,...,...
143,14,4,False,To qualify for leave to remain as a Tier 4 (Ch...
144,14,5,False,245ZZD. Period and conditions of grant
145,14,5,False,\n(a) Where the applicant is under the age of ...
146,14,6,False,"245ZZE Specified documents, details and requir..."


## Running `scrape_pdf_by_line()` directly on a downloaded file

In [7]:
from scraping_lib import pdf_dict_to_outputformat, scrape_pdf_by_line

# Run the line-by-line PDF scraper on the local copy of the PDF under
# "filedump/", wrap its result in a DataFrame, and convert that to the
# segmented output format for display.
dict_lbl = scrape_pdf_by_line(
    "filedump/2020-07-13_Tier_2_Policy_Guidance.pdf"
)
df_lbl = pd.DataFrame(dict_lbl)
pdf_dict_to_outputformat(df_lbl)

Unnamed: 0,title,subtitle,subsubtitle,text_type,text
0,0.0,0.0,0.0,subtitle,\nTIER 2 \nVersion \n07/20 \n
1,1.0,0.0,0.0,title,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...
2,1.0,0.0,0.0,main_text,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...
3,1.0,0.0,0.0,subsubtext,Tier 2 Policy Guidance (Version 07/2020) \nPag...
4,1.0,0.0,0.0,subtext,\n \n \n 2 \n \n \n \n 3 \n \n \n \n 4 \nJanu...
5,1.0,0.0,0.0,subsubsubsubtext,\n \n \n ...
6,1.0,0.0,1.0,subsubtitle,\nEligibility \n
7,1.0,0.0,1.0,main_text,\n5. You must have a job offer and a Certifi...
8,1.0,0.0,1.0,subsubtext,\n \n \nTier 2 Policy Guidance (Version 07/20...
9,1.0,0.0,1.0,subtext,\n1 A ‘higher education provider’ is an educa...


## A few pathological test cases

In [None]:
# Deliberately awkward inputs for the scraper.
test_urls = [
    # A PDF document.
    "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/900030/2020-07-13_Tier_2_Policy_Guidance.pdf",
    # A gov.uk HTML page.
    "https://www.gov.uk/guidance/immigration-rules/immigration-rules-appendix-c-maintenance-funds",
    # A JPEG image.
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/9e/Vincent_van_Gogh_-_The_raising_of_Lazarus_%28after_Rembrandt%29_-_Google_Art_Project.jpg/1280px-Vincent_van_Gogh_-_The_raising_of_Lazarus_%28after_Rembrandt%29_-_Google_Art_Project.jpg",
    # A non-gov.uk site addressed over plain http.
    "http://www.wikipedia.org",
    # A string that is not a URL at all.
    "thisurldoesnotexist",
]

# Unlike the main demo cell, no out_path is passed here.
test_scrape = scrape_documents(test_urls)

In [None]:
# Tabulate the results of the pathological scrape and preview the first rows.
test_scrape_df = pd.DataFrame(data=test_scrape)
test_scrape_df.head(n=5)