<a href="https://colab.research.google.com/github/leopard8k/IRCC_Scraping/blob/master/IRCC_BasicQA_plus_FineTuning_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install html2text

Collecting html2text
  Downloading https://files.pythonhosted.org/packages/ae/88/14655f727f66b3e3199f4467bafcc88283e6c31b562686bf606264e09181/html2text-2020.1.16-py3-none-any.whl
Installing collected packages: html2text
Successfully installed html2text-2020.1.16


In [2]:
from bs4 import BeautifulSoup
import html2text
import re
import requests
import time
import tqdm
import urllib.request


In [3]:
# Site root, and the path prefix shared by every IRCC page we want to crawl.
CND_SITE = 'https://www.canada.ca'
IRCC_SUFFIX = '/en/immigration-refugees-citizenship/'
url = CND_SITE + IRCC_SUFFIX

# Link selection: follow only hrefs that start with the IRCC prefix, and drop
# any href containing a '#'.
# NOTE: `filter` shadows the Python builtin; the name is kept because
# get_dict_hrefs() reads it from module scope.
filter = re.compile('^' + IRCC_SUFFIX + '.*')
exclude_some = re.compile("#")

# Crawl state, keyed by site-relative path. Seeded with the IRCC landing page.
scraped_uris = {
  IRCC_SUFFIX: dict(
    url=url,
    broken=False,
    visited=False,
    children={},
    text="",
  ),
}

In [4]:
def get_dict_hrefs(uri, timeout=30):
  """Visit every not-yet-visited link in ``uri`` and record what was found.

  For each key of ``uri`` whose entry is not marked ``"visited"``, the page at
  ``CND_SITE + key`` is fetched. On HTTP 200 the page text (with ``<head>`` and
  ``<footer>`` removed) is stored under ``"text"``, and every anchor whose
  ``href`` matches the module-level ``filter`` pattern, does not match
  ``exclude_some`` and is not already a key of ``uri`` is added as a new,
  empty entry. Any other status code, or a network-level failure, marks the
  entry ``"broken"``.

  Args:
    uri: dict mapping site-relative paths to per-page dicts; mutated in place.
    timeout: seconds to wait on the server before giving up on a page.

  Returns:
    The same ``uri`` dict that was passed in.
  """
  # Iterate over a snapshot of the keys: new links are inserted into `uri`
  # while the loop runs.
  for link in tqdm.tqdm(uri.copy()):
    entry = uri[link]
    if entry.get("visited"):
      continue
    entry["visited"] = True

    try:
      # Without a timeout a single stalled connection blocks the crawl forever.
      response = requests.get(CND_SITE + link, timeout=timeout)
    except requests.RequestException:
      # A connection error or timeout on one page must not abort the crawl.
      entry["broken"] = True
      continue

    if response.status_code != 200:
      entry["broken"] = True
      continue

    entry["broken"] = False
    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup(['head', 'footer']):
      tag.decompose()
    entry["text"] = soup.get_text()

    # Queue newly discovered links as empty entries; a later pass visits them.
    for anchor in soup.find_all('a', href=filter):
      href = anchor['href']
      if not exclude_some.search(href) and href not in uri:
        uri[href] = {}

  return uri

### Repeat the following until no new members are added

In [5]:
# Crawl pass after pass until a pass adds no new links.
# The sizes are compared directly (before vs. after each pass) rather than
# priming the loop with `len1 + 1`: with that priming, a first pass that
# discovered exactly one link would end the crawl with that link unvisited.
while True:
  len1 = len(scraped_uris)
  get_dict_hrefs(scraped_uris)  # mutates scraped_uris in place
  len2 = len(scraped_uris)
  if len2 == len1:
    break
len2

100%|██████████| 1/1 [00:00<00:00,  3.40it/s]
100%|██████████| 37/37 [00:04<00:00,  8.86it/s]
100%|██████████| 631/631 [02:14<00:00,  4.68it/s]
100%|██████████| 3706/3706 [12:30<00:00,  4.94it/s]
100%|██████████| 5791/5791 [08:26<00:00, 11.43it/s]
100%|██████████| 6458/6458 [02:50<00:00, 37.87it/s]
100%|██████████| 6489/6489 [00:06<00:00, 988.92it/s]  
100%|██████████| 6495/6495 [00:00<00:00, 7338.42it/s] 


6495

In [6]:
# Converter used when writing the pages to disk. Images, links, emphasis
# markup and tables are all switched off.
h2t = html2text.HTML2Text()
for _option in ("ignore_images", "ignore_links", "ignore_emphasis", "ignore_tables"):
  setattr(h2t, _option, True)


## Download

In [7]:
count=100000
DATA_DIR='./IRCC_data'

!rm -fr ./IRCC_data 
!mkdir ./IRCC_data

for link in tqdm.tqdm(scraped_uris, desc="Downloading"):
  html_code = scraped_uris[link].get('text','')
  if html_code:
    count += 1
    with open(DATA_DIR+'/file-'+str(count)+'.html.txt', "w") as outfile:
      outfile.write(h2t.handle(html_code))


Downloading: 100%|██████████| 6495/6495 [00:30<00:00, 210.97it/s]


In [8]:
# Make sure a GPU is attached to this runtime: the reader cells below are
# created with use_gpu=True.
!nvidia-smi

Tue Mar  9 12:42:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest master of Haystack
# NOTE(review): master is a moving target. The imports used later in this
# notebook (haystack.preprocessor, haystack.reader.farm, haystack.pipeline, ...)
# depend on the package layout at install time -- consider pinning to a tag or
# commit. TODO confirm which release matches.
!pip install git+https://github.com/deepset-ai/haystack.git
# NOTE(review): presumably pinned to work around a dependency conflict -- TODO confirm.
!pip install urllib3==1.25.4


Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-s1q2fs5b
  Running command git clone -q https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-s1q2fs5b
Collecting farm==0.6.2
[?25l  Downloading https://files.pythonhosted.org/packages/5b/3d/91c184813b8205c697c13117154f3216f01709291155cc9ee88628cb63d2/farm-0.6.2-py3-none-any.whl (207kB)
[K     |████████████████████████████████| 215kB 8.0MB/s 
[?25hCollecting fastapi
[?25l  Downloading https://files.pythonhosted.org/packages/9f/33/1b643f650688ad368983bbaf3b0658438038ea84d775dd37393d826c3833/fastapi-0.63.0-py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 6.5MB/s 
[?25hCollecting uvicorn
[?25l  Downloading https://files.pythonhosted.org/packages/c8/de/953f0289508b1b92debdf0a6822d9b88ffb0c6ad471d709cf639a2c8a176/uvicorn-0.13.4-py3-none-any.whl (46kB)
[K     |████████████████████████████████| 51kB 6.9MB/s 
[?25hCollecting

In [10]:
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

03/09/2021 12:46:01 - INFO - faiss.loader -   Loading faiss with AVX2 support.
03/09/2021 12:46:01 - INFO - faiss.loader -   Loading faiss.
03/09/2021 12:46:03 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


## Document Store


### Start an Elasticsearch server
You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g., in Colab notebooks), you can instead download the Elasticsearch release archive and run it directly.

In [11]:
# Recommended: Start Elasticsearch using Docker
#! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2

In [12]:
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
! sleep 30

In [13]:
# Open the document store on the local Elasticsearch node. Documents are kept
# in the "document" index; no credentials are supplied.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

03/09/2021 12:47:23 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.089s]
03/09/2021 12:47:23 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:200 request:0.324s]
03/09/2021 12:47:23 - INFO - elasticsearch -   PUT http://localhost:9200/label [status:200 request:0.213s]


## Preprocessing of documents

Haystack provides a customizable pipeline for:
 - converting files into texts
 - cleaning texts
 - splitting texts
 - writing them to a Document Store


In [14]:

# The text files written from the crawl are the corpus to index.
doc_dir = DATA_DIR

# Turn the files into dictionaries for the document store.
# An optional cleaning function (str -> str, e.g. to strip footers) can be
# supplied to convert_files_to_dicts(); none is used here.
# If your texts come from somewhere else (e.g. a DB), skip this call and build
# the dictionaries yourself. The default format is:
# {
#    'text': "<DOCUMENT_TEXT_HERE>",
#    'meta': {'name': "<DOCUMENT_NAME_HERE>", ...}
# }
# Extra key-value pairs may be added; they are indexed as Elasticsearch fields
# and can be used for filtering or shown in the responses.
dicts = convert_files_to_dicts(
    dir_path=doc_dir,
    split_paragraphs=False,
)

# Show the first 3 entries as a sanity check.
print(dicts[:3])

# Index everything in the document store.
document_store.write_documents(dicts)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-102471.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-102835.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-104872.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-100873.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-104954.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-102501.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-102823.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-105185.html.txt
03/09/2021 12:47:41 - INFO - haystack.preprocessor.utils -   Converting IRCC_data/file-101704.html.txt
03/09/20

[{'text': 'Skip to main content Skip to "About government" Language selection Français fr\n/ Gouvernement du Canada Search Search Canada.ca Search Menu Main Menu Jobs\nand the workplace Immigration and citizenship Travel and tourism Business and\nindustry Benefits Health Taxes Environment and natural resources National\nsecurity and defence Culture, history and sport Policing, justice and\nemergencies Transport and infrastructure Canada and the world Money and\nfinances Science and innovation You are here: Canada.ca Immigration, Refugees\nand Citizenship Canada Corporate information Publications and Manuals\nOperational instructions and guidelines Operational Bulletins 2012 Operational\nBulletin 460 - July 27, 2012 This section contains policy, procedures and\nguidance used by IRCC staff. It is posted on the department’s website as a\ncourtesy to stakeholders. Acceptance of Handling of Public Money Receipts at\nLocal CIC Offices in Canada for Global Case Management Systems Applications

03/09/2021 12:48:17 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:3.925s]
03/09/2021 12:48:19 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.280s]
03/09/2021 12:48:22 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.248s]
03/09/2021 12:48:24 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.248s]
03/09/2021 12:48:26 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.150s]
03/09/2021 12:48:29 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.217s]
03/09/2021 12:48:31 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.074s]
03/09/2021 12:48:33 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.131s]


## Initialize Retriever, Reader, & Pipeline

### Retriever



In [15]:
# Sparse retriever that queries the Elasticsearch document store populated above.
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store.

# from haystack.retriever.sparse import TfidfRetriever
# retriever = TfidfRetriever(document_store=document_store)

## Get some SQuAD-like data for fine-tuning

In [16]:
!wget https://dl.fbaipublicfiles.com/UnsupervisedQA/UnsupervisedQAData.tar.gz -q
!mkdir ft_data
!tar xzf UnsupervisedQAData.tar.gz -C ft_data

### Reader



#### FARMReader

In [20]:
# Load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models).
# use_gpu=True: check the nvidia-smi cell above to confirm a GPU is attached.

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

03/09/2021 12:59:25 - INFO - farm.utils -   Using device: CUDA 
03/09/2021 12:59:25 - INFO - farm.utils -   Number of GPUs: 1
03/09/2021 12:59:25 - INFO - farm.utils -   Distributed Training: False
03/09/2021 12:59:25 - INFO - farm.utils -   Automatic Mixed Precision: None
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
03/09/2021 12:59:35 - INFO - farm.utils -   Using device: CUDA 
03/09/2021 12:59:35 - INFO - farm.utils -   Number of GPUs: 1
03/09/2021 12:59:35 - INFO - farm.utils -   Distributed Training: False
03/09/2021 12:59:35 - INFO - farm.utils -   Automatic Mixed Precision: None
03/09/2021 12:59:35 - INFO - farm.infer -   Got ya 2 parallel workers to do inference ...
03/09/2021 12:59:35 - INFO - farm.infer -    0    0 
03/

## Fine tune

In [None]:
# Fine-tune the reader for one epoch on the UnsupervisedQA files unpacked into
# ./ft_data; save_dir is set to "my_model".
# NOTE(review): the recorded log shows 782556 training dicts, so expect a long
# run -- presumably a subset would do for a quick trial; TODO confirm.
reader.train(data_dir='./ft_data/UnsupervisedQAData',
             dev_filename="unsupervised_qa_dev.json",
             train_filename="unsupervised_qa_train.json",
             test_filename="unsupervised_qa_test.json",
             use_gpu=True, n_epochs=1, save_dir="my_model")

03/09/2021 13:01:32 - INFO - farm.utils -   Using device: CUDA 
03/09/2021 13:01:32 - INFO - farm.utils -   Number of GPUs: 1
03/09/2021 13:01:32 - INFO - farm.utils -   Distributed Training: False
03/09/2021 13:01:32 - INFO - farm.utils -   Automatic Mixed Precision: None
Preprocessing Dataset ft_data/UnsupervisedQAData/unsupervised_qa_train.json:   0%|          | 0/782556 [00:00<?, ? Dicts/s]

#### TransformersReader

In [None]:
# Alternative:
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

### Pipeline



In [None]:
# Combine the reader and the retriever into one question-answering pipeline.
from haystack.pipeline import ExtractiveQAPipeline
pipe = ExtractiveQAPipeline(reader, retriever)

## Ask a question!

In [None]:
# top_k_retriever / top_k_reader set how many candidates each stage returns.
# A higher top_k_retriever gives better answers, but is slower.
prediction = pipe.run(
    query="is there a family visa?",
    top_k_retriever=10,
    top_k_reader=5,
)
print_answers(prediction, details="minimal")