# Demo-Web crawler

## BeautifulSoup

__Target__

Crawl professors' disciplines from
https://csie.asia.edu.tw/en/associate_professors_2

### Step 1

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import re
# Collect the unique professor e-portfolio URLs linked from the faculty
# listing page; `pages` is consumed by the Step 2 cell.
pages = set()
# Raw string with escaped dots so that '.' only matches a literal dot in the
# host name instead of any character.
portfolio_href = re.compile(r'^http://research\.asia\.edu\.tw/TchEportfolio/')
# BeautifulSoup reads the whole response while constructing the tree, so the
# connection can be closed as soon as the `with` block ends.
with urlopen('https://csie.asia.edu.tw/en/associate_professors_2') as html:
    bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a', href=portfolio_href):
    newPage = link.attrs['href']
    if newPage not in pages:
        # We have encountered a new page
        pages.add(newPage)
        print(newPage)

### Step 2

http://research.asia.edu.tw/TchEportfolio/index_1/wenhsu
```
    ul_items=discipline.find_parent().find_next_siblings('ul')
    print(ul_items)
    break
```

In [None]:
# For every professor page collected in `pages`, print the professor's name
# and the text of the list that follows the "Discipline expertise" label.
the_word = 'Discipline expertise'
# Compile once instead of once per page.
discipline_label = re.compile(the_word)
for page in pages:
    # Close each response as soon as the page has been parsed.
    with urlopen(page) as html2:
        bs2 = BeautifulSoup(html2, 'html.parser')
    professor_name = bs2.find("h1", {"id": "colorlib-logo"})
    professor_discipline = bs2.find(string=discipline_label)
    if professor_name is not None:
        print(professor_name.text.strip())
    if professor_discipline is not None:
        # find_next_sibling() returns None when the label is not followed by
        # a <ul>; skip such pages instead of failing on `.text`.
        discipline_list = professor_discipline.parent.find_next_sibling("ul")
        if discipline_list is not None:
            print(discipline_list.text)


## Scrapy-Ver.1

```
scrapy startproject csie
cd csie/csie/spiders
code professor_spider.py
scrapy crawl professor
scrapy crawl professor -o professors.jl
scrapy crawl professor -o professors.json
scrapy crawl professor -o professors.csv
```

In [None]:
#professor_spider.py
import scrapy
class ProfessorSpider(scrapy.Spider):
    """Spider that lists the professor names found on the faculty page.

    Run with ``scrapy crawl professor``. Each yielded item is a dict with a
    single ``name`` key.
    """

    name = "professor"
    start_urls = [
        "https://csie.asia.edu.tw/zh_tw/associate_professors_2",
    ]

    def parse(self, response):
        """Yield one ``{'name': ...}`` item per professor-name span.

        Args:
            response: The downloaded faculty listing page.

        Yields:
            dict: ``{'name': <text of the name span>}``.
        """
        # The expression is XPath, so it goes through response.xpath() rather
        # than response.css(), which expects CSS selector syntax.
        name_nodes = response.xpath(
            '//span[@class="i-member-value member-data-value-name"]/text()'
        )
        for professor in name_nodes:
            yield {
                'name': professor.get(),
            }

### settings.py
```
FEED_EXPORT_ENCODING = 'utf-8'
```

## Scrapy-Ver.2:BeautifulSoup

```
scrapy crawl discipline -o discipline.jl
```

In [None]:
#discipline_spider.py
import scrapy
from bs4 import BeautifulSoup 
import re
class DisciplineSpider(scrapy.Spider):
    """Spider that follows each professor link and extracts their disciplines.

    The listing page is parsed with BeautifulSoup to find e-portfolio links;
    each linked page is then parsed for the professor's name and the list
    that follows the "Discipline expertise" label.
    """

    name = "discipline"

    def start_requests(self):
        """Yield the initial request for the faculty listing page."""
        urls = [
            'https://csie.asia.edu.tw/zh_tw/associate_professors_2',
        ]
        for url in urls:
            # Log before yielding so the message is emitted when the request
            # is created rather than when the generator is next resumed.
            self.log(f'resuest url {url}')
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one request per professor e-portfolio link on the page."""
        bs = BeautifulSoup(response.text, 'lxml')
        # Raw string with escaped dots so '.' only matches a literal dot.
        portfolio_href = re.compile(
            r'^http://research\.asia\.edu\.tw/TchEportfolio/'
        )
        for link in bs.find_all('a', href=portfolio_href):
            sub_url = link.attrs['href']
            yield scrapy.Request(url=sub_url, callback=self.parse_discipline)

    def parse_discipline(self, response):
        """Yield the professor's name and discipline text from a profile page.

        Pages without the name heading are skipped. When the discipline label
        or the list after it is missing, ``discipline`` is an empty string.

        Yields:
            dict: ``{'name': str, 'discipline': str}``.
        """
        bs2 = BeautifulSoup(response.body, 'html.parser')
        the_word = 'Discipline expertise'
        professor_name = bs2.find("h1", {"id": "colorlib-logo"})
        if professor_name is None:
            # Without the heading there is nothing to key the item on.
            return
        professor_discipline = bs2.find(string=re.compile(the_word))
        discipline_list = None
        if professor_discipline is not None:
            # find_next_sibling() returns None when no <ul> follows the label.
            discipline_list = professor_discipline.parent.find_next_sibling("ul")
        yield {
            'name': professor_name.text.strip(),
            'discipline': discipline_list.text if discipline_list is not None else '',
        }


## Scrapy-Ver.3: xpath selector

```
scrapy crawl discipline2 -o discipline2.csv
scrapy crawl discipline2 -o discipline2.json
scrapy crawl discipline2 -o discipline2.jl
```

In [None]:
#discipline2_spider.py
import scrapy
class Discipline2Spider(scrapy.Spider):
    """Spider that extracts professors' disciplines using XPath selectors only.

    Starts at the faculty listing page, follows every link whose href
    contains ``research.asia.edu.tw`` and emits one item per profile page
    that has a name heading.
    """

    name = "discipline2"

    def start_requests(self):
        """Yield the initial request for each listing page."""
        listing_pages = (
            'https://csie.asia.edu.tw/zh_tw/associate_professors_2',
        )
        for listing_url in listing_pages:
            yield scrapy.Request(url=listing_url, callback=self.parse)

    def parse(self, response):
        """Yield a request for every research-site link on the listing page."""
        href_nodes = response.xpath(
            '//a[contains(@href, "research.asia.edu.tw")]/@href'
        )
        for href in href_nodes.extract():
            yield scrapy.Request(href, callback=self.parse_discipline)

    def parse_discipline(self, response):
        """Yield the url, name and discipline list for one profile page.

        Nothing is yielded when the name heading is missing or empty.
        """
        raw_name = response.xpath('//h1[@id="colorlib-logo"]/a/text()').get()
        discipline_items = response.xpath(
            '//strong[contains(text(),"Discipline expertise")]/following::ul[1]//li/text()'
        ).extract()
        # Guard clause: skip pages without a usable name.
        if not raw_name:
            return
        yield {
            'url': response.request.url,
            'name': raw_name.strip(),
            'discipline': discipline_items,
        }

## Selenium

https://colab.research.google.com/github/restrepo/ComputationalMethods/blob/master/tools/selenium.ipynb

Info:

https://chocolatey.org/install
```
choco install chromedriver
```

C:\ProgramData\chocolatey\lib\chromedriver\tools\chromedriver.exe

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup 
import re

# Render the faculty listing page in Chrome, then collect the unique
# professor e-portfolio URLs from the rendered HTML.

# Raw string: in a normal string literal the '\t' in '\tools' is a TAB
# character, which silently corrupts the Windows path.
chromedriver_path = r'C:\ProgramData\chocolatey\lib\chromedriver\tools\chromedriver.exe'
# NOTE(review): newer Selenium 4 releases expect the driver path via a
# Service object instead of `executable_path` - confirm against the
# installed Selenium version.
browser = webdriver.Chrome(executable_path=chromedriver_path)
try:
    browser.get("https://csie.asia.edu.tw/en/associate_professors_2")
    bs = BeautifulSoup(browser.page_source, 'html.parser')
finally:
    # Always release the browser and the chromedriver process, even when
    # loading or parsing the page fails.
    browser.quit()
pages = set()

# Raw string with escaped dots so '.' only matches a literal dot.
portfolio_href = re.compile(r'^http://research\.asia\.edu\.tw/TchEportfolio/')
for link in bs.find_all('a', href=portfolio_href):
    newPage = link.attrs['href']
    if newPage not in pages:
        # We have encountered a new page
        pages.add(newPage)
        print(newPage)