## 1. Get webpage using *requests*

In [1]:
import requests

req = requests.get('https://en.wikipedia.org/wiki/Data_science')


In [2]:
req

<Response [200]>

In [3]:
req.content

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Data science - Wikipedia</title>\n<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=document.cookie.m

In [4]:
req.encoding

'UTF-8'

In [5]:
webpage = req.text
type(webpage)

str

In [6]:
filename = 'test.txt'
with open(filename, "wb") as f:
    f.write(webpage.encode())    

In [7]:
print(webpage)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Data science - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=document.cookie.match(/(

## 2. Get specific contents using BeatifulSoup

In [8]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(webpage, 'html.parser')

### 2.1 Prettify the webpage

In [9]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Data science - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=docume

### 2.2 Get the first paragraph

You can try to remove "attrs" to see how it works.

In [10]:
paragraph = soup.find('p')

In [11]:
paragraph

<p>
			Pages for logged out editors <a aria-label="Learn more about editing" data-mw="interface" href="/wiki/Help:Introduction"><span>learn more</span></a>
</p>

In [12]:
paragraph = soup.find('p', attrs={"class":False})

In [13]:
paragraph

<p>
			Pages for logged out editors <a aria-label="Learn more about editing" data-mw="interface" href="/wiki/Help:Introduction"><span>learn more</span></a>
</p>

### 2.3 Get all the links in this paragraph which point to other webpages

In [14]:
paragraph.find_all('a')

[<a aria-label="Learn more about editing" data-mw="interface" href="/wiki/Help:Introduction"><span>learn more</span></a>]

In [15]:
paragraph.find_all('a', attrs={"title":True})
#paragraph.find_all('a', attrs={"title":False})

[]

In [16]:
data = {"title":[], "href":[]}
for link in paragraph.find_all('a', attrs={"title":True}):
    data["title"].append(link["title"])
    data["href"].append(link["href"])

In [17]:
import pandas as pd
df = pd.DataFrame(data)

In [18]:
df

Unnamed: 0,title,href


## 3. Get the contents from all the webpages

In [19]:
webpages = []
head = "https://en.wikipedia.org"
for href in data["href"]:
    link = head + href
    req = requests.get(link)
    webpage = req.text
    webpages.append(webpage)

In [20]:
len(webpages)

0

## 4. Futher readings

### 4.1 robots.txt

Check robots.txt of the website to find out what are allowed.

In [21]:
req = requests.get("https://en.wikipedia.org/robots.txt")
webpage = req.text

In [22]:
soup = BeautifulSoup(webpage, 'html.parser')
print(soup.text)

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: 

### 4.2 Sleep

You would be banned, if you scrape a website too fast. Let your crawler sleep for a while after each round.

In [23]:
import time

for i in range(5):
    time.sleep(3)
    print(i)

0
1
2
3
4


### 4.3 Randomness

Pausing for extactly three seconds after each round is too robotic. Let's add some randomness to make your crawler looks more like a human.

In [24]:
from random import random

for i in range(5):
    t = 1 + 2 * random()
    time.sleep(t)
    print(i)

0
1
2
3
4


### 4.4 Separate the codes for scraping from the ones for data extraction

1. Scraping is more vulnerable. Nothing is more annoying than your crawler breaks because of a bug in the data extraction part.  
2. You never know what data you would need for modeling. So keep all the webpages you obtain. 

### 4.5 Chrome Driver and Selenium

### 4.5.1 Start Chrome Service

In [25]:
from selenium.webdriver.chrome.service import Service

In [26]:
service = Service(r"C:\Users\jingd\OneDrive\Documents\DownloadPrograms\chromedriver\chromedriver.exe")
service.start()

### 4.5.2 Define driver

In [27]:
from selenium import webdriver

In [28]:
driver = webdriver.Remote(service.service_url)

### 4.5.3 Get Webpage

In [29]:
driver.get("http://www.indeed.com/")

### 4.5.4 Input position

In [30]:
elem = driver.find_element("id", "text-input-what")
elem.clear()
elem.send_keys("data scientist")

### 4.5.5 Return

In [31]:
from selenium.webdriver.common.keys import Keys

In [32]:
elem.send_keys(Keys.RETURN)

### 4.5.6 Get current link

These are the tools make your crawler act even more like a human.

In [33]:
print(driver.current_url)

https://www.indeed.com/jobs?q=data+scientist&l=Houston%2C+TX&from=searchOnHP&vjk=71e6982aeaf707cc


### 4.5.7 Quit Driver

In [34]:
driver.quit()

In [35]:
import time
from selenium import webdriver

# DeprecationWarning: executable_path has been deprecated, please pass in a Service object
#driver = webdriver.Chrome(r"C:\Users\jingd\OneDrive\Documents\DownloadPrograms\chromedriver\chromedriver.exe")  # Optional argument, if not specified will search path.

ser = Service(r"C:\Users\jingd\OneDrive\Documents\DownloadPrograms\chromedriver\chromedriver.exe")

op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)

driver.get('http://www.google.com/');

time.sleep(5) # Let the user actually see something!

search_box = driver.find_element("name", "q")

search_box.send_keys('Techlent')

search_box.submit()

time.sleep(5) # Let the user actually see something!

driver.quit()