# Static site crawler using Beautiful Soup and Requests

- Static site
- Dynamic site
- API
- Auth


Source (Beautiful Soup documentation):

https://beautiful-soup-4.readthedocs.io/en/latest/

In [1]:
!pip install requests
!pip install beautifulsoup4



In [110]:
import requests

URL = "https://www.varzesh3.com/"
# GET request
# requests waits indefinitely unless a timeout is given, so always pass one.
page = requests.get(URL, timeout=10)
# page = requests.get(URL, auth=('user', 'pass'), timeout=10) # HTTP Basic Auth

# Stop here on a 4xx/5xx answer instead of printing an error page as if it were the site.
page.raise_for_status()

# .text is the body decoded to str, .content is the same body as raw bytes.
# Only a preview is printed: the full page is very large and would bury the notebook.
print(page.text[:500])
print(page.content[:500])


<!DOCTYPE html>
<html lang="fa" prefix="og: http://ogp.me/ns#">
<head>
    <title>مرجع فوتبال و ورزش | ورزش سه</title>
    <meta charset="utf-8" />
    
    <meta name="viewport" content="width=1170" />
    
    <meta name="description" content="پایگاه اطلاع رسانی ورزشی برای فارسی زبانان كه اخبار حوزه ورزش (فوتبال،والیبال ،بسكتبال و...)ونتایج بازیها و جداول لیگ های ورزشی را بصورت زنده ارائه می کند" />





<link href="https://www.varzesh3.com/" rel="canonical" />

<meta property="og:site_name" content="ورزش سه">
<meta property="og:title" content="مرجع فوتبال و ورزش" />
<meta property="og:url" content="https://www.varzesh3.com/" />
<meta property="og:description" content="پایگاه اطلاع رسانی ورزشی برای فارسی زبانان كه اخبار حوزه ورزش (فوتبال،والیبال ،بسكتبال و...)ونتایج بازیها و جداول لیگ های ورزشی را بصورت زنده ارائه می کند" />
<meta property="og:type" content="website" />
<meta property="og:locale" content="fa_IR" />
<meta property="og:image" content="https://st

In [111]:
# Small sample document used by the Beautiful Soup examples below:
# a title, one "title" paragraph and two "story" paragraphs, the first of
# which contains three <a class="sister"> links with ids link1..link3.
# The closing </body> and </html> tags are intentionally absent from the string.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [112]:
from bs4 import BeautifulSoup

# Parse the sample markup with the parser that ships with Python
# (no extra dependency such as lxml is needed).
soup = BeautifulSoup(markup=html_doc, features='html.parser')

# prettify() renders the parsed tree re-indented, one tag or string per line.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [116]:
# First <title> tag of the document: <title>The Dormouse's story</title>
soup.title

# Name of that tag: 'title'
soup.title.name

# Text inside the tag: "The Dormouse's story"
soup.title.string

# Name of the tag that encloses <title>.
# Only this last expression is displayed as the cell's output.
soup.title.parent.name

'head'

In [118]:
# Attribute-style access returns the FIRST matching tag, here <p class="title">.
soup.p

# Tag attributes are read like dictionary keys; class comes back as a list
# because an element may carry several CSS classes.
soup.p['class']

['title']

In [120]:
# soup.a

# Every <a> tag in the document, in document order (not displayed: not the last expression).
soup.find_all('a')

# find() returns only the first match; here the tag whose id attribute is "link3".
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [121]:
# Gather the link target of every anchor first, then print them one per line.
# .get('href') yields None (instead of raising) for an anchor without an href.
hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
for href in hrefs:
    print(href)

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [122]:
# All human-readable text of the document with the markup removed.
print(soup.get_text())



The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [124]:
# Tag Object: the first <p> element of the document
soup.p

# Name: the tag's name, 'p'
soup.p.name

# Attributes: read a single attribute by key, or all of them as a dict via .attrs
soup.p['class']
soup.p.attrs

# .get() reads one attribute like dict.get(); as the last expression of the
# cell, this is the only value that gets displayed.
soup.p.get('class')

['title']

In [11]:
# The first <p> holds a single <b> child, so .string reaches through to that child's text.
soup.p.string

"The Dormouse's story"

In [12]:
# Navigation

# Tag names can be chained to walk down the tree: the first <b> inside <body>.
soup.body.b

# Shortcut for the first <a> anywhere in the document (this is the displayed value).
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [13]:
# Unlike soup.a, find_all() returns every matching tag, as a list.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [14]:
# Number of direct children of <body>. The count is higher than the three <p>
# tags because the whitespace between them is kept as separate string children.
len(soup.body.contents)

7

In [15]:
# String

# .strings iterates over every text node of the document; its value is not
# shown here because it is not the last expression of the cell.
soup.strings

# Text of the first <a> tag.
soup.a.string

'Elsie'

In [125]:
# Search

# Same sample document as in the earlier cells, rebuilt here so that the
# search examples below start from a freshly parsed soup.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
# Parse with Python's built-in HTML parser.
soup = BeautifulSoup(html_doc, 'html.parser')

In [128]:
import re

# A plain string matches tags with exactly that name.
soup.find_all('b')

# A compiled regular expression is matched against the tag names:
# "^b" selects every tag whose name starts with "b" (<body> and <b> here).
starts_with_b = re.compile("^b")
for match in soup.find_all(starts_with_b):
    print(match)

# A list matches any of the given names; being the last expression,
# this result is what the cell displays.
soup.find_all(["a", "b"])


<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
<b>The Dormouse's story</b>


[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [132]:
# Name
# soup.find_all(name="title")

# Arguments
# soup.find_all(id='link1')

# Several filters are combined with AND: tag name, id and CSS class must all match.
# The CSS class is passed as class_ because class is a reserved word in Python.
soup.find_all(name="a", id= "link1", class_= "sister")

# soup.find

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# Selenium (dynamic sites)

Source (Selenium with Python documentation):

https://selenium-python.readthedocs.io/

In [19]:
!pip install selenium



In [139]:
# ChromeDriver home page:
# https://sites.google.com/chromium.org/driver/

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time


# Start a Chrome session driven through WebDriver; every later cell talks to
# the browser through this `driver` object.
driver = webdriver.Chrome()


In [136]:
# Navigate the browser to the given URL.
driver.get("http://www.google.com")


In [137]:
# Close the current browser window.
# NOTE(review): driver.quit() is the call that shuts down the whole WebDriver
# session — confirm whether close() or quit() is intended here.
driver.close()


In [140]:
# The previous cell closed the only browser window, so the old `driver` can no
# longer navigate. Start a fresh Chrome session before loading the page again,
# otherwise this cell fails when the notebook is run top to bottom.
driver = webdriver.Chrome()
driver.get("http://www.google.com")


In [141]:
# Locate one element on the page by its id attribute (presumably the search
# input — the id looks auto-generated, so verify it against the live page).
element = driver.find_element(By.ID, "APjFqb")
# Other locator strategies, shown for reference:
# element = driver.find_element(By.NAME, "textarea")
# element = driver.find_element(By.XPATH, "//input[@id='passwd-id']")
# element = driver.find_element(By.CSS_SELECTOR, "input#passwd-id")

In [142]:
# Type the text into the element located in the previous cell.
element.send_keys("NLP")


In [143]:

# Re-point `element` at the element carrying this CSS class (presumably the
# search button; generated class names like this one may change over time).
element = driver.find_element(By.CLASS_NAME, "gNO89b")

# Simulate a mouse click on it.
element.click()

In [144]:
# Load a Google Forms page in the same browser session.
driver.get('https://docs.google.com/forms/d/e/1FAIpQLSeI8_vYyaJgM7SJM4Y9AWfLq-tglWZh6yt7bEXEOJr_L-hV1A/viewform?formkey=dGx0b1ZrTnoyZDgtYXItMWVBdVlQQWc6MQ')

In [145]:
# Select one container of the form by an absolute XPath (tied to the current
# page layout, so it breaks if the form structure changes).
element = driver.find_element(By.XPATH, "//*[@id='mG61Hd']/div[2]/div/div[2]/div[3]/div/div/div[2]/div[1]/div/span")

# find_elements (plural) searches inside that container and returns a list
# of every <label> it holds.
all_options = element.find_elements(By.TAG_NAME, "label")
for option in all_options:
    option.click()
    # Pause between clicks so each selection can be watched in the browser.
    time.sleep(2)


In [None]:
# Reference: locator strategies offered by selenium's By class
# (attribute name = the string value it stands for).
# ID = "id"
# NAME = "name"
# XPATH = "xpath"
# LINK_TEXT = "link text"
# PARTIAL_LINK_TEXT = "partial link text"
# TAG_NAME = "tag name"
# CLASS_NAME = "class name"
# CSS_SELECTOR = "css selector"

# API

In [24]:
import requests

# Read a single to-do item from the JSONPlaceholder demo API.
api_url = "https://jsonplaceholder.typicode.com/todos/1"
# Without a timeout, requests may block forever on an unresponsive server.
response = requests.get(api_url, timeout=10)
# Raise on 4xx/5xx instead of trying to parse an error body as data.
response.raise_for_status()
# Decode the JSON body into Python objects (displayed as the cell output).
response.json()

{'userId': 1, 'id': 1, 'title': 'delectus aut autem', 'completed': False}

In [25]:
import requests

# Create a new to-do item on the JSONPlaceholder demo API.
api_url = "https://jsonplaceholder.typicode.com/todos"
todo = {"userId": 1, "title": "Buy milk", "completed": False}
# json= serialises the dict and sets the Content-Type header; the timeout
# keeps the cell from hanging on an unresponsive server.
response = requests.post(api_url, json=todo, timeout=10)
# Raise on 4xx/5xx instead of trying to parse an error body as data.
response.raise_for_status()
# The API echoes the created item back, including its new id.
response.json()

{'userId': 1, 'title': 'Buy milk', 'completed': False, 'id': 201}

# Example: downloading course material behind HTTP Basic Auth

In [149]:
import os
from getpass import getpass

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

# Incrementally mirror lecture slides (.pdf) and recordings (.mp4) from a course
# page protected by HTTP Basic Auth. Titles of lectures that were already
# fetched are kept in download_log.txt (one per line), so re-running the cell
# only downloads lectures that are new.

LOG_PATH = 'download_log.txt'
REQUEST_TIMEOUT = 30  # seconds; requests waits forever unless a timeout is given

# The base URL carries no credentials: they are passed with auth= on every
# request, so they never end up in printed URLs or saved notebook output.
base_url = 'https://language.ml/courses/nlp14022/'

# Credentials are never written into the notebook. They come from the
# environment when available, otherwise the user is prompted for them.
user = os.environ.get('COURSE_USER') or input('Course username: ')
pass_ = os.environ.get('COURSE_PASS') or getpass('Course password: ')


def _download(url, filename):
    """Stream ``url`` into ``filename`` using the course credentials.

    The body is written chunk by chunk to ``filename + '.part'`` and renamed
    only after the transfer finished, so an interrupted download never leaves
    a truncated file under the final name.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    partial = filename + '.part'
    with requests.get(url, auth=(user, pass_), stream=True,
                      timeout=REQUEST_TIMEOUT) as r:
        r.raise_for_status()
        with open(partial, 'wb') as f:
            # 1 MiB chunks keep memory use flat even for large videos.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial, filename)


def _save_log(names):
    """Rewrite the download log with one lecture title per line."""
    with open(LOG_PATH, 'w') as f:
        f.write('\n'.join(names))


# Load the titles handled in earlier runs; a missing log means a first run.
try:
    with open(LOG_PATH, 'r') as f:
        lecture_names = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    lecture_names = []

page = requests.get(base_url + 'index.html', auth=(user, pass_),
                    timeout=REQUEST_TIMEOUT)
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

# Each row of the table body is one lecture; the code reads the title from
# the first cell and looks for links in the third (pdf) and fourth (mp4) cell.
table = soup.find('tbody')
trs = table.find_all('tr') if table is not None else []

for row in tqdm(trs):
    tds = row.find_all('td')
    if len(tds) < 4:
        # Header or malformed row: nothing to download.
        continue

    # Strip the title so it compares equal to the stripped entries of the log.
    title = tds[0].get_text(strip=True)
    if not title:
        continue
    print(title)
    print('-'*10)

    if title in lecture_names:
        continue

    # Path separators in a title would otherwise be read as directories.
    safe_title = title.replace('/', '-').replace('\\', '-')

    for column, extension in ((2, '.pdf'), (3, '.mp4')):
        link = tds[column].a
        href = link.get('href') if link is not None else None
        if not href:
            continue
        print(base_url + href)
        _download(base_url + href, safe_title + extension)

    # Record the lecture only after its files were fetched, and persist the log
    # right away so a later failure does not lose the progress made so far.
    # NOTE(review): a lecture is logged even when a link is still missing, so
    # material uploaded later is not picked up — confirm this is acceptable.
    lecture_names.append(title)
    _save_log(lecture_names)
