In [1]:
#web scraping:
#############################################################################################
#Inspect the Website Site Using Developer Tools
#####################################################
#Step 1: Find the URL that you want to scrape
#For this example, we are going to scrape a Wikipedia article and extract its table of contents
#The URL for this page is https://en.wikipedia.org/wiki/Python_(programming_language)
####################################################
#Step 2: Inspecting the Page
#You will need a basic understanding of HTML and CSS so that you understand the territory you are working in.
#You don’t need to be an expert, but you do need to know how to navigate the elements of a web page using an inspector
#such as Chrome DevTools.
#To inspect the page, right-click the element and choose “Inspect”.
##############################################################################################
#Scrape HTML Content from a Page
####################################################
#Step 3: Install Requests
#pip install Requests
#The most basic way to perform an HTTP request in Python is to open a TCP socket and send the HTTP request manually.
#Instead, we use the Requests library to send a GET request to the given URL; it retrieves the HTML data that the
#server sends back and stores it in a Python object.
###################################################
#Step 4: Check the request response
#import requests
#link = "https://en.wikipedia.org/wiki/Python_(programming_language)"
#page = requests.get(link)
#print(page.status_code)
#############################################################################################
#Parse HTML Code with Beautiful Soup
##################################################
#Step 5: Install Beautiful soup library
#Beautiful Soup is a Python library for parsing structured data. It allows you to interact with HTML in a similar way 
#to how you interact with a web page using developer tools.
#The library exposes a couple of intuitive functions you can use to explore the HTML you received.
#To get started, use your terminal or jupyter notebook to install Beautiful Soup:
#pip install beautifulsoup4
#################################################
#Step 6: Parse HTML Code
#import requests
#from bs4 import BeautifulSoup
#link = "https://en.wikipedia.org/wiki/Python_(programming_language)"
#page = requests.get(link)
#soup = BeautifulSoup(page.content, 'html.parser')
#print(soup.prettify())
#In this scenario, the server that hosts the site sends back static HTML content that already contains all the data
#that you’ll see as a user.


In [2]:
pip install Requests

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests

# Fetch the article. Requests has no default timeout, so one is set explicitly
# to keep the cell from hanging forever if the server never answers.
link = "https://en.wikipedia.org/wiki/Python_(programming_language)"
page = requests.get(link, timeout=10)

# A status code of 200 means the request succeeded.
print(page.status_code)

200


In [4]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests
from bs4 import BeautifulSoup

link = "https://en.wikipedia.org/wiki/Python_(programming_language)"
# Requests has no default timeout; set one so the cell cannot hang forever.
page = requests.get(link, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()

# Parse the raw bytes of the response with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

# Show only the beginning of the pretty-printed document: the full page is
# far too large to be useful as cell output.
print(soup.prettify()[:1500])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Python (programming language) - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"53f13d98-c20a-4011-9897-36f59f8f85ac","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Python_(programming_language)","wgTitle":"Python (programming language)","wgCurRevisionId":1131306264,"wgRevisionId":1131306264,"wgArticleId":23862,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Wikipedia semi-protected pages","Articles with short 

In [6]:
# find() returns the first element that matches the given tag name and
# attributes (or None when nothing on the page matches).
title_span = soup.find("span", class_="mw-page-title-main")
# get_text() returns only the text contained in a tag, without the markup.
# In this example, we extract the main title of the Wikipedia page.
print(title_span.get_text())

Python (programming language)


In [7]:
# find_all() scans the entire document and returns every match as a list.
tables = soup.find_all("table")
# tables[0] is the first <table> in the document; take its first cell that
# carries the "infobox-data" class.
first_table = tables[0]
first_table.find("td", class_="infobox-data").text
# .text is an attribute that holds only the text contained in the tag.
# In this example, we extract the paradigms of the Python language.

'Multi-paradigm: object-oriented,[1] procedural (imperative), functional, structured, reflective'

In [8]:
# select_one() returns the first tag that matches a CSS selector.
# Classes that sit on the same element are chained without a space
# (".infobox.vevent"). Writing ".infobox vevent, tr:nth-child(6) > td" would
# instead be a selector *list*: it looks for a <vevent> element inside
# .infobox OR for any 6th-row cell anywhere on the page, so it is not scoped
# to the infobox at all.
print(soup.select_one(".infobox.vevent tr:nth-child(6) > td").text)
# In this example, we extract the date Python first appeared using CSS selectors.

20 February 1991; 31 years ago (1991-02-20)[2]


In [9]:
# select() returns a list with every element that matches the CSS selector.
# "#toc>ul>li" means: <li> items that are direct children of a <ul> that is
# itself a direct child of the element whose id is "toc".
toc_entries = soup.select("#toc>ul>li")
toc_entries
# In this example, we extract the top-level entries of the table of contents.
# Note:
# An id identifies a single element that appears only once on the page, whereas
# a class can be shared by one or many elements on the page.

[<li class="toclevel-1 tocsection-1"><a href="#History"><span class="tocnumber">1</span> <span class="toctext">History</span></a>
 <ul>
 <li class="toclevel-2 tocsection-2"><a href="#Removals_from_Python"><span class="tocnumber">1.1</span> <span class="toctext">Removals from Python</span></a></li>
 </ul>
 </li>,
 <li class="toclevel-1 tocsection-3"><a href="#Design_philosophy_and_features"><span class="tocnumber">2</span> <span class="toctext">Design philosophy and features</span></a></li>,
 <li class="toclevel-1 tocsection-4"><a href="#Syntax_and_semantics"><span class="tocnumber">3</span> <span class="toctext">Syntax and semantics</span></a>
 <ul>
 <li class="toclevel-2 tocsection-5"><a href="#Indentation"><span class="tocnumber">3.1</span> <span class="toctext">Indentation</span></a></li>
 <li class="toclevel-2 tocsection-6"><a href="#Statements_and_control_flow"><span class="tocnumber">3.2</span> <span class="toctext">Statements and control flow</span></a></li>
 <li class="toclevel

In [10]:
#Annual inflation rate in the United States (the URL used below is the US CPI page).
#Let’s scrape the table of inflation rates for 2022!
#Import libraries
#import requests
#import re
#import numpy as np
#import pandas as pd
#from bs4 import BeautifulSoup
#Annual inflation rate in the United States.
#Get the container of table
#link = "https://tradingeconomics.com/united-states/inflation-cpi"
#page = requests.get(link)
#soup = BeautifulSoup(page.content, 'html.parser')
#table = soup.find("div",{"id":"ctl00_ContentPlaceHolder1_ctl00_ctl01_Panel1"})
#Annual inflation rate in the United States.
#Scraping the table
#result = {}
#for column in table.find("tr").find_all("th"):
#    key = column.get_text().replace("\r","").replace("\n","")
#    result[key] = []
#my_regex = re.compile("datatable-row*")
#for tr in table.find_all("tr",{"class":my_regex}):
#    for i,row in enumerate(tr.find_all("td")):
#        result[list(result.keys())[i]].append(row.text.replace("\r","").replace("\n","")) 
#Display the table using Pandas
#pd.DataFrame(result)


In [11]:
#Practical example
#Consider Stack Overflow, a question-and-answer website for professional and enthusiast programmers.
#Let’s scrape the first paragraph of one of its answers!

#Code Box
import requests
from bs4 import BeautifulSoup

link = "https://stackoverflow.com/questions/2081586/web-scraping-with-python"
# Requests has no default timeout; set one so the cell cannot hang forever.
page = requests.get(link, timeout=10)
# Stop here with a clear HTTPError if the site refuses the request; otherwise
# the lookups below would fail with an opaque AttributeError on None.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# The answer is wrapped in a <div> whose id is "answer-<answer number>".
container = soup.find("div",{"id":"answer-2082025"})
# Inside it, the answer body is the div with class "s-prose js-post-body";
# show the text of its first paragraph.
container.find("div",{"class":"s-prose js-post-body"}).find("p").get_text()


'Use urllib2 in combination with the brilliant BeautifulSoup library:'

In [12]:
# Written after watching the checkpoint's YouTube video:
import string
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Turn the search term into a Wikipedia-style page title:
# capitalise every word and join the words with underscores,
# e.g. "covid 19" -> "Covid_19".
Enter_input=input("Search: ")
u_i=string.capwords(Enter_input)
lists=u_i.split()
word="_".join(lists)

# Percent-encode the title so that characters such as "#", "?" or "%" are sent
# as part of the page name instead of being interpreted as URL syntax.
url="https://en.wikipedia.org/wiki/"+quote(word)

def wikibot(url):
    """Print the infobox rows and two early paragraphs of a Wikipedia article.

    Parameters
    ----------
    url : str
        Full URL of the page to fetch.

    For every table with the ``infobox`` class, each ``th``/``td`` pair found
    in a row is printed as ``heading :: detail`` followed by a separator line.
    Afterwards the text of the second and third ``<p>`` elements of the page
    is printed (when the page has that many paragraphs).

    Returns
    -------
    None
    """
    # Requests has no default timeout; set one so the call cannot hang forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Calling the soup object directly is shorthand for find_all().
    for infobox in soup('table', {'class': 'infobox'}):
        for row in infobox.find_all('tr'):
            # find_all() always returns a (possibly empty) list, never None,
            # so no None check is needed: zip() simply yields nothing for
            # rows that lack a heading cell or a detail cell.
            for heading, detail in zip(row.find_all('th'), row.find_all('td')):
                print("{} :: {}".format(heading.text, detail.text))
                print("----------------")

    # Slicing (instead of indexing positions 1 and 2) avoids an IndexError on
    # pages that contain fewer than three paragraphs.
    for paragraph in soup('p')[1:3]:
        print(paragraph.text)
# Run the bot on the URL built from the user's search term.
wikibot(url)


Search: covid 19
Other names :: COVID, (the) coronavirus
----------------
Pronunciation :: /kəˈroʊnəvaɪrəs//ˌkoʊvɪdnaɪnˈtiːn, ˌkɒvɪd-/[1] 
----------------
Specialty :: Infectious disease
----------------
Symptoms :: Fever, cough, fatigue, shortness of breath, vomiting, loss of taste or smell; some cases asymptomatic[2][3]
----------------
Complications :: Pneumonia, viral sepsis, acute respiratory distress syndrome, kidney failure, cytokine release syndrome, respiratory failure, pulmonary fibrosis, paediatric multisystem inflammatory syndrome, long COVID
----------------
Usual onset :: 2–14 days (typically 5) from infection
----------------
Duration :: 5 days to chronic
----------------
Causes :: Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
----------------
Diagnostic method :: rRT‑PCR testing, CT scan, Rapid antigen test
----------------
Prevention :: Vaccination,[4] face coverings, quarantine, physical/social distancing, ventilation, hand washing[5]
----------------
