# Web scraping

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# use /robots.txt to understand what a website allows and does not allow

In [3]:
# Page for a single conference; used below as the first scraping example
url = 'https://waset.org/anaesthesiology-conference-in-july-2022-in-paris'

In [4]:
# Browser-style User-agent string sent as a header with every request in this notebook
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'

In [5]:
# The URL is a standalone web page, so the response body is HTML
# rather than the JSON an API endpoint would return.
r = requests.get(
    url,
    headers={'User-agent': useragent},
)
r

<Response [200]>

In [6]:
# The second argument of BeautifulSoup selects which parser is used to read the HTML
soup = BeautifulSoup(r.text,"html.parser")

In [7]:
# Displaying all of `soup` prints a very long HTML document
#soup
# How do we know whether this data is usable or not?
# Go to the page you want to scrape, right-click and choose "View Page Source"; this shows the page's HTML, which you can search with Ctrl+F

In [8]:
# find_all() searches the parsed document for every tag with the given name
soup.find_all('title')
# The result is list-like, so individual matches can be pulled out by index

[<title>International Conference on Anaesthesiology ICA in July 2022 in Paris</title>]

In [9]:
# Take the first match; only one <title> tag was found, so it sits at index 0
soup.find_all('title')[0]

<title>International Conference on Anaesthesiology ICA in July 2022 in Paris</title>

In [10]:
# .string keeps only the text inside the tag, without the surrounding <title> markup
title = soup.find_all('title')[0].string

In [11]:
# After locating the price in the page source, list every <td> (table cell) tag on the page
soup.find_all('td')

[<td>Abstracts/Full-Text Paper Submission Deadline</td>,
 <td> </td>,
 <td class="textright">June 15, 2022</td>,
 <td>Notification of Acceptance/Rejection</td>,
 <td> </td>,
 <td class="textright">June 30, 2022</td>,
 <td>Final Paper (Camera Ready) Submission &amp; Early Bird Registration Deadline</td>,
 <td> </td>,
 <td class="textright">June 19, 2022</td>,
 <td>Conference Dates</td>,
 <td> </td>,
 <td class="textright">July 19-20, 2022</td>,
 <td>Non-Student Oral/Poster Presenter Registration</td>,
 <td class="earlyBird0">€ 450</td>,
 <td class="earlyBird1">€ 500</td>,
 <td>Student Oral/Poster Presenter Registration</td>,
 <td class="earlyBird0">€ 350</td>,
 <td class="earlyBird1">€ 400</td>,
 <td>Listener Registration</td>,
 <td class="earlyBird0">€ 250</td>,
 <td class="earlyBird1">€ 300</td>,
 <td>Additional Paper Publication</td>,
 <td colspan="2">€ 100</td>]

In [12]:
# A second argument narrows the search to <td> tags that carry that CSS class
soup.find_all('td','earlyBird1')

[<td class="earlyBird1">€ 500</td>,
 <td class="earlyBird1">€ 400</td>,
 <td class="earlyBird1">€ 300</td>]

In [13]:
# Keep only the text of the first matching price cell
cost = soup.find_all('td','earlyBird1')[0].string

In [14]:
# Locate the <td> whose text is exactly 'Conference Dates', then step forward
# two elements with find_next() to reach the cell that holds the dates.
# `string=` is the current name of this filter; `text=` is the deprecated older spelling.
conference_date = soup.find_all('td', string='Conference Dates')[0].find_next().find_next().string
conference_date

'July 19-20, 2022'

In [15]:
# The conference program is published at a separate URL,
# so it needs its own request.
url2 = 'https://waset.org/conferences-in-july-2022-in-paris/program'
r2 = requests.get(
    url2,
    headers={'User-agent': useragent},
)
r2

<Response [200]>

In [16]:
# Parse the program page that was fetched into `r2`. The earlier version searched
# `soup`, which still holds the first conference page rather than the program page.
soup2 = BeautifulSoup(r2.text, "html.parser")
program = soup2.find_all('div', 'col-6')
program

[]

### Building a spider

In [30]:
def scrape1(url):
    """Scrape one conference page and return its key fields as a one-row DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``title`` (text of the first ``<title>``
        tag), ``cost`` (text of the first ``<td class="earlyBird1">`` cell) and
        ``conference_date`` (text of the second element after the
        ``<td>Conference Dates</td>`` cell).  A field is ``None`` when the
        corresponding element is not present on the page, so that one page
        with a different layout does not abort a crawl over many links
        (indexing ``find_all(...)[0]`` used to raise ``IndexError`` there).

    Notes
    -----
    Uses the notebook-level ``useragent`` string for the request header.
    """
    r = requests.get(url, headers={'User-agent': useragent})
    soup = BeautifulSoup(r.text, "html.parser")

    # find() returns the first match or None, unlike find_all(...)[0] which raises on no match.
    title_tag = soup.find('title')
    title = title_tag.string if title_tag is not None else None

    cost_tag = soup.find('td', 'earlyBird1')
    cost = cost_tag.string if cost_tag is not None else None

    # `string=` is the current name of the text filter (`text=` is deprecated).
    conference_date = None
    label_tag = soup.find('td', string='Conference Dates')
    if label_tag is not None:
        # The date sits two elements after the label: label -> spacer cell -> date cell.
        spacer_tag = label_tag.find_next()
        date_tag = spacer_tag.find_next() if spacer_tag is not None else None
        if date_tag is not None:
            conference_date = date_tag.string

    mydict = {'title': [title],
              'cost': [cost],
              'conference_date': [conference_date]}

    return pd.DataFrame(mydict)

In [19]:
#scrape1('https://waset.org/anaesthesiology-conference-in-july-2022-in-paris')

In [35]:
# Download the listing page for July 2022 conferences in Paris and parse it.
url = 'https://waset.org/conferences-in-july-2022-in-paris'
r = requests.get(
    url,
    headers={'User-agent': useragent},
)
soup = BeautifulSoup(r.text, 'html.parser')

# Collect the target of every <a> tag that has both an href and a title attribute,
# skipping the first five matches.
anchors = soup.find_all('a', href=True, title=True)
urls = [a['href'] for a in anchors[5:]]

# Try the scraper on one of the collected links (position 50 in the list).
scrape1(urls[50])

Unnamed: 0,title,cost,conference_date
0,International Conference on Advanced Dynamic C...,€ 500,"July 19-20, 2022"


In [33]:
# Display the URL currently bound to `url`
url

'https://waset.org/conferences-in-july-2022-in-paris'

In [23]:
# Use a list comprehension to collect the href of every <a> tag that has both
# an href and a title attribute, then drop the first five entries.
# NOTE(review): the execution counts are out of order (this cell is In [23], shown after In [35]);
# the saved output below looks like it was produced from an earlier `soup` - re-run top to bottom to confirm.
urls = [x['href'] for x in soup.find_all('a', href=True, title=True)][5:]
urls

['https://waset.org/conferences',
 'https://waset.org/disciplines',
 'https://waset.org/page/support',
 'https://waset.org/conferences',
 'https://waset.org/conferences-in-2022',
 'https://waset.org/conferences-in-july-2022-in-paris',
 'https://waset.org/anaesthesiology-conference',
 'https://waset.org/profile/submissions/papers/create/1842/8729',
 'https://waset.org/profile/registrations/author/create/1842/8729',
 'https://waset.org/profile/registrations/listener/create/1842/8729',
 'https://waset.org/conferences-in-july-2022-in-paris/program',
 'https://waset.org/anaesthesiology-conference-in-november-2022-in-san-francisco',
 'https://waset.org/anaesthesiology-conference-in-june-2023-in-san-francisco',
 'https://waset.org/anaesthesiology-conference-in-july-2023-in-paris',
 'https://waset.org/anaesthesiology-conference-in-november-2023-in-san-francisco',
 'https://waset.org/anaesthesiology-conference-in-june-2024-in-san-francisco',
 'https://waset.org/anaesthesiology-conference-in-jul

In [36]:
# Scrape every collected link and stack the one-row results into a single DataFrame.
frames = []

for u in urls:
    print(u)  # progress log: shows which page is being fetched
    # Fetch each page exactly once; an extra scrape1(u) call whose result was
    # thrown away used to double the number of HTTP requests.
    one_df = scrape1(u)
    frames.append(one_df)

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame when there are no URLs.
total_df = pd.concat(frames) if frames else pd.DataFrame()

https://waset.org/anaesthesiology-conference-in-july-2022-in-paris
https://waset.org/allergy-and-asthma-conference-in-july-2022-in-paris
https://waset.org/applied-aerodynamics-and-aeronautics-conference-in-july-2022-in-paris
https://waset.org/antihypertensive-agents-and-diuretics-conference-in-july-2022-in-paris
https://waset.org/aquatic-animals-diseases-and-resources-conference-in-july-2022-in-paris
https://waset.org/aquatic-animal-health-and-diseases-conference-in-july-2022-in-paris
https://waset.org/allergy-asthma-immunology-and-rheumatology-conference-in-july-2022-in-paris
https://waset.org/advanced-aerogel-materials-conference-in-july-2022-in-paris
https://waset.org/advanced-biofuels-conference-in-july-2022-in-paris
https://waset.org/agent-based-applications-for-air-transportation-conference-in-july-2022-in-paris
https://waset.org/applications-of-biorobotics-and-biomechatronics-conference-in-july-2022-in-paris
https://waset.org/agricultural-biotechnology-biological-and-biosystems-