In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup 

#show every row when a DataFrame is displayed (None removes the row limit)
pd.options.display.max_rows = None

In [2]:
#make a request to the website you wish to scrape data from
#the timeout (in seconds) stops this cell from hanging forever if the server never answers
response = requests.get('https://aaiasb.gr/publications/investigation-reports', timeout=30)
#raise an error on a 4xx/5xx answer instead of silently parsing an error page further down
response.raise_for_status()
response

<Response [200]>

In [3]:
#take the HTML text of the response...
html = response.text
#...and parse it with Python's built-in 'html.parser' backend
soup = BeautifulSoup(html, 'html.parser')
#display the parsed document
soup

<!DOCTYPE html>

<html dir="ltr" lang="el-gr" vocab="http://schema.org/">
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/images/favicon.png" rel="shortcut icon"/>
<link href="/images/apple-touch-icon.png" rel="apple-touch-icon"/>
<meta charset="utf-8">
<base href="https://aaiasb.gr/publications/investigation-reports"/>
<meta content="Hellenic AAIASB,AAIASB,EDAAP, ΕΔΑΑΠ, Διερεύνηση Αεροπορικών Ατυχημάτων , Air Accident Investigation and Aviation Safety Board" name="keywords">
<meta content="Hellenic Air Accident Investigation and Aviation Safety Board (AAIASB) Official Webpage
Επιτροπή Διερεύνησης Ατυχημάτων &amp; Ασφάλειας Πτήσεων (ΕΔΑΑΠ)" name="description"/>
<meta content="Joomla! - Open Source Content Management" name="generator"/>
<title>Πορίσματα / Εκθέσεις</title>
<link href="/media/cck/css/cck.css" rel="stylesheet">
<link href="/media/cck/css/cck.responsive.css" rel="stylesheet">
<l

In [4]:
#collect one dictionary per table row in this list; it becomes the DataFrame at the end of the cell
entries = []

#after inspecting the page, this div is the part of the page we really need
page = soup.select('div.cck_page_items')[0]

#every "tr" of the "table" located in "page" is one row of the table
#the first "tr" is the header, so keep everything from the second row to the last one
rows = page.find('table').find_all('tr')[1:]

#loop through the rows and pick the elements we need SEPARATELY
#inspect the page in detail to follow the html hierarchy referred to below
for tr in rows:
    #look up the cells ("td") of the row once and reuse them for every field
    cells = tr.find_all('td')

    #all 'div' of the first cell (the "Τελικό Πόρισμα" column)
    conclusion_divs = cells[0].find_all('div')
    #all 'div' of the second cell (incident info, incident type, fatalities)
    incident_divs = cells[1].find_all('div')
    #the tooltip 'span' elements of the second cell: [0] describes the incident, [1] the fatalities
    tooltip_spans = cells[1].find_all('span', attrs={'uk-icon':'info'})
    #all 'div' of the third cell (area, registry, aircraft)
    location_divs = cells[2].find_all('div')

    #each key is a column name of the future df, each value is the scraped text
    #the name "entry" is used on purpose: calling this variable "dict" would shadow
    #Python's built-in dict for every cell that runs afterwards
    entry = {
        #second 'div' of the first cell (the output below shows values such as 06/2022)
        'conclusion_date1': conclusion_divs[1].text.strip(),
        #third 'div' of the first cell (the output below shows values such as 23/11/2022)
        'conclusion_date2': conclusion_divs[2].text.strip(),
        #first 'div' of the second cell: incident date and category
        'incident_info': incident_divs[0].text.strip(),
        #second 'div' of the second cell
        'incident_type': incident_divs[1].text.strip(),
        #the whole tooltip tag is stored as a string; its text is extracted in the cleaning stage
        'incident_description': str(tooltip_spans[0]),
        #third 'div' of the second cell
        'fatalities': incident_divs[2].text.strip(),
        #again the whole tooltip tag as a string
        'fatalities_description': str(tooltip_spans[1]),
        #first 'div' of the third cell
        'area': location_divs[0].text.strip(),
        #second 'div' of the third cell
        'registry': location_divs[1].text.strip(),
        #second to last 'div' of the third cell
        'aircraft_type': location_divs[-2].text.strip(),
        #last 'div' of the third cell
        'aircraft_info': location_divs[-1].text.strip(),
        #the structure of the third cell changes sometimes (area and registry come out empty
        #for some rows in the output below), so keep the full text of the cell as well
        #missing info can then be recovered from it in the cleaning stage
        'area_info': cells[2].text.strip(),
    }

    #add the row's dictionary to the list
    entries.append(entry)

#let's turn our entries list into a df
df = pd.DataFrame(entries)

#check out what our df looks like!
df

Unnamed: 0,conclusion_date1,conclusion_date2,incident_info,incident_type,incident_description,fatalities,fatalities_description,area,registry,aircraft_type,aircraft_info,area_info
0,06/2022,23/11/2022,22/09/2017\nΑΤΥΧΗΜΑ,ΑΤΥΧΗΜΑ,"<span span="""" uk-icon=""info"" uk-tooltip=""Συντρ...",Θανάσιμος τραυματισμός,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",ΠΕΡΙΟΧΗ ΚΕΡΑΣΙΑ ΡΟΔΟΠΗΣ,Νηολόγιο(α): UR-STAS,Αεροπλάνο,Υπερελαφρύ,ΠΕΡΙΟΧΗ ΚΕΡΑΣΙΑ ΡΟΔΟΠΗΣΝηολόγιο(α): UR-STASΑερ...
1,05/2022,10/11/2022,08/11/2013\nΑΤΥΧΗΜΑ,ΑΤΥΧΗΜΑ,"<span span="""" uk-icon=""info"" uk-tooltip=""Ελαφρ...",Χωρίς Τραυματισμό,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",ΑΕΡΟΔΡΟΜΙΟ ΜΕΓΑΡΩΝ LGMG,Νηολόγιο(α): SX-AGE,Αεροπλάνο,Μικρό / Ελαφρύ,ΑΕΡΟΔΡΟΜΙΟ ΜΕΓΑΡΩΝ LGMGΝηολόγιο(α): SX-AGEΑερο...
2,E01/2022,27/10/2022,28/05/2019\nΑΤΥΧΗΜΑ,ΑΤΥΧΗΜΑ,"<span span="""" uk-icon=""info"" uk-tooltip=""Χειρι...",Σοβαρός τραυματισμός,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",ΣΤΗ ΠΕΡΙΟΧΗ ΤΗΣ ΑΝΑΤΟΛΙΚΗΣ ΠΑΡΑΛΙΑΣ ΚΑΛΑΜΑΤΑΣ,Νηολόγιο(α): -,Νηολόγιο(α): -,Αλεξίπτωτο Πλαγιάς,ΣΤΗ ΠΕΡΙΟΧΗ ΤΗΣ ΑΝΑΤΟΛΙΚΗΣ ΠΑΡΑΛΙΑΣ ΚΑΛΑΜΑΤΑΣΝ...
3,04/2022,09/06/2022,20/08/2019\nΑΤΥΧΗΜΑ,ΑΤΥΧΗΜΑ,"<span span="""" uk-icon=""info"" uk-tooltip=""Ελικό...",Θανάσιμος τραυματισμός,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",,,Ελικόπτερο,Μεγάλο,ΘΑΛΛΑΣΙΟΣ ΔΙΑΥΛΟΣ ΠΟΡΟΥ-ΓΑΛΑΤΑΝηολόγιο(α): SX-...
4,03/2022,02/06/2022,16/07/2020\nΑΤΥΧΗΜΑ,ΑΤΥΧΗΜΑ,"<span span="""" uk-icon=""info"" uk-tooltip=""Xειρι...",Θανάσιμος τραυματισμός,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",,,Νηολόγιο(α): -,Αλεξίπτωτο Πλαγιάς,ΚΑΛΥΜΠΑΚΙ ΑΓΙΟΥ ΘΩΜΑ ΒΟΙΩΤΙΑΣΝηολόγιο(α): -Αλε...
5,02/2022,19/05/2022,15/09/2019\nΑΤΥΧΗΜΑ,ΑΤΥΧΗΜΑ,"<span span="""" uk-icon=""info"" uk-tooltip=""Χειρι...",Θανάσιμος τραυματισμός,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",,,Νηολόγιο(α): -,Αλεξίπτωτο Πλαγιάς,ΑΛΥΚΕΣ ΜΕΣΟΛΟΓΓΙΟΥΝηολόγιο(α): -Αλεξίπτωτο Πλα...
6,01/2022,17/02/2022,07/10/2017\nΑΤΥΧΗΜΑ,ΑΤΥΧΗΜΑ,"<span span="""" uk-icon=""info"" uk-tooltip=""Tραυμ...",Σοβαρός τραυματισμός,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",,,Αεροπλάνο,Μεγάλο,ΔΙΕΘΝΗΣ ΑΕΡΟΛΙΜΕΝΑΣ ΑΘΗΝΩΝ (LGAV) ΘΕΣΗ Β62Νηολ...
7,E05/2021,24/09/2021,13/12/2019\nΣΟΒΑΡΟ ΣΥΜΒΑΝ,ΣΟΒΑΡΟ ΣΥΜΒΑΝ,"<span span="""" uk-icon=""info"" uk-tooltip=""Εκτρο...",Χωρίς Τραυματισμό,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",,,Αεροπλάνο,Μεγάλο,ΔΙΕΘΝΕΣ ΑΕΡΟΔΡΟΜΙΟ ΑΘΗΝΩΝ (ΛΓΑΩ) ΔΙΑΔΡΟΜΟΣ 21R...
8,E04/2021,26/07/2021,14/02/2020\nΣΟΒΑΡΟ ΣΥΜΒΑΝ,ΣΟΒΑΡΟ ΣΥΜΒΑΝ,"<span span="""" uk-icon=""info"" uk-tooltip=""Κατά ...",Χωρίς Τραυματισμό,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",ΚΡΑΤΙΚΟΣ ΑΕΡΟΛΙΜΕΝΑΣ ΚΟΖΑΝΗΣ LGKZ,Νηολόγιο(α): SX-KVA,Αεροπλάνο,Πολύ Ελαφρύ,ΚΡΑΤΙΚΟΣ ΑΕΡΟΛΙΜΕΝΑΣ ΚΟΖΑΝΗΣ LGKZΝηολόγιο(α): ...
9,E03/2021,11/02/2021,28/01/2019\nΣΟΒΑΡΟ ΣΥΜΒΑΝ,ΣΟΒΑΡΟ ΣΥΜΒΑΝ,"<span span="""" uk-icon=""info"" uk-tooltip=""Ιδιαί...",Χωρίς Τραυματισμό,"<span <="""" span="""" uk-icon=""info"" uk-tooltip=""...",ΔΙΕΘΝΗΣ ΑΕΡΟΛΙΜΕΝΑΣ MUSCAT (OOMS) ΣΤΟ ΟΜΑΝ,Νηολόγιο(α): SX-ODS,Αεροπλάνο,Μεγάλο,ΔΙΕΘΝΗΣ ΑΕΡΟΛΙΜΕΝΑΣ MUSCAT (OOMS) ΣΤΟ ΟΜΑΝΝηολ...


In [5]:
#write the scraped table to disk so it can be reloaded later without scraping again
#index=False leaves the row numbers out of the file
csv_path = 'aircraft_incidents.csv'
df.to_csv(csv_path, index=False)

In [6]:
#we still need to clean this, but:
#a) cleaning will be much easier because the data has now been scraped in a tidier way
#b) we are going to do the cleaning in a separate script

#before that, you have a challenge!

# Your challenge!

### This script scrapes the table data found on the first page only. Find a Pythonic way to do multi-page scraping, and wrap everything above in a for loop of your own design so that it scrapes the data found on all 4 pages!