In [1]:
import requests
from bs4 import BeautifulSoup
import os
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH

In [2]:
response = requests.get('https://edition.cnn.com/travel/article/scenic-airport-landings-2020/index.html')
page = BeautifulSoup(response.content,'html.parser')

In [3]:
title = page.find('h1',{'id':'maincontent'}).text.strip()

In [4]:
title

'World’s 10 most scenic airport landings for 2020'

In [5]:
byline_name = page.find('span',{'class':'byline__name'}).text

In [6]:
byline_name

"Maureen O'Hare"

In [7]:
updated = page.find('div',{'class':'timestamp vossi-timestamp'}).text.strip().replace("\n    ","")

In [8]:
updated

'Updated    1:22 PM EDT, Mon September 28, 2020'

In [9]:
paragraphs = page.find_all('p',{'class':'paragraph'})

In [10]:
paragraphs_list = []
for p in paragraphs:
    paragraphs_list.append(p.text.strip())

In [11]:
paragraphs_list

['They say absence makes the heart grow fonder.',
 'The world’s most scenic airport airport approaches, as ranked annually by booking platform PrivateFly, are this year looking more glorious than ever.',
 'More than 6,000 frequent fliers and aviation enthusiasts took part in the 2020 poll, with voting taking place in February and March, just before travel restrictions to fight Covid-19 were introduced around the world.',
 'The winner, for the third year in a row, is Ireland’s Donegal Airport. On the island’s northwest coast, there are sweeping views of rugged coastline, pristine beaches and the steep slopes of Mount Errigal on the approach to this regional airport in Carrickfinn.',
 'While the top spot was held by an old favorite, there are six new entries in this new year’s top 10.',
 'The highest new entry is Tanzania’s Msembe Airstrip, in second place, which serves East Africa’s Ruaha National Park.',
 'Skiathos Alexandros Papadiamantis Airport, at number three, is also making its d

In [12]:
head_content = page.find('cite',{'class':'source__cite'}).find('span',{'class':'source__text'}).text + " - " + paragraphs_list[0]
head_content

'CNN - They say absence makes the heart grow fonder.'

In [13]:
content = paragraphs_list[1:-10]
content

['The world’s most scenic airport airport approaches, as ranked annually by booking platform PrivateFly, are this year looking more glorious than ever.',
 'More than 6,000 frequent fliers and aviation enthusiasts took part in the 2020 poll, with voting taking place in February and March, just before travel restrictions to fight Covid-19 were introduced around the world.',
 'The winner, for the third year in a row, is Ireland’s Donegal Airport. On the island’s northwest coast, there are sweeping views of rugged coastline, pristine beaches and the steep slopes of Mount Errigal on the approach to this regional airport in Carrickfinn.',
 'While the top spot was held by an old favorite, there are six new entries in this new year’s top 10.',
 'The highest new entry is Tanzania’s Msembe Airstrip, in second place, which serves East Africa’s Ruaha National Park.',
 'Skiathos Alexandros Papadiamantis Airport, at number three, is also making its debut. The airport on the Greek island of Skiathos,

In [14]:
top_10_text = page.find('h2',{'class':'subheader'}).text.strip()

In [15]:
top_10_text

'10 most scenic airport landings, according to PrivateFly'

In [16]:
gallery = page.find('div',{'class':'gallery-inline__slides'})

In [17]:
all_data = gallery.find_all("div",{'class':'image image__hide-placeholder'})

In [18]:
data_list_dict = []

In [19]:
for i in all_data:
    image = i.find('div',{'class':'image__container'}).find('picture',{'class':'image__picture'}).find('img').attrs['src']
    name = i.find('div',{'class':'image__metadata'}).find('div',{'class':'image__caption attribution'}).find('span').text.split(":")[0]
    cap = i.find('div',{'class':'image__metadata'}).find('div',{'class':'image__caption attribution'}).find('span').text.split(":")[1]

    if name == "World's most scenic airports":
        name = "10. Nadi International Airport, Fiji"

    data_dict = {
        'Name':name,
        'Caption':cap,
        'Image':image
    }

    data_list_dict.append(data_dict)

In [20]:
os.makedirs("scraped_images", exist_ok=True)

In [21]:
for index,value in enumerate(data_list_dict):
    response = requests.get(value['Image'])

    filename = f"image_{index+1}.jpg"

    file_path = os.path.join("scraped_images", filename)

    with open(file_path, "wb") as f:
        f.write(response.content)

    print("Downloaded:", file_path)

Downloaded: scraped_images\image_1.jpg
Downloaded: scraped_images\image_2.jpg
Downloaded: scraped_images\image_3.jpg
Downloaded: scraped_images\image_4.jpg
Downloaded: scraped_images\image_5.jpg
Downloaded: scraped_images\image_6.jpg
Downloaded: scraped_images\image_7.jpg
Downloaded: scraped_images\image_8.jpg
Downloaded: scraped_images\image_9.jpg
Downloaded: scraped_images\image_10.jpg


In [22]:
doc = Document()

In [23]:
head = doc.add_heading(title,level=1)
head.alignment = WD_ALIGN_PARAGRAPH.CENTER
doc.add_heading("by"+" "+byline_name,level=3)
doc.add_heading(updated,level=3)
doc.add_paragraph('')
doc.add_heading(head_content,level=2)
for i in content:
    doc.add_paragraph(i)
doc.add_paragraph('')
doc.add_heading(top_10_text,level=1)
doc.add_paragraph('')
for index,value in enumerate(data_list_dict):
    doc.add_picture(f'scraped_images\image_{index+1}.jpg', width=Inches(5), height=Inches(3))
    doc.add_heading(value['Name'][3:],level=3)
    doc.add_paragraph(value['Caption'])
doc.save("scraped_document.docx")

  doc.add_picture(f'scraped_images\image_{index+1}.jpg', width=Inches(5), height=Inches(3))
