<a href="https://colab.research.google.com/github/kleczekr/dtc/blob/main/dtc_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Code for scraping the page of Days to Come

The following code scrapes the articles of Days to Come, an online magazine published by the tour marketplace TourRadar.

In [None]:
import urllib.error
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scraping links of the articles

In [None]:
# URL prefix of the paginated "latest posts" listing; the scraping cell below
# appends a page number to it to build the address of each listing page
base_link = 'https://www.tourradar.com/days-to-come/latest-posts/page/'

In [None]:
# accumulator for the article URLs harvested from the listing pages; it may
# contain duplicates, which are removed in a later cell
dtc_article_links = []

In [None]:
# Walk the paginated listing one page at a time and collect the link of every
# article found. The page counter is defined here explicitly, so the cell does
# not depend on a variable left over in the kernel.
page_number = 1  # assumes the listing pages are numbered from 1 - TODO confirm
seen_links = set()
while True:
  link = base_link + str(page_number)
  try:
    # the context manager closes the HTTP response once the page is parsed
    with urllib.request.urlopen(link) as page:
      soup = BeautifulSoup(page, 'html.parser')
  except urllib.error.HTTPError:
    # presumably the site answers with an HTTP error past the last page;
    # either way there is nothing more to read from the listing
    break
  page_links = [
      element['href']
      for element in soup.find_all('a', attrs={'rel': 'bookmark'})
      if element.has_attr('href')
  ]
  # a page with no links, or with only links already collected, means the end
  # of the listing was reached; this also guards against looping forever if
  # the site keeps serving the last page for out-of-range page numbers
  if seen_links.issuperset(page_links):
    break
  seen_links.update(page_links)
  # duplicates within a page are kept here; they are removed in the next cell
  dtc_article_links.extend(page_links)
  page_number += 1

In [None]:
# the scraping cell above retrieves many duplicates; dict.fromkeys keeps the
# first occurrence of every link in its original order, and does so in a
# single pass instead of re-scanning the result list for every link
dtc_links = list(dict.fromkeys(dtc_article_links))

### Scraping the article pages

In [None]:
# the columns of the final dataframe, fixed up front so that the scraping
# function below has a known shape to fill in
column_names = [
    'title',
    'author',
    'link',
    'paragraph_number',
    'categories',
    'tags',
    'metadescription',
    'header',
    'image_caption',
    'image_alttext',
    'paragraph_text',
]

In [None]:
# scraper for the pages of individual articles; the selectors used below were
# arrived at by trial and error
def article_dataframize(article_link):
  '''
  Scrape one Days to Come article into a Pandas dataframe with one row per
  paragraph, using the column names defined in the cell above.

  The function can't be reused for sites other than the Days to Come internet
  magazine, as it is adjusted specifically to its article structure.

  Parameters
  ----------
  article_link : str
      URL address of the article.

  Returns
  -------
  pandas.DataFrame
      One row per <p> tag found inside the 'entry-content' div. Every row
      repeats the article-level fields (title, author, link, categories, tags,
      metadescription) and carries the text of the last <h2> that precedes
      the paragraph. The caption and alt text of a <figure> are written to the
      row of the paragraph the figure follows and to every later row until
      another figure is met. Fields that cannot be found get a 'no ...'
      placeholder. When no paragraph is found at all, a single row with
      paragraph_number 0 and the text 'no visible paragraph structure' is
      returned.

  Raises
  ------
  requests.exceptions.RequestException
      If the page cannot be fetched, including when the timeout expires.
  '''
  # what bs4 lookups raise when an expected tag or attribute is absent:
  # attribute access or indexing on None, a missing attribute key, a too-short
  # .contents list
  missing_markup = (AttributeError, IndexError, KeyError, TypeError)

  # the timeout keeps one unresponsive page from stalling the whole scrape
  response = requests.get(article_link, timeout=30)
  soup = BeautifulSoup(response.content, 'html.parser')

  heading = soup.find('h1')
  title = heading.text if heading is not None else 'no title'
  try:
    author = soup.find('div', attrs={'class': 'post-author__name black'}).find('a').contents[2].strip()
  except missing_markup:
    author = 'no author mentioned'
  try:
    categories = [item.get_text() for item in soup.find('span', attrs={'class': 'cat-links'}).find_all('a')]
  except missing_markup:
    categories = ['no categories']
  try:
    tags = [item.get_text() for item in soup.find('span', attrs={'class': 'tags-links'}).find_all('a')]
  except missing_markup:
    tags = ['no tags']
  try:
    metadescription = soup.find('meta', attrs={'name': 'description'})['content']
  except missing_markup:
    metadescription = 'no metadescription'

  # fields that are identical in every row of the article
  article_fields = {
      'title': title,
      'author': author,
      'link': article_link,
      'categories': categories,
      'tags': tags,
      'metadescription': metadescription,
  }

  paragraph_number = 0
  header = 'no header'
  image_caption = 'no image'
  image_alttext = 'no image'
  rows = []

  content = soup.find('div', attrs={'class': 'entry-content'})
  # find_all(True) yields every descendant tag in document order
  descendants = content.find_all(True) if content is not None else []
  for child in descendants:
    if child.name == 'h2':
      # only affects paragraphs that come after this heading
      header = child.text
    elif child.name == 'p':
      paragraph_number += 1
      rows.append(dict(
          article_fields,
          paragraph_number=paragraph_number,
          header=header,
          image_caption=image_caption,
          image_alttext=image_alttext,
          paragraph_text=child.text,
      ))
    elif child.name == 'figure':
      try:
        image_alttext = child.find('img')['alt']
      except missing_markup:
        image_alttext = 'no alt text'
      caption = child.find('figcaption')
      # a figure without a caption must not abort the rest of the article
      image_caption = caption.text if caption is not None else 'no caption'
      if rows:
        # attach the image to the paragraph it follows
        rows[-1]['image_caption'] = image_caption
        rows[-1]['image_alttext'] = image_alttext

  if not rows:
    rows.append(dict(
        article_fields,
        paragraph_number=paragraph_number,
        header=header,
        image_caption=image_caption,
        image_alttext=image_alttext,
        paragraph_text='no visible paragraph structure',
    ))

  # building the frame once from the collected records avoids growing it
  # row by row
  return pd.DataFrame(rows, columns=column_names)

In [None]:
# start from an empty dataframe that already has the expected columns; the
# scraping loop in the next cell fills it with the rows of every article
df = pd.DataFrame(columns=column_names)

In [None]:
# Scrape every article and assemble the combined dataframe. The per-article
# frames are gathered in a list and concatenated once at the end:
# DataFrame.append is no longer available in pandas 2.x, and appending inside
# the loop copied the whole frame on every iteration. Building the result from
# scratch also makes the cell safe to re-run without duplicating rows.
article_frames = []
rows_so_far = 0
for item in dtc_links:
  print('Commencing dataframization of the link:\n'+item+'\n')
  article_frame = article_dataframize(item)
  article_frames.append(article_frame)
  rows_so_far += len(article_frame)
  print('Completed dataframization of the link:\n'+item+'\n'+'*-'*30)
  print('The dataframe has now the length of ' + str(rows_so_far) + ' rows!!\n\n' + '*-'*30+'\n')

# pd.concat raises on an empty list, so the existing `df` is left as it is
# when there was nothing to scrape
if article_frames:
  df = pd.concat(article_frames, ignore_index=True)

Commencing dataframization of the link:
https://www.tourradar.com/days-to-come/4-ways-to-explore-poland-outside-its-hotspots/



  arr_value = np.array(value)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Commencing dataframization of the link:
https://www.tourradar.com/days-to-come/cuba-vs-jamaica/

Completed dataframization of the link:
https://www.tourradar.com/days-to-come/cuba-vs-jamaica/
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
The dataframe has now the length of 8455 rows!!

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

Commencing dataframization of the link:
https://www.tourradar.com/days-to-come/best-countries-for-vegan-travellers/

Completed dataframization of the link:
https://www.tourradar.com/days-to-come/best-countries-for-vegan-travellers/
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
The dataframe has now the length of 8470 rows!!

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

Commencing dataframization of the link:
https://www.tourradar.com/days-to-come/amalfi-coast-vs-tuscany/

Completed dataframization of the link:
https://www.tourradar.c

In [None]:
# keep only the rows whose paragraph text is not an empty string
has_text = df['paragraph_text'].ne('')
df = df.loc[has_text]