# Extracting the data

### Import the libraries

In [None]:
# to get the URL
import requests

# to parse the HTML DOM
from bs4 import BeautifulSoup

# patience is a virtue
import time
import glob

### Load a page

In [None]:
# the downloaded file
file = 'data/PBS/economy-france-threatens-new-rules-on-facebook-as-zuckerberg-visits.html'

# parse the file
# opened in binary mode so BeautifulSoup detects the page encoding itself
# instead of relying on the platform default; `with` closes the handle
with open(file, 'rb') as handle:
  soup = BeautifulSoup(handle, 'html.parser')

# get the title
# the fallback guarantees the name is always bound, so a missing tag cannot
# raise a NameError further down or silently reuse a value from an earlier run
title_tag = soup.find('meta', property='og:title')
title = title_tag['content'] if title_tag is not None else None

# get the description (same idea, with a readable placeholder)
description_tag = soup.find('meta', property='og:description')
description = description_tag['content'] if description_tag is not None else 'No description found'

# article:published_time
published_time = soup.find('meta', property='article:published_time')['content']

# article:section
section = soup.find('meta', property='article:section')['content']

# article:tag (one string, split on ', ' into a list)
tags = soup.find('meta', property='article:tag')['content'].split(', ')

# content
content = soup.find(class_='body-text')

# paragraphs
paragraphs = content.find_all('p')

# keep the text of every paragraph, except those containing 'WATCH:'
body = [text for text in (p.get_text() for p in paragraphs) if 'WATCH:' not in text]

data = {
  'title': title,
  'description': description,
  'published_time': published_time,
  'tags': tags,
  'content': body
}

data

### Create a function from this

In [None]:
def _meta_content(soup, prop, default=None):
  """Return the `content` of the <meta property=prop> tag, or `default` if absent."""
  tag = soup.find('meta', property=prop)
  if tag is None:
    return default
  return tag.get('content', default)


def extract(file):
  """Parse one saved article page and return its metadata and body text.

  Args:
    file: path of the HTML file to read.

  Returns:
    A dict with the keys:
      'title'          -- og:title content, or None when the tag is missing
      'description'    -- og:description content, or 'No description found'
      'published_time' -- article:published_time content, or None
      'tags'           -- article:tag content split on ', ' (empty list when
                          the tag is missing or empty)
      'content'        -- text of every <p> inside the 'body-text' element,
                          skipping paragraphs that contain 'WATCH:' (empty
                          list when that element is missing)
  """
  # parse the file
  # binary mode leaves the decoding to BeautifulSoup, which detects the page
  # encoding itself; `with` makes sure the handle is closed again
  with open(file, 'rb') as handle:
    soup = BeautifulSoup(handle, 'html.parser')

  # sometimes an element simply does not exist, and indexing the missing tag
  # would break the code; every lookup therefore goes through _meta_content,
  # which returns a fallback instead of raising
  title = _meta_content(soup, 'og:title')
  description = _meta_content(soup, 'og:description', 'No description found')
  published_time = _meta_content(soup, 'article:published_time')

  # article:tag holds all tags in one string
  raw_tags = _meta_content(soup, 'article:tag')
  tags = raw_tags.split(', ') if raw_tags else []

  # content
  content = soup.find(class_='body-text')

  # paragraphs
  paragraphs = content.find_all('p') if content is not None else []

  # keep the text of every paragraph, except those containing 'WATCH:'
  body = [text for text in (p.get_text() for p in paragraphs) if 'WATCH:' not in text]

  return {
    'title': title,
    'description': description,
    'published_time': published_time,
    'tags': tags,
    'content': body
  }

### Iterate through all the files

In [None]:
# load html files
# glob returns the paths in arbitrary order, so sort them to make the selected
# sample the same on every run and on every machine
files = sorted(glob.glob('data/PBS/*.html'))

# iterate over a small sample of the files
# NOTE(review): the slice starts at index 1, so the first file is skipped
# (29 files at most) -- confirm this is intended and not meant to be files[:30]
for file in files[1:30]:
  data = extract(file)
  print(data)