In [30]:
# website scraping

In [31]:
import re
from datetime import date

import dateparser
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

to do list:


- ~~if a post contains class="more-link", follow the link and use the linked page to update the post~~

- refactor the functions, and use a class to organise them
- search for the report date based on the post date + day of the week (more flexible)
- improve the docstring quality
- find a way to split the wetsuit info into shoe, wetsuit, glove and hood
- exception handling

In [32]:
def read_posts(url, timeout=30):
    """
    Download a page and return every element whose id matches "post".

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed unchanged to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    list-like
        Whatever ``soup.find_all(id=re.compile("post"))`` yields for the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (a timeout included).
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would stall the whole scraping run.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # The pattern is unanchored, so any id containing "post" qualifies.
    posts = soup.find_all(id=re.compile("post"))

    return posts


def forecast_parser(post):
    """
    Split a forecast section into per-day entries using its <strong> headings.

    Each non-empty <strong> tag is treated as the heading of one forecast
    entry: its text is parsed as a date, its first word (lower-cased) is taken
    as the day of the week, and the full text of the tag's parent element is
    taken as the entry's content.

    Parameters
    ----------
    post : object exposing ``find_all``
        Presumably the BeautifulSoup element holding the forecast section;
        verify against the caller.

    Returns
    -------
    tuple of (list, list, list)
        ``(dates, contents, days_of_week)``, three lists of equal length with
        one item per non-empty <strong> tag, in document order. ``dates``
        holds whatever ``dateparser.parse`` returns for each heading.
    """
    dates = []
    contents = []
    days_of_week = []

    for tag in post.find_all('strong'):
        heading = tag.get_text()
        heading_words = heading.split()

        if not heading_words:
            # An empty or whitespace-only <strong> carries no weekday; skip it
            # rather than fail with IndexError on heading_words[0].
            continue

        # dates need to be corrected using the year specified in the post
        dates.append(dateparser.parse(heading))
        days_of_week.append(heading_words[0].lower())
        contents.append(tag.parent.get_text())

    return dates, contents, days_of_week


def post_to_df(post):
    """
    Convert one scraped post into a DataFrame with one row per forecast entry.

    Parameters
    ----------
    post : object supporting ``post['id']`` and ``post.find_all(...)``
        Presumably the BeautifulSoup element returned by ``get_post``; verify
        against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns ``report_date``, ``report_content``, ``day_of_week`` (one row
        per entry found by ``forecast_parser``) plus the per-post constants
        ``post_date``, ``post_id``, ``post_title``, ``wetsuit_adviezen_str``
        and the per-day lookup ``wetsuit_adviezen``. ``wetsuit_adviezen`` is
        null for days without advice, and for every row when the post has no
        wetsuit paragraph or it cannot be parsed.

    Raises
    ------
    IndexError
        If the post has no <h2> or no element with id ``surf-weerbericht``
        (assuming ``find_all`` returns an empty sequence in that case).
    """
    post_id = post['id']
    post_title = post.find_all('h2')[0].get_text()
    # The first word of the title is dropped; the remainder is parsed as a date.
    post_date = dateparser.parse(' '.join(post_title.split()[1:]))

    forecasts = post.find_all(id='surf-weerbericht')[0]
    forecast_dates, forecast_contents, days_of_week = forecast_parser(forecasts)

    df = pd.DataFrame({'report_date': forecast_dates,
                       'report_content': forecast_contents,
                       'day_of_week': days_of_week})

    wetsuit_adviezen_str = wetsuit_recommendation(forecasts)

    df['post_date'] = post_date  # year needs to be updated
    df['post_id'] = post_id
    df['post_title'] = post_title
    df['wetsuit_adviezen_str'] = wetsuit_adviezen_str

    # No advice paragraph is an expected case, so test for it explicitly
    # instead of relying on an exception from str_to_dict(None).
    advice_by_day = None
    if wetsuit_adviezen_str is not None:
        try:
            advice_by_day = str_to_dict(wetsuit_adviezen_str)
        except Exception:
            # Best effort: an unparseable advice paragraph must not discard
            # the forecast rows, so the advice column is left empty.
            advice_by_day = None

    if advice_by_day:
        lookup = dict(advice_by_day)
        # A forecast entry headed "weekend" uses Saturday's advice when the
        # advice text has no explicit weekend line. The original attempted
        # this with df.replace({"weekend", 'zaterdag'}), which passes a set
        # rather than a mapping.
        if 'weekend' not in lookup and 'zaterdag' in lookup:
            lookup['weekend'] = lookup['zaterdag']
        # map() yields a null for days without advice, whereas a dict-based
        # replace() would leave the weekday name in the advice column.
        df['wetsuit_adviezen'] = df['day_of_week'].map(lookup)
    else:
        df['wetsuit_adviezen'] = None

    return df

def wetsuit_recommendation(forecasts):
    """
    Return the text of the first paragraph that mentions wetsuit advice.

    Every <p> found under ``forecasts`` is normalised (each run of non-word
    characters collapsed to a single space, then lower-cased) and checked for
    any of the known heading phrases. The original, un-normalised text of the
    first matching paragraph is returned; ``None`` when nothing matches.
    """
    key_words = ['wetsuit adviezen', 'Wetsuit advies', "Wetsuit mm advies", 'Wetsuit diktes']
    needles = [word.lower() for word in key_words]

    for paragraph in forecasts.find_all('p'):
        normalised = re.sub(r'\W+', ' ', paragraph.get_text()).lower()

        for needle in needles:
            if needle in normalised:
                # First hit wins; hand back the paragraph as written.
                return paragraph.get_text()

    return None


def str_to_dict(wetsuit_adviezen):
    """
    Convert a wetsuit-advice text into a dict keyed by lower-cased day name.

    The text is read line by line. A line of the form ``"<day>: <advice>"``
    becomes ``{"<day>": "<advice>"}``; everything after the first ``": "`` is
    kept, so advice that itself contains ``": "`` is not truncated. A line
    without ``": "`` (for example a heading) is stored with the value
    ``None``. Blank lines are ignored. The result is passed through
    ``update_weekend`` before being returned.

    Example: ``"Maandag: 5/4mm\\nDinsdag: 4/3mm"`` gives
    ``{"maandag": "5/4mm", "dinsdag": "4/3mm"}``.

    Parameters
    ----------
    wetsuit_adviezen : str
        Newline-separated advice text.

    Returns
    -------
    dict
        Mapping of stripped, lower-cased line label to advice text or ``None``.

    Raises
    ------
    AttributeError
        If ``wetsuit_adviezen`` is not a string (e.g. ``None``).
    """
    advice_by_day = {}

    for line in wetsuit_adviezen.split('\n'):
        if not line.strip():
            # A blank line would only add a meaningless '' key.
            continue

        # partition splits on the FIRST ': ' only, so the advice text keeps
        # any later ': ' it contains.
        label, separator, advice = line.partition(': ')
        advice_by_day[label.strip().lower()] = advice if separator else None

    return update_weekend(advice_by_day)


def update_weekend(d):
    """
    Copy the advice stored under 'weekend' onto the individual weekend days.

    When ``d`` has a 'weekend' key, its value is written to 'zaterdag' and
    'zondag' (overwriting any existing entries) and to 'vrijdag' only if
    'vrijdag' is not already present. ``d`` is modified in place and also
    returned; without a 'weekend' key it is returned untouched.
    """
    if 'weekend' not in d:
        return d

    weekend_advice = d['weekend']

    # Saturday and Sunday always follow the weekend entry ...
    d.update(zaterdag=weekend_advice, zondag=weekend_advice)
    # ... while Friday keeps its own advice when it has one.
    d.setdefault('vrijdag', weekend_advice)

    return d

def get_href(url, timeout=30):
    """
    Collect the forecast-post anchors found on a listing page.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed unchanged to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    list-like
        The <a> elements with ``rel="bookmark"`` whose string matches the
        pattern ``'Surfweer*'``. Callers read the target address with
        ``.get('href')``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (a timeout included).
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # `string=` is the current name of the filter that older BeautifulSoup
    # releases called `text=`.
    # NOTE(review): as a regex, 'Surfweer*' means "Surfwee" followed by zero
    # or more "r", not a glob-style prefix; left unchanged to keep the same
    # set of matches.
    hrefs = soup.find_all('a', string=re.compile('Surfweer*'), rel='bookmark')
    return hrefs

def get_post(post_url, timeout=30):
    """
    Download a single post page and return its ``<div class="post">`` element.

    Parameters
    ----------
    post_url : str
        Address of the post page.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed unchanged to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    object or None
        The result of ``soup.find('div', class_="post")``; ``None`` when the
        page has no such element, so callers should be prepared for that.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (a timeout included).
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    page = requests.get(post_url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    post = soup.find('div', class_="post")

    return post

## test replace with dict 

Correct the "meer info" (read more) issue: some posts are not shown in full on the listing pages.
- To resolve this issue, I will first read the hyperlink of each post.
- Then use the hyperlink to retrieve the full post.


In [33]:
# Collect the address of every forecast post listed on pages 1..10.
post_urls = []
page_numbers = np.arange(1, 11)

for page_number in page_numbers:
    page_url = 'http://surfweer.nl/surf/page/{:}/'.format(page_number)
    hrefs = get_href(page_url)
    # Add this page's links directly to the flat list.
    post_urls.extend(href.get('href') for href in hrefs)

In [34]:
# Download each post and turn it into a frame tagged with its source address.
dfs = []

for post_url in post_urls:
    post = get_post(post_url)
    df = post_to_df(post).assign(post_url=post_url)
    dfs.append(df)

In [35]:
# Stack the per-post frames into one table. Each frame's own row labels are
# kept here (presumably repeating per post); the index is reset in a later cell.
df_combined = pd.concat(dfs)

In [36]:
# Replace the row labels with a fresh 0..n-1 index; the previous labels
# are kept as an ordinary column.
df_combined = df_combined.reset_index()

In [37]:
# Spot-check: show the raw wetsuit-advice text stored for the row labelled 4.
df_combined.loc[4,'wetsuit_adviezen_str']

'Wetsuit advies deze week\nMaandag: 6/5/4mm wetsuit, want 3mm, schoen 6mm, cap 2mm\nDinsdag: Nieuwe 5/4mm wetsuit, schoen 6mm en want 3mm, cap 2mm\nWoensdag: wetsuit 6mm of 5/4mm cap +1mm, hand 5mm, schoen 5mm\nDonderdag: Min. 5/4mm dikte schoen 6mm, hand 4mm, cap 3mm'

In [None]:
# df_combined

In [39]:
# `date` comes from the notebook's top import cell, so every dependency is
# declared in one place.
# The run date is used below to stamp the exported file name.
today = date.today()
print("Today's date:", today)

Today's date: 2021-01-26


In [40]:
# Write the combined table to a CSV stamped with the run date (YYYY_MM_DD).
date_tag = str(today).replace('-', '_')
csv_name = 'surfweer_data_{:}.csv'.format(date_tag)
df_combined.to_csv(csv_name)