In [1]:
# website scraping

In [2]:
import requests
from bs4 import BeautifulSoup
import dateparser
import re
import pandas as pd
import numpy as np

to do list:


- search for report date based on the post date + day of the week (more flexible)

In [57]:
def read_posts(url):
    """
    extract all the posts on each webpage
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    posts = soup.find_all(id=re.compile("post"))

    return posts


def report_parser(post):
    """
    loop through each paragrah using the headings contained in the tag <strong>
    """
    tags = post.find_all('strong')

    dates = []
    contents = []
    days_of_week = []

    for tag in tags:
        # dates needs to be corrected using the year specified in the post
        date = dateparser.parse(tag.get_text())
        dayofweek = tag.get_text().split()[0].lower()
        dates.append(date)
        days_of_week.append(dayofweek)

        content = tag.parent.get_text()
        contents.append(content)

    return dates, contents, days_of_week


def post_to_df(post):
    """
    extract data from each post, write to dataframe 
    """
    post_id = post['id']
    post_title = post.find_all('h2')[0].string
    post_date = dateparser.parse(' '.join(post_title.split()[1::]))

    report_dates, report_contents, days_of_week = report_parser(post)

    #  store the data in a dataframe
    df = pd.DataFrame({'report_date': report_dates,
                       'report_content': report_contents,
                       'day_of_week': days_of_week})

    wetsuit_adviezen_str = wetsuit_recommendation(report_contents)
    wetsuit_adviezen_dict = str_to_dict(wetsuit_adviezen_str)

    df['post_date'] = post_date  # year need to be udpated
    df['post_id'] = post_id
    df['post_title'] = post_title
    df['wetsuit_adviezen_str'] = wetsuit_adviezen_str
    df['wetsuit_adviezen'] = df['day_of_week']
    df.replace({"wetsuit_adviezen": wetsuit_adviezen_dict}, inplace=True)

    return df


def wetsuit_recommendation(contents):
    """
    get wetsuit recommendation
    """
    wetsuit_adviezen = None

    for content in contents:
        # to be upadated in based on the keyword variation (case sensitive)
        if 'wetsuit adviezen' in re.sub(r'\W+', ' ', content).lower():
            wetsuit_adviezen = content
    return wetsuit_adviezen


def str_to_dict(wetsuit_adviezen):
    """
    convert wetsuit_adviezen to a dictionary that uses dayofweek as the key
    """

    # ['Tests run: 1', ' Failures: 0', ' Errors: 0']
    a = wetsuit_adviezen.split('\n')

    d = {}

    for b in a:
        try:
            i = b.split(': ')
            d[i[0].lower()] = i[1]
        except:
            d[i[0].lower()] = None

    return d

In [45]:
re.sub(r'\W+', ' ', 'Wetsuit-adviezen').lower() 

'wetsuit adviezen'

In [46]:
# test parse wetsuit advices 

In [47]:
posts = read_posts(url = "http://surfweer.nl/surf/page/1/")

In [58]:
post_to_df(posts[0])

Unnamed: 0,report_date,report_content,day_of_week,post_date,post_id,post_title,wetsuit_adviezen_str,wetsuit_adviezen
0,2021-01-12,Dinsdag 12 januari\nHet blijft de hele dag waa...,dinsdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","6/5/4mm wetsuit, hand 4mm, schoen 7mm, cap 3mm"
1,2021-01-13,Woensdag 13 januari\nIneens zijn de golven twe...,woensdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","Nieuwe 5/4mm wetsuit, schoen 6mm en hand 4mm, ..."
2,2021-01-14,"Donderdag 14 januari\nDe wind is weg, maar de ...",donderdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","wetsuit 6mm of 5/4mm cap +1mm, hand 5mm, schoe..."
3,NaT,Vrijdag en het weekend\nWaar de swell donderda...,vrijdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...",vrijdag
4,NaT,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","wetsuit-adviezen,",2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","wetsuit-adviezen,"


In [34]:
urls = [ "http://surfweer.nl/surf/page/{:}/".format(page) for page in np.arange(3)+1]

In [35]:
posts = []

for url in urls: 
    posts.append(read_posts(url))
    
posts = sum(posts, []) # flatten the nested list

In [64]:
dfs = []
ii = 0
for post in posts:
    ii+=1
    print( ii)
    df = post_to_df(post)
    dfs.append(df)
    
df_combined = pd.concat(dfs)

1
2
3
4


AttributeError: 'NoneType' object has no attribute 'split'

In [60]:
post

<div class="post_alt" id="post-33019">
<h2><a href="http://surfweer.nl/surf/surfweer-vrijdag-18-december-2020/" rel="bookmark">Surfweer vrijdag 18 december 2020</a></h2>
<div class="entry">
<div id="surf-weerbericht">
<p>De wind komt vrijdag uit een iets te aflandige richting, waardoor er alleen golven zijn Noord-Holland en en gedeelte van Zuid-Holland. Dit weekend ruimt de wind naar een iets meer zuidwest richting. Wel minder hard, maar de opgebouwde swell uit het zuiden kan dan makkelijker richting land deinen. Na het weekend swell uit het noorden, op dinsdag en vooral rond de kerstdagen. Gezien de strenge lockdown maatregelen nogmaals aandacht voor de <a href="https://surfweer.nl/regels" rel="noopener" target="_blank">surfweer.nl/regels</a> in en om het water. En mocht je nog wat nodig hebben om warm de winter door te komen, <a href="https://surfshop.natural-high.nl/" rel="noopener" target="_blank">bestel op tijd</a> want op=op (code ’25FORXMAS’ is te gebruiken voor 25% nieuwe-colle

## test replace with dict 

In [36]:
df_test = post_to_df(posts[0])
wetsuit_adviezen = df_test['wetsuit_adviezen'].values[0]
wetsuit_adviezen_dict = str_to_dict(wetsuit_adviezen)
df_test.replace({"day_of_week": wetsuit_adviezen_dict})

Unnamed: 0,report_date,report_content,day_of_week,post_date,post_id,post_title,wetsuit_adviezen_str,wetsuit_adviezen
0,2021-01-12,Dinsdag 12 januari\nHet blijft de hele dag waa...,dinsdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","6/5/4mm wetsuit, hand 4mm, schoen 7mm, cap 3mm"
1,2021-01-13,Woensdag 13 januari\nIneens zijn de golven twe...,woensdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","Nieuwe 5/4mm wetsuit, schoen 6mm en hand 4mm, ..."
2,2021-01-14,"Donderdag 14 januari\nDe wind is weg, maar de ...",donderdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","wetsuit 6mm of 5/4mm cap +1mm, hand 5mm, schoe..."
3,NaT,Vrijdag en het weekend\nWaar de swell donderda...,vrijdag,2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...",vrijdag
4,NaT,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","wetsuit-adviezen,",2021-01-12,post-33062,Surfweer dinsdag 12 januari 2021,"Wetsuit-adviezen, De zee is +6 C\nDinsdag: 6/5...","wetsuit-adviezen,"


In [None]:
# df_combined.head(20)

In [None]:
# df_combined['report_date'].dt.year = df_combined['post_date'].dt.year.values

In [None]:
# df_combined.dropna()

#### test