In [76]:
# Import libraries/dependencies
import pandas as pd 
import os
import requests
from splinter import Browser
from bs4 import BeautifulSoup as bs 
import datetime as dt
import time
import pymongo

In [77]:
# Initialize PyMongo to work with MongoDB local server
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [78]:
# Define database and collection
db = client.surfline_db
collection = db.news

In [79]:
# Define the Surfline blog page URL
url = 'https://www.surfline.com/surf-news'

# Retrive the page with the request module
response = requests.get(url)

# Convert the response to text to obtain the html
html = response.text

In [80]:
### We will use a Chrome browser to view and download the HTML contents of the website using the Splinter library
# Establish Chrome driver executable path. Make sure to dfine actual location on your drive.
executable_path = {'executable_path': 'C:/Users/jaysu/chromedriver'}

# Open a splinter browser
browser = Browser('chrome', **executable_path, headless=False)

In [81]:
# Visit the defined URL on your splinter browser
browser.visit(url)
time.sleep(5)

In [82]:
### We will use the BeautifulSoup Library to scrape and parse the HTML
# Create a BeautifulSoup object with the splinter broswer.html method to parse the html with 'html.parser' or 'lmxl' formats
soup = bs(browser.html, 'html.parser')

In [83]:
### Create a list of weblinks of each news article that we will iterate through
# Find all the image cards that hold the links to the stories with the BSoup object
news_links = soup.find_all('div', class_='quiver-feed-card__image')

# Print the number of articles and the first element contains element contents desired
print(f'Number of stories {len(news_links)}')
print(f'Example of the first html element block:')
print(news_links[3])

Number of stories 85
Example of the first html element block:
<div class="quiver-feed-card__image"><a aria-label="Feed Card Image" href="https://www.surfline.com/surf-news/watch-tom-carroll-matt-grainger-break-tow-foiling-glide-empty-outside-sydney-reef/100137" style='background-image: url("https://www.surfline.com/cdn-cgi/image/w=740,q=85,f=auto,fit=contain/https://d14fqx6aetz9ka.cloudfront.net/wp-content/uploads/2020/10/13192933/TC_Surfline_foil_oct20.jpg");' tabindex="0" target=""></a></div>


In [85]:
### We now begin the scraping of the webpage with a for loop and uploading the data to a MongoDB database
# Begin for loop through the news_links list, create empty lists to populate scrape, and insert_one() to the collection at each iteration
for item in news_links:
    # title = []
    p_text = [] # the articles text contents
    tags_text = [] # the tags of the article
    date2 = []
    url = item.find('a')['href']
    browser.visit(url)
    time.sleep(1)
    soup1 = bs(browser.html, 'html.parser')
    title = soup1.find('h1').text # title of article
    p_copy = soup1.find_all('p')
    for copy in p_copy:
        p_text.append(copy.text)
   
    try:
        tags = soup1.find('ul', class_='sl-article-tags')
        for tag in tags.find_all('li'):
            tags_text.append(tag.text)
    except:
        print("No tags")

    try:
        element = soup1.find('div', class_='sl-editorial-author__details__date')
        date_update = element.text.split('.', maxsplit=1)
        date2 = date_update[0]
    except:
        print('No date')

    post = {
        'title': title,
        'p_text': p_text,
        'tags_text': tags_text,
        'date': date2
    }
    print(f'Scraped news article: {date2} {title}')
    print('--------------------------------')
    collection.insert_one(post)

No date
Scraped news article: [] Russo Cam: A watery look into the North Shore’s "secret society."
--------------------------------
No tags
Scraped news article: Nov 2nd, 2020 Watch: Meet Kehu Butler, the 20-Year-Old Rising Surf Star From Mount Maunganui, NZ
--------------------------------
Scraped news article: Oct 17th, 2020 With No 2020 Olympics, How're the Surfers Feeling?
--------------------------------
No tags
Scraped news article: Oct 13th, 2020 Watch Tom Carroll & Matt Grainger Break Down Tow-Foiling and Glide an Empty Outside Sydney Reef
--------------------------------
No tags
Scraped news article: Jan 18th, 2021 A Powerful South and SSE Groundswell Is About To Rock the Australian East Coast
--------------------------------
No tags
Scraped news article: Jan 15th, 2021 A Ripping New Rage Vid, Pete Mel's Stoke Is Everyone's Stoke, and Maverick's on Skis?
--------------------------------
No tags
Scraped news article: Jan 13th, 2021 RIP: Gordon James Philipson (1938-2021)
------

In [10]:
# Uncommment code below to test to see what the "post" dictionary looks like
# post