In [34]:
# Import libraries/dependencies
import pandas as pd 
import os
import requests
from splinter import Browser
from bs4 import BeautifulSoup as bs 
import datetime as dt
import time
import pymongo

In [35]:
# Initialize PyMongo to work with MongoDB local server
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [36]:
# Define database and collection
db = client.surfline
collection = db.news

In [37]:
# Define the Surfline blog page URL
url = 'https://www.surfline.com/surf-news'

# Retrive the page with the request module
response = requests.get(url)

# Convert the response to text to obtain the html
html = response.text

In [38]:
### We will use a Chrome browser to view and download the HTML contents of the website using the Splinter library
# Establish Chrome driver executable path. Make sure to dfine actual location on your drive.
executable_path = {'executable_path': 'C:/Users/jaysu/chromedriver'}

# Open a splinter browser
browser = Browser('chrome', **executable_path, headless=False)

In [39]:
# Visit the defined URL on your splinter browser
browser.visit(url)
time.sleep(5)

KeyboardInterrupt: 

In [None]:
### We will use the BeautifulSoup Library to scrape and parse the HTML
# Create a BeautifulSoup object with the splinter broswer.html method to parse the html with 'html.parser' or 'lmxl' formats
soup = bs(browser.html, 'html.parser')

In [44]:
### Create a list of weblinks of each news article that we will iterate through
# Find all the image cards that hold the links to the stories with the BSoup object
news_links = soup.find_all('div', class_='quiver-feed-card__image')

# Print the number of articles and the first element contains element contents desired
print(f'Number of stories {len(news_links)}')
print(f'Example of the first html element block:')
print(news_links[3])

Number of stories 85
Example of the first html element block:
<div class="quiver-feed-card__image"><a aria-label="Feed Card Image" href="https://www.surfline.com/surf-news/watch-tom-carroll-matt-grainger-break-tow-foiling-glide-empty-outside-sydney-reef/100137" style='background-image: url("https://www.surfline.com/cdn-cgi/image/w=740,q=85,f=auto,fit=contain/https://d14fqx6aetz9ka.cloudfront.net/wp-content/uploads/2020/10/13192933/TC_Surfline_foil_oct20.jpg");' tabindex="0" target=""></a></div>


In [75]:
####### INCLUDE THE SCRAPPING OF THE DATA OF EACH ARTICLE


### We now begin the scraping of the webpage with a for loop and uploading the data to a MongoDB database
# Begin for loop through the news_links list, create empty lists to populate scrape, and insert_one() to the collection at each iteration
for item in news_links[1:3]:
    # title = []
    p_text = [] # the articles text contents
    tags_text = [] # the tags of the article
    date2 = []
    url = item.find('a')['href']
    browser.visit(url)
    time.sleep(1)
    soup1 = bs(browser.html, 'html.parser')
    title = soup1.find('h1').text # title of article
    p_copy = soup1.find_all('p')
    for copy in p_copy:
        p_text.append(copy.text)
   
    try:
        tags = soup1.find('ul', class_='sl-article-tags')
        for tag in tags.find_all('li'):
            tags_text.append(tag.text)
    except:
        print("No tags")

    try:
        element = soup1.find('div', class_='sl-editorial-author__details__date')
        date_update = element.text.split('.', maxsplit=1)
        date2 = date_update[0]
    except:
        print('No date')

    post = {
        'title': title,
        'p_text': p_text,
        'tags_text': tags_text,
        'date': date2
    }
    print(f'Scraped news article: {date2} {title}')
    print('--------------------------------')
    collection.insert_one(post)
post

No tags
Scraped news article: Nov 2nd, 2020 Watch: Meet Kehu Butler, the 20-Year-Old Rising Surf Star From Mount Maunganui, NZ
--------------------------------
Scraped news article: Oct 17th, 2020 With No 2020 Olympics, How're the Surfers Feeling?
--------------------------------


{'title': "With No 2020 Olympics, How're the Surfers Feeling?",
 'p_text': ['Quickly access the spots you care about most.',
  'Two historic things: Surfing’s inaugural debut in the Olympics; the Olympics being postponed. Never in the 124-year history of the Games has either happened.',
  'Then came 2020.',
  'So, how have future Olympic surfers been dealing with this rollercoaster of the pandemic and potential Olympic glory? Surprisingly well, actually. In the video above, produced by the International Surfing Association (ISA), we check in with two gold medal hopefuls, Jordy Smith and Sally Fitzgibbons – both of whom are slated to represent their home countries in the now 2021 Tokyo Games – to hear how they’ve been spending this bizarro year.',
  'Jordy Smith at the 2019 ISA World Surfing Games. Photo: ISA/Ben Reed',
  'For Jordy: “Before COVID hit, I was in Hawaii. I was over there training and surfing. At that point I had to make the decision to get back to South Africa as the Unit

In [61]:
post['date']

['Nov 2nd, 2020', ' Updated 3 months ago.']

In [52]:
url = news_links[1].find('a')['href']
url

'https://www.surfline.com/surf-news/watch-meet-kehu-butler-20-year-old-rising-surf-star-mount-maunganui-nz/102380'

In [54]:
browser.visit(url)

In [10]:
# Uncommment code below to test to see what the "post" dictionary looks like
# post

In [21]:
test = news_links[2].find('div', class_="sl-editorial-author__details__date")

In [22]:
print(test)

None


In [23]:
news_links[2]

<div class="quiver-feed-card__image"><a aria-label="Feed Card Image" href="https://www.surfline.com/surf-news/no-2020-olympics-howre-surfers-feeling/100712" style='background-image: url("https://www.surfline.com/cdn-cgi/image/w=740,q=85,f=auto,fit=contain/https://d14fqx6aetz9ka.cloudfront.net/wp-content/uploads/2020/10/16161552/RSA_Jordy_Smith_ISA_Ben_Reed-10.jpg");' tabindex="0" target=""></a></div>

In [45]:
for item in news_links[1]:
    # title = []
    date_article = []
    p_text = [] # the articles text contents
    tags_text = [] # the tags of the article
    url = item.find('a')['href']
    browser.visit(url)
    time.sleep(1)
    soup1 = bs(browser.html, 'html.parser')
    title = soup1.find('h1').text # title of article
    p_copy = soup1.find_all('p')
    for copy in p_copy:
        p_text.append(copy.text)
    try:
        tags = soup1.find('ul', class_='sl-article-tags')
        for tag in tags.find_all('li'):
            tags_text.append(tag.text)
    except:
        print("No tags")
    try:
        element = soup.find('div', class_='sl-editorial-author__details__date')
        date_update = element.text.split('.', maxsplit=1)
        date = date_update[0]
    except:
        print('No date')
    post = {
        'title': title,
        'p_text': p_text,
        'tags_text': tags_text,
    }
    print(f'Scraped news article: {title}')
    print('--------------------------------')
    # collection.insert_one(post)
post

TypeError: 'NoneType' object is not subscriptable

In [72]:
url2 = 'https://www.surfline.com/surf-news/watch-meet-kehu-butler-20-year-old-rising-surf-star-mount-maunganui-nz/102380'
browser.visit(url2)
# time.sleep(1)
soup1 = bs(browser.html, 'html.parser')
element = soup1.find('div', class_='sl-editorial-author__details__date')
element

<div class="sl-editorial-author__details__date">Nov 2nd, 2020. <span><span class="sl-editorial-author__details__date__updated">Updated</span> 3 months ago.</span></div>

In [73]:
date = element.text.split('.', maxsplit=1)
date = date[0]
date

'Nov 2nd, 2020'