### Every day nwac.us publishes a forecast for each area across the state, giving each area its own unique url. The urls are nested within dates as well, so this notebook will perform a scrape to generate a list of urls within each day.

### Each of these daily urls in turn contains a list of forecast urls. Once those are saved, the html of each forecast page is collected in a MongoDB database, which the next step then parses with BeautifulSoup!


## Import scrape_soup (which holds the scraping functions) along with the other module imports

## Do not start another shadow browser, and do not close the first one that opens when the module loads: the functions rely on that original window.

In [1]:
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
import pymongo
import time
import pandas as pd
import numpy as np
import random
import json
import scrape_soup as s_soup
%load_ext autoreload
%autoreload 2

### Provide a list of years to scrape, then pass the list through the collect_day_urls function

In [10]:
# range() excludes its end value, so this covers 2014 through 2019.
years = range(2014,2020)
# Browser instance handed to the scraping helpers in scrape_soup.
browser = Chrome()
# Day-level URLs returned by scrape_soup.collect_day_urls for the requested years.
daily_master_list = s_soup.collect_day_urls(years,browser)

In [11]:
# How many day-level URLs were collected.
len(daily_master_list)

1514

In [13]:
# This scrape takes quite a while. Rather than running this cell,
# just grab the file 'master_url_list.json', which is loaded right after.

# Reuse the browser that is already open: rebinding `browser` to a fresh
# Chrome() here would orphan the existing window (it is never quit), and the
# note at the top of this notebook warns against opening additional windows.
s_soup.get_mountain_urls_from_day_urls(daily_master_list, browser)

In [14]:
# Load the scraped forecast URLs (one JSON record per line).
urls_df = pd.read_json('master_url_list.json', lines=True)

# Fail fast on an empty or malformed file. Without this check an empty file
# yields a (0, 0) frame and the problem only surfaces further down as a
# KeyError on the 'url' column.
assert 'url' in urls_df.columns and len(urls_df) > 0, (
    "master_url_list.json produced no 'url' rows; "
    "re-run the URL scrape or restore the file"
)

In [15]:
# (rows, columns) of the URL table loaded from master_url_list.json.
urls_df.shape

(0, 0)

### Rather than parsing through all webpages, I will select just the Stevens Pass urls

In [17]:
# Keep only the rows whose url points at the Stevens Pass forecast zone.
is_stevens_url = urls_df['url'].str.contains('cascade-west-stevens-pass')
stevens = urls_df.loc[is_stevens_url]

In [18]:
# (rows, columns) left after keeping only the Stevens Pass urls.
stevens.shape

(1136, 1)

In [19]:
# Plain Python list of the Stevens Pass urls.
stevens_urls = stevens['url'].tolist()

### stevens_urls also has forecasts for north of stevens-pass; let's try to filter those out back in the dataframe


In [20]:
# Drop the "north of Stevens Pass" entries: keep rows whose url lacks "north".
mentions_north = stevens["url"].str.contains("north")
stevens_true = stevens.loc[~mentions_north]

In [21]:
# Final list of urls to scrape, with the "north" entries removed.
stevens_list = stevens_true['url'].tolist()

In [22]:
# Number of urls that will be scraped.
len(stevens_list)

991

In [32]:
# MongoClient() with no arguments connects using pymongo's defaults.
mc = pymongo.MongoClient()
# Database and collection handles used for the stored forecast pages.
db = mc['avalanche']
forecast_coll = db['stevens_forecasts']


In [None]:
# NOTE(review): commented-out reference copy, never executed here. Presumably
# it mirrors a helper that now lives in scrape_soup; confirm against the module.
# def scrape_url(url, browser=browser, delay=3):
#     """Returns the HTML source from a URL."""
#     browser.get(url)
#     time.sleep(delay)
#     html = browser.page_source
#     return html



In [None]:
# NOTE(review): commented-out reference copy, never executed here. The version
# actually used by this notebook is s_soup.collect_page; whether the two are
# still identical should be confirmed against scrape_soup.
# def collect_page(url, browser=browser, coll=coll, delay=3):
#     """Scrapes and saves the source of a web page."""
#     docs = list(coll.find({'url': url}))
#     if len(docs) == 0:
#         html = scrape_url(browser=browser,
#                           url=url,
#                           delay=delay)
#         coll.insert_one({
#             'url': url,
#             'html': html,
#         })
#     doc = coll.find_one({'url' : url})
#     return doc

In [44]:
# NOTE(review): this opens yet another Chrome window and rebinds `browser`;
# any window opened earlier is left running. Confirm a fresh window is needed.
browser = Chrome()

In [46]:
# Visit every Stevens Pass forecast url through scrape_soup.collect_page.
# NOTE(review): the commented-out copy of collect_page in this notebook stores
# each page's html in MongoDB and skips urls already stored; confirm the
# module version behaves the same way.
for url in stevens_list:
    s_soup.collect_page(url,browser)

In [47]:
# Collections currently present in the 'avalanche' database.
db.list_collection_names()

['forecasts_test', 'stevens_forecasts', 'forecasts']

In [48]:
# `coll` is never defined in this notebook (NameError on a fresh kernel);
# use the collection handle that the notebook actually creates.
# NOTE(review): assumes s_soup.collect_page stores its pages in
# 'stevens_forecasts'; confirm against scrape_soup.
forecast_coll.count_documents({})

991

In [49]:
# Pull every stored forecast document into memory.
# `coll` is never defined in this notebook (NameError on a fresh kernel), so
# read from the `forecast_coll` handle created earlier instead.
# NOTE(review): assumes the scraped pages live in 'stevens_forecasts'; confirm.
# Be aware that this rebinds `stevens` from the earlier url DataFrame to a
# list of MongoDB documents.
stevens = list(forecast_coll.find({}))

In [50]:
# Number of stored forecast documents loaded from MongoDB.
len(stevens)

991

### Now that we have all the desired html pages, BeautifulSoup can be used to grab the forecasts!

In [None]:
# NOTE(review): commented-out reference copy, never executed here. The version
# actually used by this notebook is s_soup.create_danger_info; whether the two
# are still identical should be confirmed against scrape_soup.
# def create_danger_info(page):
#     '''Collects the relevant info from the webpage html'''


#     danger_above_tag = '#treeline-above > div.span4.elev-day1-column > div.danger-description > h4'
#     danger_near_tag ='#treeline-near > div.span4.elev-day1-column > div.danger-description > h4'
#     danger_below_tag = '#treeline-below > div.span4.elev-day1-column > div.danger-description > h4'
#     area_tag = '#main-content > h2'
#     date_tomorrow_tag = '#elevation-levels-header > div.span4.desktop.elevation-day-name > p'
#     soup = BeautifulSoup(page['html'],'lxml')
#     area = soup.select_one(area_tag).text
#     date_tomorrow = soup.select_one(date_tomorrow_tag).text
#     danger_above = soup.select_one(danger_above_tag).text
#     danger_near = soup.select_one(danger_near_tag).text
#     danger_below = soup.select_one(danger_below_tag).text
#     return {
#             'date_tomorrow': date_tomorrow,
#             'area': area,
#             'danger_above_treeline': danger_above,
#             'danger_near_treeline': danger_near,
#             'danger_below_treeline': danger_below,
#             }


In [None]:
# Parse each stored page into one JSON line of the output file.
# This stays best-effort (a page that cannot be parsed is skipped), but the
# failures are now recorded and reported instead of being silently swallowed
# by a bare `except: pass`, which also trapped KeyboardInterrupt and made the
# loop impossible to stop cleanly.
skipped_pages = []
with open('stevens_forecast_danger_test.json','w') as f:
    for page in stevens:
        try:
            # Serialize before writing so a failure never leaves a partial line.
            line = json.dumps(s_soup.create_danger_info(page))
        except Exception as err:
            page_url = page.get('url') if isinstance(page, dict) else None
            skipped_pages.append((page_url, repr(err)))
            continue
        f.write(line + '\n')

# Surface how many pages were dropped; inspect `skipped_pages` for details.
print('skipped {} of {} pages'.format(len(skipped_pages), len(stevens)))

In [4]:
# Load the danger-rating records (one JSON record per line).
# NOTE(review): this reads 'json_archive/stevens_forecast_danger.json', not the
# 'stevens_forecast_danger_test.json' written by the export loop; confirm that
# the archived copy is the intended input.
df = pd.read_json('json_archive/stevens_forecast_danger.json',lines=True)

In [6]:
# (rows, columns) of the loaded danger-rating table.
df.shape

(967, 5)

### Clean up data for nulls and duplicates before finalizing

In [7]:
# Shapes of the raw frame, the frame without null rows, and the frame without
# duplicate rows, in that order.
tuple(frame.shape for frame in (df, df.dropna(), df.drop_duplicates()))

((967, 5), (967, 5), (830, 5))

In [8]:
# Count of missing values in each column.
df.isna().sum()

area                     0
danger_above_treeline    0
danger_below_treeline    0
danger_near_treeline     0
date_tomorrow            0
dtype: int64

In [9]:
# The checks above report duplicate rows (and no nulls), but the frame was
# being saved untouched. Apply the cleanup before finalizing, binding the
# result to a new name so that re-running this cell is idempotent.
df_clean = df.dropna().drop_duplicates().reset_index(drop=True)
df_clean.to_json('stevens_forecast_soup.json')