# Scraper
-----
- Extracts data from weather balloon readings
- Transforms data into standard format
- Displays data on easily understandable graphs
---

In [130]:
import os
import requests
import pandas as pd
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import numpy as np
from math import floor
from pathlib import Path  
from alive_progress import alive_bar
import ipywidgets as widgets
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
from windrose import WindroseAxes
from PIL import Image

# [year, month, day] of the current local date, taken from a single clock read
# so the three parts can never straddle midnight and disagree with each other.
now = list(datetime.now().timetuple()[:3])

## Define baseline params
---
- URL to be used
- Columns to be extracted
- Weather stations to examine
- Current date/time
---

In [132]:
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    """Build a new colormap from the ``[minval, maxval]`` slice of ``cmap``.

    Used to remove white / black (or other) bookended values.

    Parameters
    ----------
    cmap : colormap
        Source colormap; it is called with an array of positions and its
        ``name`` attribute is used in the new colormap's name.
    minval, maxval : float
        First and last positions sampled from ``cmap``.
    n : int
        Number of evenly spaced samples taken between ``minval`` and ``maxval``.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        Colormap built from the sampled colours.
    """
    label = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
    sampled_colours = cmap(np.linspace(minval, maxval, n))
    return colors.LinearSegmentedColormap.from_list(label, sampled_colours)

#### Scraper Details

In [133]:
# Scraper configuration:
#   base_url - sounding query endpoint; date and station parameters are appended later
#   cols     - column names for the output table (two label columns, then the readings)
scraper_details = dict(
    base_url="http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST",
    cols=[
        "Date",
        "Station",
        "Pressure",
        "Height",
        "Temp",
        "DewPoint",
        "Relative_Humidity",
        "Mean_Mixed_Layer",
        "Wind_Direction",
        "Wind_Speed",
        "Potential_Temp",
        "Equivalent_Potential_Temp",
        "Virtual_Potential_Temp",
    ],
)

#### Graph details

In [134]:
# Per-graph styling and time ranges.
# All ranges are negative timedeltas (presumably look-back offsets from "now").
_paired_colours = sns.color_palette("Paired", n_colors=6)

graph_details = {
    'temperature': {
        # Six "Paired" colours with the two entries of each adjacent pair swapped
        'palette': [_paired_colours[idx] for idx in (1, 0, 3, 2, 5, 4)],
        'ranges': [
            timedelta(hours=-1),
            timedelta(hours=-24),
            timedelta(days=-3),
            timedelta(days=-7),
            timedelta(days=-30),
            timedelta(days=-90),
            timedelta(days=-180),
        ],
    },
    'wind': {
        # Reversed viridis with the top 10% of the scale cut off
        'palette': truncate_colormap(cm.viridis_r, minval=0, maxval=0.9),
        'ranges': [
            timedelta(hours=-12),
            timedelta(hours=-24),
            timedelta(days=-3),
            timedelta(days=-7),
        ],
    },
}

#### Balloon locations

In [135]:
# Weather balloon stations to scrape.
# Each station gets its own consecutive pair of colours from the temperature
# palette: entries [0:2], [2:4] and [4:6] in the order listed here.
_temperature_palette = graph_details['temperature']['palette']

locations = [
    {
        'id': station_id,
        'name': station_name,
        'temp_palette': _temperature_palette[2 * position:2 * position + 2],
    }
    for position, (station_id, station_name) in enumerate([
        (72797, 'Quillayute'),
        (73033, 'Vernon'),
        (71109, 'Port Hardy'),
    ])
]

### Generate URLs to be scraped
---

In [136]:
# Build the query URL for each station.
# The date parts come from `now` (the same snapshot used to stamp the scraped
# rows) rather than from fresh datetime.now() calls, so the day that is
# requested always matches the day the rows are labelled with.
# FROM / TO are "<day><hour>" pairs covering hours 00 through 23 of that day.
for location in locations:
    location['url'] = '{}&YEAR={}&MONTH={}&FROM={:0>2d}00&TO={:0>2d}23&STNM={}&REPLOT=1'.format(
        scraper_details['base_url'],
        now[0],
        now[1],
        now[2],
        now[2],
        location['id'],
    )

for location in locations:
    print(location['url'])
    

http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2023&MONTH=6&FROM=1900&TO=1923&STNM=72797&REPLOT=1
http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2023&MONTH=6&FROM=1900&TO=1923&STNM=73033&REPLOT=1
http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2023&MONTH=6&FROM=1900&TO=1923&STNM=71109&REPLOT=1


## Scrape new data from each location
---

In [110]:
# Scrape the 700 and 850 pressure-level rows for every station and store them
# on the station dict as location['data'] = [<700 row>, <850 row>].
#
# A read is retried up to MAX_ATTEMPTS times because the site can answer with
# an error page (e.g. too many requests). URLs that never produce a clean read
# are printed and collected in `problem_urls`; their last (bad) read is still
# stored so downstream cells see one entry per pressure level.
MAX_ATTEMPTS = 10
problem_urls = []
date_stamp = '%02d-%02d-%02d' % (now[0], now[1], now[2])

for location in locations:
    url = location['url']
    data = []

    for attempt in range(MAX_ATTEMPTS):
        # scrape site data
        # ~~~~~~~~~~~~~~~~
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        results = str(soup.find('pre'))
        # Take the 11 whitespace-separated fields starting at each pressure value
        sevhundy = results[results.find("700.0"):].split()[:11]
        eightfiddy = results[results.find("850.0"):].split()[:11]

        # catch erroneous reads
        # site can hit too many requests
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # When no <pre> block is returned, str(None) is "None"; find() then
        # yields -1 and the slice collapses to the trailing "e", which is what
        # this check detects.
        if 'e' not in sevhundy[0] and 'e' not in eightfiddy[0]:
            break
    else:
        # Every attempt failed: record the URL instead of retrying forever
        print(url)
        problem_urls.append(url)

    # we want 700 first // 850 second
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for level_row in (sevhundy, eightfiddy):
        level_row.insert(0, date_stamp)
        level_row.insert(1, location['name'])
        data.append(level_row)

    location['data'] = data

In [111]:
# Show the scraped rows of each station, one station per output line
for location in locations:
    station_rows = location['data']
    print(station_rows)

[['2023-06-06', 'Quillayute', '700.0', '3125', '2.8', '-22.2', '14', '0.93', '305', '15', '305.6', '308.7', '305.7'], ['2023-06-06', 'Quillayute', '850.0', '1537', '10.0', '-17.0', '13', '1.19', '15', '15', '296.6', '300.5', '296.8']]
[['2023-06-06', 'Vernon', '700.0', '3121', '0.0', '-23.0', '16', '0.86', '275', '12', '302.4', '305.4', '302.6'], ['2023-06-06', 'Vernon', '850.0', '1544', '10.8', '-4.2', '35', '3.31', '15', '5', '297.4', '307.6', '298.0']]
[['2023-06-06', 'Port Hardy', '700.0', '3115', '1.0', '-42.0', '2', '0.14', '295', '9', '303.6', '304.1', '303.6'], ['2023-06-06', 'Port Hardy', '850.0', '1529', '9.8', '-14.2', '17', '1.51', '350', '9', '296.4', '301.2', '296.7']]


In [112]:
# Remove rogue "e" values from pressure field
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Rows are stored per station as [700 row, 850 row], so the row's position
# within its station tells us which pressure it should carry. Any pressure
# that is not 700/850, or that cannot be parsed as a number at all, is
# replaced with the expected value for that position.
pressures = [700, 850]
for location in locations:
    for level_idx, row in enumerate(location['data']):
        expected_pressure = pressures[level_idx % len(pressures)]
        try:
            if int(float(row[2])) not in pressures:
                row[2] = expected_pressure
        except (TypeError, ValueError, OverflowError) as e:
            # Unparseable pressure (the "rogue" case): report it and repair it
            print(e)
            row[2] = expected_pressure
        except IndexError as e:
            # Row is too short to even have a pressure field; nothing to repair
            print(e)

        print(row)

['2023-06-06', 'Quillayute', '700.0', '3125', '2.8', '-22.2', '14', '0.93', '305', '15', '305.6', '308.7', '305.7']
['2023-06-06', 'Quillayute', '850.0', '1537', '10.0', '-17.0', '13', '1.19', '15', '15', '296.6', '300.5', '296.8']
['2023-06-06', 'Vernon', '700.0', '3121', '0.0', '-23.0', '16', '0.86', '275', '12', '302.4', '305.4', '302.6']
['2023-06-06', 'Vernon', '850.0', '1544', '10.8', '-4.2', '35', '3.31', '15', '5', '297.4', '307.6', '298.0']
['2023-06-06', 'Port Hardy', '700.0', '3115', '1.0', '-42.0', '2', '0.14', '295', '9', '303.6', '304.1', '303.6']
['2023-06-06', 'Port Hardy', '850.0', '1529', '9.8', '-14.2', '17', '1.51', '350', '9', '296.4', '301.2', '296.7']


### Convert to Pandas DataFrame

In [115]:
# Flatten the scraped rows of every station (however many there are) into one table
all_rows = [row for location in locations for row in location['data']]
new_data = pd.DataFrame(all_rows, columns=scraper_details['cols'])

# Convert DataFrame to numeric values
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The first two columns (Date, Station) are labels and stay as text.
# A column that cannot be converted is reported by name and left unchanged.
for col in scraper_details['cols'][2:]:
    try:
        new_data[col] = pd.to_numeric(new_data[col])
    except (ValueError, TypeError):
        print(col)

# Convert wind speed from Knots to KM/H
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
new_data['Wind_Speed'] = new_data['Wind_Speed'].multiply(1.852)
new_data.head()


Unnamed: 0,Date,Station,Pressure,Height,Temp,DewPoint,Relative_Humidity,Mean_Mixed_Layer,Wind_Direction,Wind_Speed,Potential_Temp,Equivalent_Potential_Temp,Virtual_Potential_Temp
0,2023-06-06,Quillayute,700.0,3125,2.8,-22.2,14,0.93,305,27.78,305.6,308.7,305.7
1,2023-06-06,Quillayute,850.0,1537,10.0,-17.0,13,1.19,15,27.78,296.6,300.5,296.8
2,2023-06-06,Vernon,700.0,3121,0.0,-23.0,16,0.86,275,22.224,302.4,305.4,302.6
3,2023-06-06,Vernon,850.0,1544,10.8,-4.2,35,3.31,15,9.26,297.4,307.6,298.0
4,2023-06-06,Port Hardy,700.0,3115,1.0,-42.0,2,0.14,295,16.668,303.6,304.1,303.6


## Read season data
---

In [126]:
# Calculate filepath for current season data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A season is labelled "<start year>-<end year>". Per the month check below,
# months before August belong to the season that started the previous year.
# The clock is read once so year and month always come from the same instant.
today = datetime.now()
if today.month < 8:
    this_season = '{}-{}'.format(today.year - 1, today.year)
else:
    this_season = '{}-{}'.format(today.year, today.year + 1)

# NOTE: the path is relative, so it resolves against the kernel's current
# working directory.
data_path = './data/balloon-data-{}.csv'.format(this_season)
filepath = Path(data_path)

# If there is no file for this season, create a new file from today's data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if not filepath.exists():
    filepath.parent.mkdir(parents=True, exist_ok=True)
    new_data.to_csv(filepath)

# Load the season file in both cases so `season_data` is always defined
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TODO: today's rows are not yet merged into an existing season file;
# the concat-and-overwrite step still has to be written.
season_data = pd.read_csv(filepath)
season_data.head()
    


In [129]:
# Why isn't this code finding the current file?
# Do we want to keep the old file for backup purposes?

# First search for current file
# Look for missing data dates between today and start_date
# Scrape data for missing data only
# Append missing data to DataFrame
# Save updated DF to CSV