# Scraper
-----
- Load most recent weather.csv
- Collect list of missing data between start_date and today
- Scrape balloon data for any missing dates
- Clean / normalize new data
- Concatenate new and histrolcal DataFrames
- Update weather.csv
---

In [85]:
import os
import requests
import pandas as pd
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import numpy as np
from math import floor
from pathlib import Path  
from alive_progress import alive_bar
import ipywidgets as widgets
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
from windrose import WindroseAxes
from PIL import Image
from google.cloud import storage

## Load Historical Weather Data
---

In [86]:
data_path = './data/weather.csv'

# If data exists, load it
# ~~~~~~~~~~~~~~~~~~~~~~~
if os.path.exists(data_path):
    weather_data = pd.read_csv(data_path)
    print(weather_data.head())
else:
    print("No historical data exists")

         Date     Station  Pressure  Height  Temp  DewPoint  \
0  2020-01-01  Quillayute       700  2969.0  -0.9      -0.9   
1  2020-01-01  Quillayute       850  1393.0   6.2       6.2   
2  2020-01-01      Vernon       700  2880.0  -5.5      -7.0   
3  2020-01-01      Vernon       850  1357.0  -3.9      -5.8   
4  2020-01-01  Port Hardy       700  2856.0  -4.3      -5.8   

   Relative_Humidity  Mean_Mixed_Layer  Wind_Direction  Wind_Speed  \
0              100.0              5.14           255.0     100.008   
1              100.0              7.04           260.0      81.488   
2               89.0              3.25           260.0      94.452   
3               87.0              2.93           170.0      18.520   
4               89.0              3.56           250.0      87.044   

   Potential_Temp  Equivalent_Potential_Temp  Virtual_Potential_Temp  
0           301.5                      317.2                   302.4  
1           292.6                      313.1              

## Create Range of Missing Dates
---
Extension: Check any dates that are incomplete
- Missing one location but not all
- Missing values

In [87]:
dates = []
delta = timedelta(days=1)
start_date = date(2020, 1, 1)
end_date = datetime.today().date()

# Create list of all dates between start_date and today
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
while start_date <= end_date:
    # add current date to list by converting  it to iso format
    dates.append(start_date.isoformat())
    # increment start date by timedelta
    start_date += delta

In [88]:
# Display start and end of date range
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for x in range(5):
    print(dates[x])
print('...')   
for x in range(5):
    print(dates[-x])

2020-01-01
2020-01-02
2020-01-03
2020-01-04
2020-01-05
...
2020-01-01
2023-08-15
2023-08-14
2023-08-13
2023-08-12


In [89]:
removed = []
# Remove dates from list that are alrady populated in weather_data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for d in weather_data['Date'].unique().tolist():
    if d in dates:
        dates.remove(d)
        removed.append(d)

### Extension
would it also be worth checking for entries with missing values?

In [90]:
# Display start and end of date range
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for d in removed:
    print(d)

2020-01-01
2020-01-02
2020-01-03
2020-01-04
2020-01-05
2020-01-06
2020-01-07
2020-01-08
2020-01-09
2020-01-10
2020-01-11
2020-01-12
2020-01-13
2020-01-14
2020-01-15
2020-01-16
2020-01-17
2020-01-18
2020-01-19
2020-01-20
2020-01-21
2020-01-22
2020-01-23
2020-01-24
2020-01-25
2020-01-26
2020-01-27
2020-01-28
2020-01-29
2020-01-30
2020-01-31
2020-02-01
2020-02-02
2020-02-03
2020-02-04
2020-02-05
2020-02-06
2020-02-07
2020-02-08
2020-02-09
2020-02-10
2020-02-11
2020-02-12
2020-02-13
2020-02-14
2020-02-15
2020-02-16
2020-02-17
2020-02-18
2020-02-19
2020-02-20
2020-02-21
2020-02-22
2020-02-23
2020-02-24
2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-02-29
2020-03-01
2020-03-02
2020-03-03
2020-03-04
2020-03-05
2020-03-06
2020-03-07
2020-03-08
2020-03-09
2020-03-10
2020-03-11
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31

## Generate URLs to be Scraped
---

In [91]:
base_url = "http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST"

stations = [
    {"id": 72797, "name": "Quillayute"},
    {"id": 73033, "name": "Vernon"},
    {"id": 71109, "name": "Port Hardy"}]

In [92]:
for d in dates:
    print(d)

2023-08-15


In [93]:
# Create list of URLs for each station
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
urls = []
dates_idx = []
station_idx = []
for d in dates:
    d = d.split('-')
    for station in stations:
        url = base_url
        url += "&YEAR={}".format(d[0])
        url += "&MONTH={}".format(d[1])
        #url += "&FROM=" + str(now[1]) + str(now[2])
        #url += "&TO=" + str(now[1]) + str(now[2])
        url += "&FROM={:0>2d}00".format(int(d[2]))
        url += "&TO={:0>2d}23".format(int(d[2]))
        url += "&STNM={}".format(station["id"])
        url += "&REPLOT=1"
        urls.append(url)
        
        
        dates_idx.append("%02d-%02d-%02d" % (int(d[0]), int(d[1]), int(d[2])))
        station_idx.append(station["name"])


In [94]:
for url in urls:
    print(url)

http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2023&MONTH=08&FROM=1500&TO=1523&STNM=72797&REPLOT=1
http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2023&MONTH=08&FROM=1500&TO=1523&STNM=73033&REPLOT=1
http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2023&MONTH=08&FROM=1500&TO=1523&STNM=71109&REPLOT=1


## Extract data from list of URLs
---
- Iterate through URLs
- Extract 700 and 850 data items
- Append data to array

In [95]:
data = []
reading_idx = []
problem_urls = []

print("Extracting data from URLs: ")
with alive_bar(len(urls), force_tty=True) as bar:
    for i, url in enumerate(urls):
        search = 0
        while search >= 0:
            # Scrape site data
            # ~~~~~~~~~~~~~~~~
            search += 1
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            results = str(soup.find('pre'))
            sevhundy = results[results.find("700"):].split()[:11]
            eightfiddy = results[results.find("850"):].split()[:11]
            
            # Catch erroneous reads
            # Site can hit too many requests
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            if 'e' not in sevhundy[0] and 'e' not in eightfiddy[0]:
                search = -1
            elif search == 10:
                print(url)
                problem_urls.append(url)
                search = -1

        # Extract 700 first // 850 second
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        data.append(sevhundy)
        data[-1].insert(0, dates_idx[i])
        data[-1].insert(1, station_idx[i])
        
        data.append(eightfiddy)
        data[-1].insert(0, dates_idx[i])
        data[-1].insert(1, station_idx[i])
        
        bar()
        

print("\nComplete")

Extracting data from URLs: 
|████████████████████████████████████████| 3/3 [100%] in 9.3s (0.25/s)                                                  

Complete


In [105]:
for row in data:
    print(row)

['2023-08-15', 'Quillayute', 700, '3230', '12.6', '-13.4', '15', '1.95', '20', '6', '316.4', '323.1', '316.8']
['2023-08-15', 'Quillayute', 850, '1568', '25.2', '-2.8', '16', '3.68', '0', '15', '312.5', '324.5', '313.2']
['2023-08-15', 'Vernon', 700, '3224', '10.2', '-4.8', '34', '3.84', '300', '15', '313.8', '326.3', '314.5']
['2023-08-15', 'Vernon', 850, '1569', '24.2', '6.2', '31', '7.04', '355', '7', '311.5', '333.6', '312.8']
['2023-08-15', 'Port Hardy', 700, '3228', '8.0', '-1.0', '53', '5.11', '335', '25', '311.3', '327.6', '312.3']
['2023-08-15', 'Port Hardy', 850, '1595', '16.8', '8.8', '59', '8.43', '355', '20', '303.7', '329.2', '305.3']


In [97]:
print("Problem URLs:")
for url in problem_urls:
    print(url)

Problem URLs:


## Clean the Extracted Data
---

In [98]:
# Remove rogue "e" values from pressure field
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pressures = [700, 850]
ctr = 0
for item in data:
    if(not str(item[2]).isnumeric()):
        item[2] = pressures[ctr%2]
    ctr += 1

In [99]:
cols = ["Date", 
        "Station", 
        "Pressure", 
        "Height", 
        "Temp", 
        "DewPoint", 
        "Relative_Humidity", 
        "Mean_Mixed_Layer", 
        "Wind_Direction", 
        "Wind_Speed", 
        "Potential_Temp",
        "Equivalent_Potential_Temp",
        "Virtual_Potential_Temp"]

In [100]:
new_data = pd.DataFrame(data, columns=cols)

# Convert DataFrame to numeric values
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for col in cols[2:]:
    try:
        new_data[col] = pd.to_numeric(new_data[col])
    except:
        print(col)
    
# Convert wind speed from Knots to KM/H
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
new_data['Wind_Speed'] = new_data['Wind_Speed'].multiply(1.852)
new_data.head(10)

Unnamed: 0,Date,Station,Pressure,Height,Temp,DewPoint,Relative_Humidity,Mean_Mixed_Layer,Wind_Direction,Wind_Speed,Potential_Temp,Equivalent_Potential_Temp,Virtual_Potential_Temp
0,2023-08-15,Quillayute,700,3230,12.6,-13.4,15,1.95,20,11.112,316.4,323.1,316.8
1,2023-08-15,Quillayute,850,1568,25.2,-2.8,16,3.68,0,27.78,312.5,324.5,313.2
2,2023-08-15,Vernon,700,3224,10.2,-4.8,34,3.84,300,27.78,313.8,326.3,314.5
3,2023-08-15,Vernon,850,1569,24.2,6.2,31,7.04,355,12.964,311.5,333.6,312.8
4,2023-08-15,Port Hardy,700,3228,8.0,-1.0,53,5.11,335,46.3,311.3,327.6,312.3
5,2023-08-15,Port Hardy,850,1595,16.8,8.8,59,8.43,355,37.04,303.7,329.2,305.3


In [101]:
new_data.tail()

Unnamed: 0,Date,Station,Pressure,Height,Temp,DewPoint,Relative_Humidity,Mean_Mixed_Layer,Wind_Direction,Wind_Speed,Potential_Temp,Equivalent_Potential_Temp,Virtual_Potential_Temp
1,2023-08-15,Quillayute,850,1568,25.2,-2.8,16,3.68,0,27.78,312.5,324.5,313.2
2,2023-08-15,Vernon,700,3224,10.2,-4.8,34,3.84,300,27.78,313.8,326.3,314.5
3,2023-08-15,Vernon,850,1569,24.2,6.2,31,7.04,355,12.964,311.5,333.6,312.8
4,2023-08-15,Port Hardy,700,3228,8.0,-1.0,53,5.11,335,46.3,311.3,327.6,312.3
5,2023-08-15,Port Hardy,850,1595,16.8,8.8,59,8.43,355,37.04,303.7,329.2,305.3


## Combine New and Old Data

In [102]:
complete_data = pd.concat([weather_data, new_data])
complete_data = complete_data.sort_values(by=['Date'])
#complete_data = complete_data.drop('Unnamed: 0', axis=1)
complete_data.head(20)

Unnamed: 0,Date,Station,Pressure,Height,Temp,DewPoint,Relative_Humidity,Mean_Mixed_Layer,Wind_Direction,Wind_Speed,Potential_Temp,Equivalent_Potential_Temp,Virtual_Potential_Temp
0,2020-01-01,Quillayute,700,2969.0,-0.9,-0.9,100.0,5.14,255.0,100.008,301.5,317.2,302.4
1,2020-01-01,Quillayute,850,1393.0,6.2,6.2,100.0,7.04,260.0,81.488,292.6,313.1,293.9
2,2020-01-01,Vernon,700,2880.0,-5.5,-7.0,89.0,3.25,260.0,94.452,296.4,306.3,296.9
3,2020-01-01,Vernon,850,1357.0,-3.9,-5.8,87.0,2.93,170.0,18.52,282.1,290.5,282.6
4,2020-01-01,Port Hardy,700,2856.0,-4.3,-5.8,89.0,3.56,250.0,87.044,297.7,308.6,298.3
5,2020-01-01,Port Hardy,850,1301.0,3.4,1.9,90.0,5.19,250.0,59.264,289.7,304.8,290.6
6,2020-01-02,Quillayute,700,2946.0,-10.7,-23.7,34.0,0.81,260.0,50.004,290.6,293.2,290.8
7,2020-01-02,Quillayute,850,1426.0,-1.3,-1.6,98.0,4.02,280.0,42.596,284.8,296.4,285.5
8,2020-01-02,Vernon,700,2882.0,-9.5,-12.3,80.0,2.14,305.0,31.484,291.9,298.5,292.3
9,2020-01-02,Vernon,850,1351.0,2.2,-4.8,60.0,3.16,245.0,33.336,288.4,297.8,289.0


In [103]:
complete_data.tail(20)

Unnamed: 0,Date,Station,Pressure,Height,Temp,DewPoint,Relative_Humidity,Mean_Mixed_Layer,Wind_Direction,Wind_Speed,Potential_Temp,Equivalent_Potential_Temp,Virtual_Potential_Temp
7914,2023-08-12,Port Hardy,850,1524.0,8.8,8.8,100.0,8.43,245.0,37.04,295.4,319.9,296.9
7915,2023-08-12,Port Hardy,700,3118.0,1.4,1.4,100.0,6.09,255.0,50.004,304.0,322.6,305.1
7920,2023-08-13,Quillayute,700,3211.0,7.0,1.0,66.0,5.91,5.0,35.188,310.2,328.8,311.3
7921,2023-08-13,Quillayute,850,1586.0,15.4,-9.6,17.0,2.18,5.0,20.372,302.3,309.3,302.7
7922,2023-08-13,Vernon,700,3193.0,3.0,-1.6,72.0,4.89,330.0,27.78,305.8,321.0,306.7
7923,2023-08-13,Vernon,850,1580.0,16.8,4.8,45.0,6.38,345.0,20.372,303.7,323.2,304.9
7924,2023-08-13,Port Hardy,700,3221.0,7.4,1.4,66.0,6.09,310.0,31.484,310.6,329.8,311.8
7925,2023-08-13,Port Hardy,850,1602.0,13.4,9.7,78.0,8.96,0.0,20.372,300.2,326.8,301.8
7926,2023-08-14,Port Hardy,700,3243.0,8.8,-11.2,23.0,2.33,310.0,37.04,312.2,320.0,312.6
7927,2023-08-14,Quillayute,700,3237.0,11.6,-4.4,32.0,3.96,60.0,11.112,315.3,328.3,316.1


## Additional Cleaning on ALL Data
---

In [104]:
# Remove values that are exceptionally high or low
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for col in cols[2:]:
    q_low = complete_data[col].quantile(0.01)
    q_hi = complete_data[col].quantile(0.99)
    
    complete_clean_data = complete_data[(complete_data[col] < q_hi) & (complete_data[col] > q_low)]

## Export Clean Data to CSV
---

In [67]:
# Save backup before overwriting weather.csv
filepath = Path('./data/backups/weather-{}.csv'.format(datetime.today().date()))
filepath.parent.mkdir(parents=True, exist_ok=True) 
weather_data.to_csv(filepath, index=False)

In [68]:
# Overwrite existing weather.csv with most recent data
filepath = Path('./data/weather.csv')
filepath.parent.mkdir(parents=True, exist_ok=True) 
complete_data.to_csv(filepath, index=False)

In [69]:
# Overwrite existing weather.csv with most recent data
filepath = Path('./data/weather_clean.csv')
filepath.parent.mkdir(parents=True, exist_ok=True) 
complete_clean_data.to_csv(filepath, index=False)

## Publish Data to Google Cloud
---

In [75]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS']='../service_account.json'

In [76]:
storage_client = storage.Client()

In [78]:
bucket_name = 'weather_aurorabc'
source_file_name = './data/weather_clean.csv'
bucket = storage_client.bucket(bucket_name)

### Update weather.csv with most recent data

In [79]:
blob = bucket.blob('weather.csv')
blob.upload_from_filename(source_file_name)

### Save most recent data as a backup

In [84]:
blob = bucket.blob('backups/weather-{}.csv'.format(datetime.today().date()))
blob.upload_from_filename(source_file_name)

In [124]:
cols = ["Date", 
    "Station", 
    "Pressure", 
    "Height", 
    "Temp", 
    "DewPoint", 
    "Relative_Humidity", 
    "Mean_Mixed_Layer", 
    "Wind_Direction", 
    "Wind_Speed", 
    "Potential_Temp",
    "Equivalent_Potential_Temp",
    "Virtual_Potential_Temp"]


In [125]:
data = [['2020-03-30', 'Port Hardy', '>'],['2020-03-30', 'ABC', '>'],['2020-03-30', 'DEF', '>']]

In [126]:
print(data)

[['2020-03-30', 'Port Hardy', '>'], ['2020-03-30', 'ABC', '>'], ['2020-03-30', 'DEF', '>']]


In [132]:
data = [row for row in data if len(row) >= len(cols)]

In [133]:
print(data)

[]
