# Scraper 3.1
---
Goal of 3.1 is to include both AM and PM reading from ballon data
- Import Dataset from GoogleCloudStorage
- Find Missing Values in Dataset
- Get New Data from Missing Values
- Standardize DataTypes within New Data
- Combine New Data with Old Data
- Patch Missing Values within Complete Dataset
- Upload Complete dataset to GoogleCloudStorage

In [25]:
import base64
import os, io
import requests
import pandas as pd
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import numpy as np
from math import floor
from pathlib import Path  
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
from windrose import WindroseAxes
from PIL import Image
from google.cloud import storage
import google.cloud.logging
import logging
from flask import Response

In [26]:
from alive_progress import alive_bar

In [27]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../service_account.json'

In [28]:
# -------------------------------------------------------------------------------
# LOGGING SETUP
# -------------------------------------------------------------------------------
client = google.cloud.logging.Client()
client.setup_logging()

In [29]:
# -------------------------------------------------------------------------------
# CREATE DATERANGE
# -------------------------------------------------------------------------------
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

# Import Dataset
---
Import existing weather data from GoogleCloudStorage as Pandas Dataframe

In [30]:
# -------------------------------------------------------------------------------
# IMPORT DATASET
# Function to import all weather data
# Data Patching completed in this step
# -------------------------------------------------------------------------------
def import_weather_data():
    # Connect to Google Cloud Storage
    # -------------------------------
    storage_client = storage.Client()

    # Open Bucket
    # -----------
    bucket_name = 'weather_aurorabc'
    bucket = storage_client.bucket(bucket_name)

    # Open Blob
    # ---------
    blob_name = 'weather_ampm.csv'
    #blob_name = 'backups/weather-2023-09-26.csv'
    blob = bucket.blob(blob_name)

    # Read Weather Data from Blob
    # ---------------------------
    with blob.open("r") as f:
        weather_data = pd.read_csv(f)

    # Cast all data variables to float
    weather_data = weather_data.astype(
        {'Height':'float',
        'Temp':'float',
        'DewPoint':'float',
        'Relative_Humidity':'float',
        'Mean_Mixed_Layer':'float',
        'Wind_Direction':'float',
        'Wind_Speed':'float',
        'Potential_Temp':'float',
        'Equivalent_Potential_Temp':'float',
        'Virtual_Potential_Temp':'float',
        })

    return weather_data

# Find Missing Values
---
Look through existing dataset and create list of missing data values

In [277]:
# -------------------------------------------------------------------------------
# FIND MISSING VALS
# Function to find incomplete or missing vals within a dataset
# -------------------------------------------------------------------------------
def find_missing_vals(weather_data):
    missing_data = []
    stations = [
        {"id": 72797, "name": "Quillayute"},
        {"id": 73033, "name": "Vernon"},
        {"id": 71109, "name": "Port Hardy"}]
    pressures = [700, 850]
    start_date = date(2020, 1, 1)
    end_date = datetime.today().date()
    dates = daterange(start_date, end_date)
    
    # Loop through all dates from Jan 1st 2020 until today
    # For each date in the range, find the station / pressure / time combinations that are missing
    for i, d in enumerate(dates):
        missing_data.append({
            'date': str(d),
            'stations': []
        })
        for j, s in enumerate(stations):
            # Create empty array of pressures for each station at this date value
            for p in pressures:
                am = weather_data.loc[(weather_data['Date'] == str(d)) & (weather_data['Station'] == str(s['name'])) & (weather_data['Pressure'] == int(p)) & (weather_data['Time'] == 'AM')]
                pm = weather_data.loc[(weather_data['Date'] == str(d)) & (weather_data['Station'] == str(s['name'])) & (weather_data['Pressure'] == int(p)) & (weather_data['Time'] == 'PM')]
                
                
                # If there is no record in the dataset for this date...
                if am.empty or pm.empty:
                    missing_data[i]['stations'].append({'id': s['id'], 'name': s['name'], 'pressures': []}) 
                    missing_data[i]['stations'][-1]['pressures'].append(str(p))
       

                    continue
                
                # If there are null values for Temp or WindSpeed...
                if not am['Temp'].any() or not am['Wind_Speed'].any() or not pm['Temp'].any() or not pm['Wind_Speed'].any():
                    missing_data[i]['stations'].append({'id': s['id'], 'name': s['name'], 'pressures': []}) 
                    missing_data[i]['stations'][-1]['pressures'].append(str(p))
      
    
    
    # Remove unecessary rows where no new data is required
    missing_data = [x for x in missing_data if x['stations']]

    return missing_data

# Scrape Data
---
Using a list of incomplete data, generate a list of URLs and scrape. 

Return new Dataset in the format of a Pandas Dataframe

In [249]:
def scrape_data(missing_vals):
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # GENERATE LIST OF URLS TO BE SCRAPED
    print("Creating list of URLs to be scraped")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    base_url = "http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST"
    times = {'AM': [0, 11], 'PM': [12, 23]}
    
    '''
    x = {
        'date': '2020-01-01', 
        'stations': [
            {'id': 72797, 'name': 'Quillayute', 'pressures': ['700']}, 
            {'id': 73033, 'name': 'Vernon', 'pressures': ['850']}, 
            {'id': 71109, 'name': 'Port Hardy', 'pressures': []}]
        }
    '''
    
    print("Creating dates list...")

    # Create list of URLs for each station
    # ------------------------------------
    urls = []
    dates_idx = []
    station_idx = []
    for val in missing_vals:
        d = val['date'].split('-')
        for station in val['stations']:
            for t, time in times.items():
                url = base_url
                url += "&YEAR={}".format(d[0])
                url += "&MONTH={}".format(d[1])
                url += "&FROM={:0>2d}{:0>2d}".format(int(d[2]), time[0])
                url += "&TO={:0>2d}{:0>2d}".format(int(d[2]), time[1])
                url += "&STNM={}".format(station["id"])
                url += "&REPLOT=1"
                urls.append(url)

                dates_idx.append("%02d-%02d-%02d" % (int(d[0]), int(d[1]), int(d[2])))
                station_idx.append(station["name"])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # EXTRACT DATA FROM URLS
    print("Extracting data from list of URLs")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    #print("Scraping URLS...")
    #print(urls)


    data = []
    ampm = 0
    problem_urls = []
    with alive_bar(len(urls), force_tty=True) as bar:
        for i, url in enumerate(urls):
            search = 0
            while search >= 0:
                # Scrape site data
                # ~~~~~~~~~~~~~~~~
                search += 1
                page = requests.get(url)
                soup = BeautifulSoup(page.content, "html.parser")
                results = str(soup.find('pre'))
                sevhundy = results[results.find("700"):].split()[:11]
                eightfiddy = results[results.find("850"):].split()[:11]

                # Catch erroneous reads
                # Site can hit too many requests
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                if 'e' not in sevhundy[0] and 'e' not in eightfiddy[0]:
                    search = -1
                # Try 3 times then move on
                elif search == 3:
                    print("Could not extract data from: {}".format(url))
                    problem_urls.append(url)
                    search = -1

            # Extract 700 first // 850 second
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            data.append(sevhundy)
            data[-1].insert(0, dates_idx[i])
            data[-1].insert(1, station_idx[i])
            data[-1].insert(2, list(times.keys())[ampm%2])

            data.append(eightfiddy)
            data[-1].insert(0, dates_idx[i])
            data[-1].insert(1, station_idx[i])
            data[-1].insert(2, list(times.keys())[ampm%2])
            
            ampm += 1
                        
            bar()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # CLEAN EXTRACTED DATA
    print("Cleaning extracted data")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # Define data cols
    # ----------------
    cols = [
        "Date", 
        "Station", 
        "Time",
        "Pressure", 
        "Height", 
        "Temp", 
        "DewPoint", 
        "Relative_Humidity", 
        "Mean_Mixed_Layer", 
        "Wind_Direction", 
        "Wind_Speed", 
        "Potential_Temp",
        "Equivalent_Potential_Temp",
        "Virtual_Potential_Temp"]

    # Remove incomplete rows from dataset
    # -----------------------------------
    data = [row for row in data if len(row) >= len(cols)]

    # Remove rogue "e" values from pressure field
    # -------------------------------------------
    pressures = [700, 850]
    ctr = 0
    for item in data:
        if(not str(item[3]).isnumeric()):
            item[3] = pressures[ctr%2]
        ctr += 1

    # Convert data to Pandas DataFrame
    # --------------------------------
    new_data = pd.DataFrame(data, columns=cols)
    return new_data

# Standardize Datatypes
---

In [215]:
def standardize_dtypes(data):
    return data.astype( dtype={
                'Date' : str, 
                'Station': str,
                'Time': str,
                'Pressure': int,
                'Height': float,
                'Temp': float,
                'DewPoint': float,
                'Relative_Humidity': float,
                'Mean_Mixed_Layer': float,
                'Wind_Direction': float,
                'Wind_Speed': float,
                'Potential_Temp': float,
                'Equivalent_Potential_Temp': float,
                'Virtual_Potential_Temp': float,
                                      })

# Patch Dataset
---
Populate missing or incomplete data entries with NaN values

In [216]:
# -------------------------------------------------------------------------------
# PATCH DATASET
# Function to create empty entries for missing data
# This results in a data entries for each expected date/location/pressure reading
# -------------------------------------------------------------------------------
def patch_dataset(data, start_date=date(2020, 1, 1), end_date=datetime.today().date()):

    dates = daterange(start_date, end_date)


    stations = ['Quillayute', 'Vernon', 'Port Hardy']
    times = {'AM': [0, 11], 'PM': [12, 23]}
    pressures = [700, 850]
    idx = len(data)
    # Itterate through date range
    for date in dates:
        for station in stations:
            for t, time in times.items():
                for pressure in pressures:
                    # If data is missing, add empty values
                    if data[(data['Date'] == date.strftime("%Y-%m-%d")) & (data['Station'] == station) & (data['Time'] == t) & (data['Pressure'] == pressure)].empty:
                        #print("Missing data for {}:\n\tDate:{}\n\tPressure:{}\n".format(station, date.strftime("%Y-%m-%d"), pressure))
                        new_row = {
                            'Date': date.strftime("%Y-%m-%d"), 
                            'Station': station, 
                            'Time': t,
                            'Pressure': pressure, 
                            'Height': np.nan,
                            'Temp': np.nan,
                            'DewPoint': np.nan,
                            'Relative_Humidity': np.nan,
                            'Mean_Mixed_Layer': np.nan,
                            'Wind_Direction': np.nan,
                            'Wind_Speed': np.nan,
                            'Potential_Temp': np.nan,
                            'Equivalent_Potential_Temp': np.nan,
                            'Virtual_Potential_Temp': np.nan}

                        #data.loc[idx] = new_row
                        data.loc[idx] = [date.strftime("%Y-%m-%d"), station, t, pressure, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
                        idx += 1
                        #data.loc[-1] = new_row

    # Return complete data
    print("Patching Complete")
    return data.sort_values(by=['Date', 'Station', 'Pressure'])

# Upload Complete Dataset
---
Upload a complete dataset to the GoogleCloudStorage bucket and create daily backup

In [217]:
# -------------------------------------------------------------------------------
# UPLOAD WEATHER DATA
# Saves data to cloud bucket
# -------------------------------------------------------------------------------
def upload_weather_data(data):
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # SAVE DATA TO GOOGLE CLOUD
    print("Saving data to Google Cloud Bucket")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  

    # Connect to Google Cloud Storage
    # -------------------------------
    storage_client = storage.Client()

    # Open Bucket
    # -----------
    bucket_name = 'weather_aurorabc'
    bucket = storage_client.bucket(bucket_name)

    # Save today's backup
    # -------------------
    mybuffer = io.BytesIO()
    data.to_csv(mybuffer, index=False)

    blob = bucket.blob('backups/weather_ampm-{}.csv'.format(datetime.today().date()))
    blob.upload_from_string(mybuffer.getvalue())

    # Update weather.csv
    # ------------------
    blob = bucket.blob('weather_ampm.csv')
    blob.upload_from_string(mybuffer.getvalue())

---
# Main
---

## Import Data

In [329]:
# Import existing weather data from Google Cloud
# ----------------------------------------------
weather_data = import_weather_data()
print(weather_data.size)

235200


In [330]:
for index, row in weather_data.iterrows():
    print("{}\n".format(row))

Date                         2020-01-01
Station                      Port Hardy
Time                                 AM
Pressure                            700
Height                           2856.0
Temp                               -4.3
DewPoint                           -5.8
Relative_Humidity                  89.0
Mean_Mixed_Layer                   3.56
Wind_Direction                    250.0
Wind_Speed                       87.044
Potential_Temp                    297.7
Equivalent_Potential_Temp         308.6
Virtual_Potential_Temp            298.3
Name: 0, dtype: object

Date                         2020-01-01
Station                      Port Hardy
Time                                 AM
Pressure                            850
Height                           1301.0
Temp                                3.4
DewPoint                            1.9
Relative_Humidity                  90.0
Mean_Mixed_Layer                   5.19
Wind_Direction                    250.0
Wind_Speed      

Date                         2020-06-01
Station                      Port Hardy
Time                                 PM
Pressure                            700
Height                              NaN
Temp                                NaN
DewPoint                            NaN
Relative_Humidity                   NaN
Mean_Mixed_Layer                    NaN
Wind_Direction                      NaN
Wind_Speed                          NaN
Potential_Temp                      NaN
Equivalent_Potential_Temp           NaN
Virtual_Potential_Temp              NaN
Name: 1826, dtype: object

Date                         2020-06-01
Station                      Port Hardy
Time                                 PM
Pressure                            850
Height                              NaN
Temp                                NaN
DewPoint                            NaN
Relative_Humidity                   NaN
Mean_Mixed_Layer                    NaN
Wind_Direction                      NaN
Wind_Speed   

Date                         2020-07-23
Station                          Vernon
Time                                 PM
Pressure                            700
Height                              NaN
Temp                                NaN
DewPoint                            NaN
Relative_Humidity                   NaN
Mean_Mixed_Layer                    NaN
Wind_Direction                      NaN
Wind_Speed                          NaN
Potential_Temp                      NaN
Equivalent_Potential_Temp           NaN
Virtual_Potential_Temp              NaN
Name: 2458, dtype: object

Date                         2020-07-23
Station                          Vernon
Time                                 PM
Pressure                            850
Height                              NaN
Temp                                NaN
DewPoint                            NaN
Relative_Humidity                   NaN
Mean_Mixed_Layer                    NaN
Wind_Direction                      NaN
Wind_Speed   

Date                         2020-11-26
Station                      Quillayute
Time                                 PM
Pressure                            850
Height                              NaN
Temp                                NaN
DewPoint                            NaN
Relative_Humidity                   NaN
Mean_Mixed_Layer                    NaN
Wind_Direction                      NaN
Wind_Speed                          NaN
Potential_Temp                      NaN
Equivalent_Potential_Temp           NaN
Virtual_Potential_Temp              NaN
Name: 3967, dtype: object

Date                         2020-11-26
Station                          Vernon
Time                                 AM
Pressure                            700
Height                              NaN
Temp                                NaN
DewPoint                            NaN
Relative_Humidity                   NaN
Mean_Mixed_Layer                    NaN
Wind_Direction                      NaN
Wind_Speed   

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Find Missing Data

In [331]:
# Find missing or 'nan' values within current dataset
# ---------------------------------------------------
missing_data = find_missing_vals(weather_data)
print(len(missing_data))

1315


In [332]:
for row in missing_data:
    print(row)
    print("\n")

{'date': '2020-01-03', 'stations': [{'id': 73033, 'name': 'Vernon', 'pressures': ['700']}, {'id': 73033, 'name': 'Vernon', 'pressures': ['850']}]}


{'date': '2020-01-25', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['850']}, {'id': 71109, 'name': 'Port Hardy', 'pressures': ['850']}]}


{'date': '2020-01-30', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['850']}]}


{'date': '2020-02-03', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['700']}, {'id': 72797, 'name': 'Quillayute', 'pressures': ['850']}]}


{'date': '2020-02-18', 'stations': [{'id': 71109, 'name': 'Port Hardy', 'pressures': ['700']}, {'id': 71109, 'name': 'Port Hardy', 'pressures': ['850']}]}


{'date': '2020-02-20', 'stations': [{'id': 71109, 'name': 'Port Hardy', 'pressures': ['850']}]}


{'date': '2020-03-06', 'stations': [{'id': 73033, 'name': 'Vernon', 'pressures': ['850']}]}


{'date': '2020-03-09', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['850

## Get New Data

In [333]:
# Scrape weather balloons for new data within defined date range
# --------------------------------------------------------------
#new_data = scrape_data(dates)
new_data = scrape_data(missing_data[:25])
print(new_data.size)

Creating list of URLs to be scraped
Creating dates list...
Extracting data from list of URLs
on 10: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=02&FROM=0300&TO=0311&STNM=72797&REPLOT=1
on 12: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=02&FROM=0300&TO=0311&STNM=72797&REPLOT=1
on 25: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=03&FROM=2012&TO=2023&STNM=72797&REPLOT=1
on 27: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=03&FROM=2012&TO=2023&STNM=72797&REPLOT=1
on 41: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=03&FROM=3012&TO=3023&STNM=71109&REPLOT=1
on 43: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=03&FROM=3012&TO=3023&STNM=71109&REPLOT=1
on 44: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MO

KeyboardInterrupt: 

In [None]:
new_data.dropna(subset=['Temp', 'Wind_Speed'])
new_data.head(10)

### Standardize DataTypes

In [None]:
new_data = standardize_dtypes(new_data)
new_data.head(10)

In [None]:
new_data.dtypes

In [None]:
# Convert wind speed from Knots to KM/H
# -------------------------------------
new_data['Wind_Speed'] = new_data['Wind_Speed'].multiply(1.852)
new_data.head(10)

## Combine New and Old Data

In [None]:
# Combine existing weather data with newly extracted data
# -------------------------------------------------------
#complete_data = combine_data(weather_data, new_data)
complete_data = pd.concat([weather_data, new_data])
complete_data = complete_data.sort_values(by=['Date', 'Station', 'Time', 'Pressure'])
complete_data.head(-20)

In [None]:
print('Weather Data: {}\nMissing Data: {}\nNew Data: {}\nComplete Data: {}'
      .format(weather_data.size, len(missing_data), new_data.size, complete_data.size))

In [None]:
if True in complete_data.duplicated(subset=['Date', 'Station', 'Time', 'Pressure']).to_list():
    print("Duplicates Found")
    complete_data = complete_data.drop_duplicates(subset=['Date', 'Station', 'Time', 'Pressure'], keep='last')


In [None]:
print('Weather Data: {}\nMissing Data: {}\nNew Data: {}\nComplete Data: {}'
      .format(weather_data.size, len(missing_data), new_data.size, complete_data.size))

In [None]:
for index, row in complete_data.iterrows():
    print("{}\n".format(row))

## Patch Dataset

In [None]:
# Patch dataset with nan values for any missing fields
# ----------------------------------------------------
final_data = patch_dataset(complete_data)
final_data = final_data.sort_values(by=['Date', 'Station', 'Time', 'Pressure'])
print("Final Data: {}".format(final_data.size))

In [None]:
final_data.head(-100)

## Upload Data

In [None]:
# Upload clean data to bucket
# ---------------------------
upload_weather_data(complete_data)

In [None]:
# Up to 2020-03-27

---
# End
---