In [425]:
import base64
import os, io
import requests
import pandas as pd
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import numpy as np
from math import floor
from pathlib import Path  
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
from windrose import WindroseAxes
from PIL import Image
from google.cloud import storage
import google.cloud.logging
import logging
from flask import Response

In [426]:
from alive_progress import alive_bar

In [427]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../service_account.json'

In [428]:
# -------------------------------------------------------------------------------
# LOGGING SETUP
# -------------------------------------------------------------------------------
client = google.cloud.logging.Client()
client.setup_logging()

In [429]:
# -------------------------------------------------------------------------------
# CREATE DATERANGE
# -------------------------------------------------------------------------------
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

In [430]:
# -------------------------------------------------------------------------------
# PATCH DATASET
# Function to create empty entries for missing data
# This results in a data entries for each expected date/location/pressure reading
# -------------------------------------------------------------------------------
def patch_dataset(data, start_date, end_date):
    dates = daterange(start_date, end_date)


    stations = ['Quillayute', 'Vernon', 'Port Hardy']
    pressures = [700, 850]
    idx = len(data)
    # Itterate through date range
    for date in dates:
        for station in stations:
            for pressure in pressures:
                # If data is missing, add empty values
                if data[(data['Date'] == date.strftime("%Y-%m-%d")) & (data['Station'] == station) & (data['Pressure'] == pressure)].empty:
                    #print("Missing data for {}:\n\tDate:{}\n\tPressure:{}\n".format(station, date.strftime("%Y-%m-%d"), pressure))
                    new_row = {
                        'Date': date.strftime("%Y-%m-%d"), 
                        'Station': station, 
                        'Pressure': pressure, 
                        'Height': np.nan,
                        'Temp': np.nan,
                        'DewPoint': np.nan,
                        'Relative_Humidity': np.nan,
                        'Mean_Mixed_Layer': np.nan,
                        'Wind_Direction': np.nan,
                        'Wind_Speed': np.nan,
                        'Potential_Temp': np.nan,
                        'Equivalent_Potential_Temp': np.nan,
                        'Virtual_Potential_Temp': np.nan}

                    data.loc[idx] = new_row
                    idx += 1
                    #data.loc[-1] = new_row

    # Return complete data
    print("Patching Complete")
    return data.sort_values(by=['Date', 'Station', 'Pressure'])

In [431]:
# -------------------------------------------------------------------------------
# IMPORT DATASET
# Function to import all weather data
# Data Patching completed in this step
# -------------------------------------------------------------------------------
def import_weather_data():
    # Connect to Google Cloud Storage
    # -------------------------------
    storage_client = storage.Client()

    # Open Bucket
    # -----------
    bucket_name = 'weather_aurorabc'
    bucket = storage_client.bucket(bucket_name)

    # Open Blob
    # ---------
    blob_name = 'weather.csv'
    #blob_name = 'backups/weather-2023-09-26.csv'
    blob = bucket.blob(blob_name)

    # Read Weather Data from Blob
    # ---------------------------
    with blob.open("r") as f:
        weather_data = pd.read_csv(f)

    # Cast all data variables to float
    weather_data = weather_data.astype(
        {'Height':'float',
        'Temp':'float',
        'DewPoint':'float',
        'Relative_Humidity':'float',
        'Mean_Mixed_Layer':'float',
        'Wind_Direction':'float',
        'Wind_Speed':'float',
        'Potential_Temp':'float',
        'Equivalent_Potential_Temp':'float',
        'Virtual_Potential_Temp':'float',
        })

    return weather_data

In [432]:
# -------------------------------------------------------------------------------
# UPLOAD WEATHER DATA
# Saves data to cloud bucket
# -------------------------------------------------------------------------------
def upload_weather_data(data):
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # SAVE DATA TO GOOGLE CLOUD
    print("Saving data to Google Cloud Bucket")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  

    # Connect to Google Cloud Storage
    # -------------------------------
    storage_client = storage.Client()

    # Open Bucket
    # -----------
    bucket_name = 'weather_aurorabc'
    bucket = storage_client.bucket(bucket_name)

    # Save today's backup
    # -------------------
    mybuffer = io.BytesIO()
    data.to_csv(mybuffer, index=False)

    blob = bucket.blob('backups/weather-{}.csv'.format(datetime.today().date()))
    blob.upload_from_string(mybuffer.getvalue())

    # Update weather.csv
    # ------------------
    blob = bucket.blob('weather.csv')
    blob.upload_from_string(mybuffer.getvalue())

In [433]:
def scrape_data(dates):
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # GENERATE LIST OF URLS TO BE SCRAPED
    print("Creating list of URLs to be scraped")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    base_url = "http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST"

    stations = [
        {"id": 72797, "name": "Quillayute"},
        {"id": 73033, "name": "Vernon"},
        {"id": 71109, "name": "Port Hardy"}]

    print("Creating dates list...")

    # Create list of URLs for each station
    # ------------------------------------
    urls = []
    dates_idx = []
    station_idx = []
    for d in dates:
        d = d.split('-')
        for station in stations:
            url = base_url
            url += "&YEAR={}".format(d[0])
            url += "&MONTH={}".format(d[1])
            #url += "&FROM=" + str(now[1]) + str(now[2])
            #url += "&TO=" + str(now[1]) + str(now[2])
            url += "&FROM={:0>2d}00".format(int(d[2]))
            url += "&TO={:0>2d}23".format(int(d[2]))
            url += "&STNM={}".format(station["id"])
            url += "&REPLOT=1"
            urls.append(url)

            dates_idx.append("%02d-%02d-%02d" % (int(d[0]), int(d[1]), int(d[2])))
            station_idx.append(station["name"])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # EXTRACT DATA FROM URLS
    print("Extracting data from list of URLs")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    #print("Scraping URLS...")
    #print(urls)


    data = []
    problem_urls = []
    for i, url in enumerate(urls):
        search = 0
        while search >= 0:
            # Scrape site data
            # ~~~~~~~~~~~~~~~~
            search += 1
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            results = str(soup.find('pre'))
            sevhundy = results[results.find("700"):].split()[:11]
            eightfiddy = results[results.find("850"):].split()[:11]

            # Catch erroneous reads
            # Site can hit too many requests
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            if 'e' not in sevhundy[0] and 'e' not in eightfiddy[0]:
                search = -1
            elif search == 10:
                print(url)
                problem_urls.append(url)
                search = -1

        # Extract 700 first // 850 second
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        data.append(sevhundy)
        data[-1].insert(0, dates_idx[i])
        data[-1].insert(1, station_idx[i])

        data.append(eightfiddy)
        data[-1].insert(0, dates_idx[i])
        data[-1].insert(1, station_idx[i])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # CLEAN EXTRACTED DATA
    print("Cleaning extracted data")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # Define data cols
    # ----------------
    cols = ["Date", 
        "Station", 
        "Pressure", 
        "Height", 
        "Temp", 
        "DewPoint", 
        "Relative_Humidity", 
        "Mean_Mixed_Layer", 
        "Wind_Direction", 
        "Wind_Speed", 
        "Potential_Temp",
        "Equivalent_Potential_Temp",
        "Virtual_Potential_Temp"]

    # Remove incomplete rows from dataset
    # -----------------------------------
    data = [row for row in data if len(row) >= len(cols)]

    # Remove rogue "e" values from pressure field
    # -------------------------------------------
    pressures = [700, 850]
    ctr = 0
    for item in data:
        if(not str(item[2]).isnumeric()):
            item[2] = pressures[ctr%2]
        ctr += 1

    # Convert data to Pandas DataFrame
    # --------------------------------
    new_data = pd.DataFrame(data, columns=cols)
    return new_data

In [434]:
def scrape_data_2(missing_vals):
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # GENERATE LIST OF URLS TO BE SCRAPED
    print("Creating list of URLs to be scraped")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    base_url = "http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST"

    '''
    x = {
        'date': '2020-01-01', 
        'stations': [
            {'id': 72797, 'name': 'Quillayute', 'pressures': ['700']}, 
            {'id': 73033, 'name': 'Vernon', 'pressures': ['850']}, 
            {'id': 71109, 'name': 'Port Hardy', 'pressures': []}]
        }
    '''
    print("Creating dates list...")

    # Create list of URLs for each station
    # ------------------------------------
    urls = []
    dates_idx = []
    station_idx = []
    for val in missing_vals:
        d = val['date'].split('-')
        for station in val['stations']:
            url = base_url
            url += "&YEAR={}".format(d[0])
            url += "&MONTH={}".format(d[1])
            #url += "&FROM=" + str(now[1]) + str(now[2])
            #url += "&TO=" + str(now[1]) + str(now[2])
            url += "&FROM={:0>2d}00".format(int(d[2]))
            url += "&TO={:0>2d}23".format(int(d[2]))
            url += "&STNM={}".format(station["id"])
            url += "&REPLOT=1"
            urls.append(url)

            dates_idx.append("%02d-%02d-%02d" % (int(d[0]), int(d[1]), int(d[2])))
            station_idx.append(station["name"])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # EXTRACT DATA FROM URLS
    print("Extracting data from list of URLs")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    #print("Scraping URLS...")
    #print(urls)


    data = []
    problem_urls = []
    with alive_bar(len(urls), force_tty=True) as bar:

        for i, url in enumerate(urls):
            search = 0
            while search >= 0:
                # Scrape site data
                # ~~~~~~~~~~~~~~~~
                search += 1
                page = requests.get(url)
                soup = BeautifulSoup(page.content, "html.parser")
                results = str(soup.find('pre'))
                sevhundy = results[results.find("700"):].split()[:11]
                eightfiddy = results[results.find("850"):].split()[:11]

                # Catch erroneous reads
                # Site can hit too many requests
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                if 'e' not in sevhundy[0] and 'e' not in eightfiddy[0]:
                    search = -1
                elif search == 10:
                    print(url)
                    problem_urls.append(url)
                    search = -1

            # Extract 700 first // 850 second
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            data.append(sevhundy)
            data[-1].insert(0, dates_idx[i])
            data[-1].insert(1, station_idx[i])

            data.append(eightfiddy)
            data[-1].insert(0, dates_idx[i])
            data[-1].insert(1, station_idx[i])
            
            bar()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # CLEAN EXTRACTED DATA
    print("Cleaning extracted data")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # Define data cols
    # ----------------
    cols = ["Date", 
        "Station", 
        "Pressure", 
        "Height", 
        "Temp", 
        "DewPoint", 
        "Relative_Humidity", 
        "Mean_Mixed_Layer", 
        "Wind_Direction", 
        "Wind_Speed", 
        "Potential_Temp",
        "Equivalent_Potential_Temp",
        "Virtual_Potential_Temp"]

    # Remove incomplete rows from dataset
    # -----------------------------------
    data = [row for row in data if len(row) >= len(cols)]

    # Remove rogue "e" values from pressure field
    # -------------------------------------------
    pressures = [700, 850]
    ctr = 0
    for item in data:
        if(not str(item[2]).isnumeric()):
            item[2] = pressures[ctr%2]
        ctr += 1

    # Convert data to Pandas DataFrame
    # --------------------------------
    new_data = pd.DataFrame(data, columns=cols)
    return new_data

In [435]:
weather_data = import_weather_data()
for index, row in weather_data.iterrows():
    print("{}\n".format(row))

Date                         2020-01-01
Station                      Port Hardy
Pressure                            700
Height                           2856.0
Temp                               -4.3
DewPoint                           -5.8
Relative_Humidity                  89.0
Mean_Mixed_Layer                   3.56
Wind_Direction                    250.0
Wind_Speed                       87.044
Potential_Temp                    297.7
Equivalent_Potential_Temp         308.6
Virtual_Potential_Temp            298.3
Name: 0, dtype: object

Date                         2020-01-01
Station                      Port Hardy
Pressure                            850
Height                           1301.0
Temp                                3.4
DewPoint                            1.9
Relative_Humidity                  90.0
Mean_Mixed_Layer                   5.19
Wind_Direction                    250.0
Wind_Speed                       59.264
Potential_Temp                    289.7
Equivalent_Poten

Date                         2022-03-10
Station                      Port Hardy
Pressure                            700
Height                           3033.0
Temp                               -8.1
DewPoint                          -55.1
Relative_Humidity                   1.0
Mean_Mixed_Layer                   0.03
Wind_Direction                      5.0
Wind_Speed                         46.3
Potential_Temp                    293.5
Equivalent_Potential_Temp         293.6
Virtual_Potential_Temp            293.5
Name: 1973, dtype: object

Date                         2022-03-11
Station                      Port Hardy
Pressure                            700
Height                           3079.0
Temp                               -6.9
DewPoint                           -8.9
Relative_Humidity                  86.0
Mean_Mixed_Layer                    2.8
Wind_Direction                    280.0
Wind_Speed                       38.892
Potential_Temp                    294.8
Equivalent_Po

In [436]:
def find_missing_vals(weather_data):
    missing_data = []
    stations = [
        {"id": 72797, "name": "Quillayute"},
        {"id": 73033, "name": "Vernon"},
        {"id": 71109, "name": "Port Hardy"}]
    pressures = [700, 850]
    start_date = date(2020, 1, 1)
    end_date = datetime.today().date()
    dates = daterange(start_date, end_date)
    
    # Loop through all dates from Jan 1st 2020 until today
    # For each date in the range, find the station / pressure combinations that are missing
    for i, d in enumerate(dates):
        missing_data.append({
            'date': str(d),
            'stations': []
        })
        for j, s in enumerate(stations):
            # Create empty array of pressures for each station at this date value
            missing_data[i]['stations'].append({'id': s['id'], 'name': s['name'], 'pressures': []}) 
            for p in pressures:
                x = weather_data.loc[(weather_data['Date'] == str(d)) & (weather_data['Station'] == str(s['name'])) & (weather_data['Pressure'] == int(p))]

                # If there is no record in the dataset for this date...
                if x.empty:
                    missing_data[i]['stations'][j]['pressures'].append(str(p))
                    #missing_data[i][s['id']]['pressures'].append(str(p))
                    '''
                    missing_data.append({
                        "Date": str(d),
                        "Station": s,
                        "Pressure": p})
                    '''

                    break

                # If there are null values for Temp or WindSpeed...
                if x['Temp'].empty or x['Wind_Speed'].empty:
                    missing_data[i]['stations'][j]['pressures'].append(str(p))
                    #missing_data[i][s['id']]['pressures'].append(str(p))
                    '''
                    missing_data.append({
                        "Date": str(d),
                        "Station": s,
                        "Pressure": p})
                    break
                    '''
                    
    return missing_data

In [437]:
missing_data = find_missing_vals(weather_data)
print(len(missing_data))

1385


In [438]:
for row in missing_data:
    print(row)
    print("\n")

{'date': '2020-01-01', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['700']}, {'id': 73033, 'name': 'Vernon', 'pressures': ['850']}, {'id': 71109, 'name': 'Port Hardy', 'pressures': []}]}


{'date': '2020-01-02', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['850']}, {'id': 73033, 'name': 'Vernon', 'pressures': ['850']}, {'id': 71109, 'name': 'Port Hardy', 'pressures': ['700']}]}


{'date': '2020-01-03', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['700']}, {'id': 73033, 'name': 'Vernon', 'pressures': ['700']}, {'id': 71109, 'name': 'Port Hardy', 'pressures': ['850']}]}


{'date': '2020-01-04', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': []}, {'id': 73033, 'name': 'Vernon', 'pressures': ['850']}, {'id': 71109, 'name': 'Port Hardy', 'pressures': ['850']}]}


{'date': '2020-01-05', 'stations': [{'id': 72797, 'name': 'Quillayute', 'pressures': ['700']}, {'id': 73033, 'name': 'Vernon', 'pressures': ['700']}, {'id': 71109

In [440]:
'''
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CREATING RANGE OF DATES TO BE SCRAPED
print("Creating range of dates to be scraped...\n")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Create Date Range
# -----------------
delta = timedelta(days=1)
start_date = date(2020, 1, 1)
end_date = datetime.today().date()
new_dates = daterange(start_date, end_date)


# Remove dates from list that are alrady populated in weather_data
# ----------------------------------------------------------------
for d in weather_data['Date'].unique().tolist():
    if d in new_dates:
        new_dates.remove(d)
        
incomplete_dates = weather_data[weather_data['Temp'].isna()]['Date']

dates = list(new_dates) + incomplete_dates.to_list()

print("Incomplete dates: {}\n".format(incomplete_dates))
print("Dates: {}\n".format(dates))
'''

'\n# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n# CREATING RANGE OF DATES TO BE SCRAPED\nprint("Creating range of dates to be scraped...\n")\n# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n# Create Date Range\n# -----------------\ndelta = timedelta(days=1)\nstart_date = date(2020, 1, 1)\nend_date = datetime.today().date()\nnew_dates = daterange(start_date, end_date)\n\n\n# Remove dates from list that are alrady populated in weather_data\n# ----------------------------------------------------------------\nfor d in weather_data[\'Date\'].unique().tolist():\n    if d in new_dates:\n        new_dates.remove(d)\n        \nincomplete_dates = weather_data[weather_data[\'Temp\'].isna()][\'Date\']\n\ndates = list(new_dates) + incomplete_dates.to_list()\n\nprint("Incomplete dates: {}\n".format(incomplete_dates))\nprint("Dates: {}\n".format(dates))\n'

In [None]:
# Scrape weather balloons for new data within defined date range
# --------------------------------------------------------------
#new_data = scrape_data(dates)
new_data = scrape_data_2(missing_data)

# Convert DataFrame to numeric values
# -----------------------------------
for col in cols[2:]:
    try:
        new_data[col] = pd.to_numeric(new_data[col])
    except Exception as e:
        print("{}: {}".format(col, e))

# Convert wind speed from Knots to KM/H
# -------------------------------------
new_data['Wind_Speed'] = new_data['Wind_Speed'].multiply(1.852)
'''
print("NEW DATA ({})".format(new_data.shape[0]))
for index, row in new_data.iterrows():
    print("{}\n".format(row))
'''

    
# Combine New and Old Data
# ------------------------
complete_data = pd.concat([weather_data, new_data])
#complete_data = complete_data.sort_values(by=['Date'])
print("COMPLETE DATA ({})".format(complete_data.shape[0]))


# Remove values that are exceptionally high or low
# ------------------------------------------------
for col in cols[2:]:
    q_low = complete_data[col].quantile(0.01)
    q_hi = complete_data[col].quantile(0.99)

    complete_clean_data = complete_data[(complete_data[col] < q_hi) & (complete_data[col] > q_low)]

final_data = patch_dataset(complete_clean_data, start_date, end_date)
print("FINAL DATA ({})".format(final_data.shape[0]))

Creating list of URLs to be scraped
Creating dates list...
Extracting data from list of URLs
on 413: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=05&FROM=1700&TO=1723&STNM=71109&REPLOT=1
on 829: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=10&FROM=0300&TO=0323&STNM=73033&REPLOT=1
on 832: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=10&FROM=0400&TO=0423&STNM=73033&REPLOT=1
on 835: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=10&FROM=0500&TO=0523&STNM=73033&REPLOT=1
on 843: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=10&FROM=0800&TO=0823&STNM=72797&REPLOT=1
on 846: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=10&FROM=0900&TO=0923&STNM=72797&REPLOT=1
on 994: http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=

In [None]:
# Upload clean data to bucket
# ---------------------------
upload_weather_data(final_data)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# COMPLETE
print("Successfully completed scrape")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  

In [416]:
complete_clean_data.head(10)

Unnamed: 0,Date,Station,Pressure,Height,Temp,DewPoint,Relative_Humidity,Mean_Mixed_Layer,Wind_Direction,Wind_Speed,Potential_Temp,Equivalent_Potential_Temp,Virtual_Potential_Temp
0,2020-01-01,Port Hardy,700,2856.0,-4.3,-5.8,89.0,3.56,250.0,87.044,297.7,308.6,298.3
1,2020-01-01,Port Hardy,850,1301.0,3.4,1.9,90.0,5.19,250.0,59.264,289.7,304.8,290.6
2,2020-01-01,Quillayute,700,2969.0,-0.9,-0.9,100.0,5.14,255.0,100.008,301.5,317.2,302.4
3,2020-01-01,Quillayute,850,1393.0,6.2,6.2,100.0,7.04,260.0,81.488,292.6,313.1,293.9
4,2020-01-01,Vernon,700,2880.0,-5.5,-7.0,89.0,3.25,260.0,94.452,296.4,306.3,296.9
5,2020-01-02,Quillayute,700,2946.0,-10.7,-23.7,34.0,0.81,260.0,50.004,290.6,293.2,290.8
6,2020-01-02,Vernon,700,2882.0,-9.5,-12.3,80.0,2.14,305.0,31.484,291.9,298.5,292.3
7,2020-01-02,Vernon,850,1351.0,2.2,-4.8,60.0,3.16,245.0,33.336,288.4,297.8,289.0
8,2020-01-03,Port Hardy,700,2889.0,-2.9,-4.7,87.0,3.87,245.0,92.6,299.2,311.1,299.9
9,2020-01-03,Quillayute,700,2974.0,-2.3,-2.7,97.0,4.5,270.0,90.748,299.9,313.6,300.7


In [None]:
final_data.tail(10)

In [285]:
weather_data = import_weather_data()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CREATING RANGE OF DATES TO BE SCRAPED
print("Creating range of dates to be scraped")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Create Date Range
# -----------------
delta = timedelta(days=1)
start_date = date(2020, 1, 1)
end_date = datetime.today().date()
dates = daterange(start_date, end_date)





Creating range of dates to be scraped


In [288]:
print(weather_data['Date'])

0       2020-01-01
1       2020-01-01
2       2020-01-01
3       2020-01-01
4       2020-01-01
           ...    
8227    2023-10-03
8228    2023-10-03
8229    2023-10-03
8230    2023-10-03
8231    2023-10-03
Name: Date, Length: 8232, dtype: object


In [289]:
#incomplete_data = weather_data['Temp'].isnull()
incomplete_dates = weather_data[weather_data['Temp'].isna()]['Date']
print(incomplete_dates)

5       2020-01-01
6       2020-01-02
7       2020-01-02
9       2020-01-02
13      2020-01-03
           ...    
8227    2023-10-03
8228    2023-10-03
8229    2023-10-03
8230    2023-10-03
8231    2023-10-03
Name: Date, Length: 3893, dtype: object


In [290]:
for index, row in incomplete_data.iterrows():
    print("{}\n".format(row))

AttributeError: 'Series' object has no attribute 'iterrows'

In [220]:
incomplete_dates.head(100)

Unnamed: 0,Date,Station,Pressure,Height,Temp,DewPoint,Relative_Humidity,Mean_Mixed_Layer,Wind_Direction,Wind_Speed,Potential_Temp,Equivalent_Potential_Temp,Virtual_Potential_Temp
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,False,False,False,False,False,False,False,False,False,False,False,False,False
96,False,False,False,False,False,False,False,False,False,False,False,False,False
97,False,False,False,False,False,False,False,False,False,False,False,False,False
98,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Remove dates from list that are alrady populated in weather_data
# ----------------------------------------------------------------
for d in weather_data['Date'].unique().tolist():
    if d in dates:
        dates.remove(d)