# Example Notebook for blogpost 3 Tips for First-Time Machine Learning Projects


## 1. Shared Data Folder in Colab
This code will only work if the notebook is executed inside Google Colab and the file: 'Station-Inventory.csv.gz' has been added inside a 'Shared-Data' folder.

In [2]:
from google.colab import drive
import pandas as pd

# Make Google Drive available inside noteboook
drive.mount('/content/gdrive')

# Path containing all data used in the project
root_path = '/content/gdrive/My Drive/Shared-Data/'

# Path of specific notebook needed for this notebook
path = root_path + 'Station-Inventory.csv.gz'

# Read in gzip file as pandas dataframe
weather_stations = pd.read_csv(path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
len(weather_stations)

8766

## 2. aiohttp to speed up data acquisition

In [5]:
!pip install aiohttp

Collecting aiohttp
[?25l  Downloading https://files.pythonhosted.org/packages/7c/39/7eb5f98d24904e0f6d3edb505d4aa60e3ef83c0a58d6fe18244a51757247/aiohttp-3.6.2-cp36-cp36m-manylinux1_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 1.7MB/s 
[?25hCollecting idna-ssl>=1.0; python_version < "3.7"
  Downloading https://files.pythonhosted.org/packages/46/03/07c4894aae38b0de52b52586b24bf189bb83e4ddabfe2e2c8f2419eec6f4/idna-ssl-1.1.0.tar.gz
Collecting yarl<2.0,>=1.0
[?25l  Downloading https://files.pythonhosted.org/packages/a0/b4/2cbeaf2c3ea53865d9613b315fe24e78c66acedb1df7e4be4e064c87203b/yarl-1.5.1-cp36-cp36m-manylinux1_x86_64.whl (257kB)
[K     |████████████████████████████████| 266kB 10.5MB/s 
[?25hCollecting multidict<5.0,>=4.5
[?25l  Downloading https://files.pythonhosted.org/packages/1a/95/f50352b5366e7d579e8b99631680a9e32e1b22adfa1629a8f23b1d22d5e2/multidict-4.7.6-cp36-cp36m-manylinux1_x86_64.whl (148kB)
[K     |████████████████████████████████| 153kB 10.8MB/s 

In [6]:
import io
import time
import asyncio
import aiohttp
import nest_asyncio
nest_asyncio.apply()

In [7]:
async def fetch_dataframe(session, url):
  """ Asynchronously fetches a csv file using the url """
  async with session.get(url) as response:
    byte_code = await response.content.read()
    df = pd.read_csv(io.BytesIO(byte_code), encoding='utf8')
    return df
      
async def fetch_concurrent(urls):
  """ Make concurrent calls """
  loop = asyncio.get_event_loop()
  async with aiohttp.ClientSession() as session:
    tasks = []
    for url in urls:
      tasks.append(loop.create_task(fetch_dataframe(session, url)))

    all_dfs = []

    # Loop through each completed task
    for df in asyncio.as_completed(tasks):
      df = await df
      all_dfs.append(df)

    # Concatenate all fetched dataframes into one
    return pd.concat(all_dfs, ignore_index=True)

In [8]:
def get_url(stationID, year, month):
    """ Gets url for downloading weather csv """
    url = "http://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&" +\
          f"stationID={stationID}&Year={year}&Month={month}&Day=14&timeframe=1" +\
          "&submit=Download+Data"
    return url

In [9]:
tic = time.time()
urls = [get_url(stationID=31468, year=year, month=month) for month in range(1,13) for year in range(2010, 2019)]
df = asyncio.run(fetch_concurrent(urls))
toc = time.time()
print(toc-tic)
df.head()

38.51241493225098


Unnamed: 0,Longitude (x),Latitude (y),Station Name,Climate ID,Date/Time,Year,Month,Day,Time,Temp (°C),Temp Flag,Dew Point Temp (°C),Dew Point Temp Flag,Rel Hum (%),Rel Hum Flag,Wind Dir (10s deg),Wind Dir Flag,Wind Spd (km/h),Wind Spd Flag,Visibility (km),Visibility Flag,Stn Press (kPa),Stn Press Flag,Hmdx,Hmdx Flag,Wind Chill,Wind Chill Flag,Weather
0,-118.29,55.62,PEORIA AGDM,3075160,2014-12-01 00:00,2014,12,1,00:00,-25.6,,-28.2,,79.0,,8.0,,4.0,,,,,M,,,-30.0,,
1,-118.29,55.62,PEORIA AGDM,3075160,2014-12-01 01:00,2014,12,1,01:00,-26.9,,-29.7,,77.0,,6.0,,3.0,,,,,M,,,-30.0,,
2,-118.29,55.62,PEORIA AGDM,3075160,2014-12-01 02:00,2014,12,1,02:00,-26.4,,-29.2,,78.0,,1.0,,6.0,,,,,M,,,-32.0,,
3,-118.29,55.62,PEORIA AGDM,3075160,2014-12-01 03:00,2014,12,1,03:00,-27.1,,-30.2,,75.0,,33.0,,3.0,,,,,M,,,-30.0,,
4,-118.29,55.62,PEORIA AGDM,3075160,2014-12-01 04:00,2014,12,1,04:00,-25.7,,-28.3,,79.0,,30.0,,5.0,,,,,M,,,-31.0,,


In [10]:
len(df)

78888

## 3. Vectorizing dataframe operations

Formula for converting relating humidity to absolute humiditiy: https://carnotcycle.wordpress.com/2012/08/04/how-to-convert-relative-humidity-to-absolute-humidity/

In [19]:
import math

def calculate_abs_humidity(temperature, relative_humidity):
  """ Calculates absolute humidity from temperature and relative humidity """
  t = temperature
  rh = relative_humidity
  abs_humidity = (6.112 * math.exp((17.67*t)/(t+243.5)) * rh * 2.1674)/(273.15+t)
  return abs_humidity

In [20]:
%%timeit -n 3
abs_humidity_list = []
for index, row in df.iterrows():
    temperature = row['Temp (°C)']
    rel_humidity = row['Rel Hum (%)']
    abs_humidity = calculate_abs_humidity(temperature, rel_humidity)
    abs_humidity_list.append(abs_humidity)

3 loops, best of 3: 8.99 s per loop


In [21]:
%%timeit -n 3
abs_humidity_series = df.apply(lambda row:        
    calculate_abs_humidity(row['Temp (°C)'], row['Rel Hum (%)']),         
    axis=1)

3 loops, best of 3: 2.01 s per loop


In [22]:
import numpy as np

In [24]:
%%timeit -n 3
abs_humidity_np = np.vectorize(calculate_abs_humidity)(
    df['Temp (°C)'], 
    df['Rel Hum (%)'])

3 loops, best of 3: 37 ms per loop


In [25]:
def calculate_abs_humidity_np(temperature, relative_humidity):
  """ Calculates absolute humidity from temperature and relative humidity """
  t = temperature
  rh = relative_humidity
  abs_humidity = (6.112 * np.exp((17.67*t)/(t+243.5)) * rh * 2.1674)/(273.15+t)
  return abs_humidity

In [26]:
%%timeit -n 3
calculate_abs_humidity_np(df['Temp (°C)'], df['Rel Hum (%)'])

3 loops, best of 3: 6.72 ms per loop
