# Weather Data Preprocessing

## Set up

In [1]:
%matplotlib inline

import logging
import itertools
import json
import os
import pickle
import urllib2
import folium
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from src.data.parse_dataset import parse_json_files, parse_json_file, get_file_list, parse_dir
from IPython.display import Image
from datetime import date

logger = logging.getLogger()
logger.setLevel(logging.INFO)

## Data Collection

### Single Weather Station Approach

Set the time period for which data will be collected

In [3]:
start_date = date(2016, 5, 16)
end_date = date(2016, 6, 13)

days = pd.date_range(start=start_date, end=end_date, closed='left')

Download the data from Wunderground 

In [4]:
def download(url_string, file_name):
    """Download the given resource to the given file"""
    
    response = urllib2.urlopen(url_string)
    with open(file_name, "wb") as f:
        f.write(response.read())

In [28]:
path = '/home/jfconavarrete/Documents/Work/Dissertation/spts-uoe/data/raw/weather'
lhr_url = 'http://api.wunderground.com/api/8494fbcae3235601/history_%s/q/UK/London.json'

# iterate through all days and stations
for i,day in enumerate(days):
    url_string = lhr_url % (day.strftime('%Y%m%d'))
    file_name = '%s/WEATHER-%s.json' % (path, day.strftime('%Y-%m-%d'))
    
    logger.info('Downloading %s', url_string)
    download(url_string, file_name)
    
    # sleep 60 seconds every 10 requests due to API restrictions 
    if (i % 10) == 9:
        time.sleep(60)

## Data Parsing

Parse the JSON data files

In [2]:
def parse_weather(json_obj):
    """Parses Wunderground API History JSON response"""
    
    return [parse_observation(element) for element in json_obj['history']['observations']]

def parse_observation(observation):
    """Parses a JSON observation object to a dictionary"""
    
    reading = {
        'Timestamp': observation['utcdate']['pretty'],
        'Temp': observation['tempm'],
        'DewPt': observation['dewptm'],
        'Humidity': observation['hum'],
        'WindSpeed': observation['wspdm'],
        'WindDirD': observation['wdird'],
        'WindDirE': observation['wdire'],
        'Visibility': observation['vism'],
        'Pressure': observation['pressurem'],
        'WindChill': observation['windchillm'],
        'Precipitation': observation['precipm'],
        'Condition': observation['conds'],
        'Fog': observation['fog'],
        'Rain': observation['rain'],
        'Snow': observation['snow'],
        'Hail': observation['hail'],
        'Thunder': observation['thunder'],
        'Tornado': observation['tornado'],
    }

    return reading

In [3]:
def get_file_date(file_name):
    """Gets the file's date"""
    
    file_basename = os.path.basename(file_name)
    idx = file_basename.find('-')
    file_date = file_basename[idx + 1:]
    return datetime.strptime(file_date, '%Y-%m-%d.json')

Convert the raw data to a Pandas DataFrame

In [303]:
records = parse_dir('/home/jfconavarrete/Documents/Work/Dissertation/spts-uoe/data/raw/weather', 
                    parse_weather, sort_fn=get_file_date)

weather_dataset = pd.DataFrame(list(itertools.chain.from_iterable(records)))

## Technically Correct Data

In [304]:
# replace missing values
replace_empty = lambda col: col.replace('T', np.nan).replace('N/A', np.nan).replace('', np.nan)
replace_na = lambda col: col.replace('-9999.00', np.nan).replace('-9999', np.nan).replace('-9999.0', np.nan).replace('-999', np.nan)
weather_dataset = weather_dataset.apply(replace_empty, axis=1).apply(replace_na, axis=1)

# convert columns to their appropriate datatypes
weather_dataset['Fog'] = weather_dataset['Fog'].astype('bool_')
weather_dataset['Hail'] = weather_dataset['Hail'].astype('bool_')
weather_dataset['Rain'] = weather_dataset['Rain'].astype('bool_')
weather_dataset['Snow'] = weather_dataset['Snow'].astype('bool_')
weather_dataset['Tornado'] = weather_dataset['Tornado'].astype('bool_')
weather_dataset['Thunder'] = weather_dataset['Snow'].astype('bool_')

weather_dataset['Precipitation'] = weather_dataset['Precipitation'].astype('float32')
weather_dataset['Visibility'] = weather_dataset['Visibility'].astype('float32')
weather_dataset['WindChill'] = weather_dataset['WindChill'].astype('float32')
weather_dataset['WindSpeed'] = weather_dataset['WindSpeed'].astype('float32')
weather_dataset['DewPt'] = weather_dataset['DewPt'].astype('float32')
weather_dataset['Humidity'] = weather_dataset['Humidity'].astype('float32')
weather_dataset['Pressure'] = weather_dataset['Pressure'].astype('float32')
weather_dataset['Temp'] = weather_dataset['Temp'].astype('float32')
weather_dataset['WindDirD'] = weather_dataset['WindDirD'].astype('float32')

weather_dataset['Timestamp'] =  pd.to_datetime(weather_dataset['Timestamp'], format='%I:%M %p %Z on %B %d, %Y', errors='raise')

In [305]:
weather_dataset.sort_values(by=['Timestamp'], inplace=True)

In [306]:
def expand_datetime(df, datetime_col):
    df['Weekday'] = df[datetime_col].apply(lambda x: x.dayofweek) # Monday=0, Sunday=6
    df['Minute'] = df[datetime_col].apply(lambda x: x.minute)
    df['Hour'] = df[datetime_col].apply(lambda x: x.hour)
    return df

expand_datetime(weather_dataset, 'Timestamp')

Unnamed: 0,Condition,DewPt,Fog,Hail,Humidity,Precipitation,Pressure,Rain,Snow,Temp,...,Timestamp,Tornado,Visibility,WindChill,WindDirD,WindDirE,WindSpeed,Weekday,Minute,Hour
0,Overcast,8.0,True,True,71.0,,1022.0,True,True,12.0,...,2016-05-15 23:00:00,True,19.0,,100.0,East,14.800000,6,0,23
1,Mostly Cloudy,7.0,True,True,72.0,,1021.0,True,True,12.0,...,2016-05-15 23:20:00,True,10.0,,120.0,ESE,11.100000,6,20,23
2,Unknown,8.0,True,True,82.0,,1021.0,True,True,11.0,...,2016-05-15 23:50:00,True,10.0,,120.0,ESE,9.300000,6,50,23
3,Mostly Cloudy,8.0,True,True,77.0,,1022.0,True,True,11.0,...,2016-05-16 00:00:00,True,18.0,,120.0,ESE,13.000000,0,0,0
4,Unknown,8.0,True,True,82.0,,1021.0,True,True,11.0,...,2016-05-16 00:20:00,True,10.0,,80.0,East,11.100000,0,20,0
5,Unknown,8.0,True,True,82.0,,1021.0,True,True,11.0,...,2016-05-16 00:50:00,True,10.0,,70.0,ENE,9.300000,0,50,0
6,Mostly Cloudy,8.0,True,True,77.0,,1022.0,True,True,11.0,...,2016-05-16 01:00:00,True,17.0,,60.0,ENE,9.300000,0,0,1
7,Unknown,7.0,True,True,82.0,,1021.0,True,True,10.0,...,2016-05-16 01:20:00,True,10.0,,80.0,East,7.400000,0,20,1
8,Unknown,8.0,True,True,87.0,,1021.0,True,True,10.0,...,2016-05-16 01:50:00,True,10.0,,80.0,East,5.600000,0,50,1
9,Scattered Clouds,8.0,True,True,84.0,,1021.0,True,True,10.0,...,2016-05-16 02:00:00,True,15.0,,80.0,East,5.600000,0,0,2


## Consistent Data

Include data just between 7:00 to 23:00

In [307]:
weather_dataset = weather_dataset.drop(weather_dataset[(weather_dataset['Hour'] < 7) | (weather_dataset['Hour'] > 22)].index)

In [308]:
weather_dataset.shape

(1334, 21)

In [309]:
weather_dataset.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1334 entries, 24 to 2004
Data columns (total 21 columns):
Condition        1286 non-null object
DewPt            1334 non-null float32
Fog              1334 non-null bool
Hail             1334 non-null bool
Humidity         1334 non-null float32
Precipitation    0 non-null float32
Pressure         1334 non-null float32
Rain             1334 non-null bool
Snow             1334 non-null bool
Temp             1334 non-null float32
Thunder          1334 non-null bool
Timestamp        1334 non-null datetime64[ns]
Tornado          1334 non-null bool
Visibility       1331 non-null float32
WindChill        0 non-null float32
WindDirD         1334 non-null float32
WindDirE         1334 non-null object
WindSpeed        1334 non-null float32
Weekday          1334 non-null int64
Minute           1334 non-null int64
Hour             1334 non-null int64
dtypes: bool(6), datetime64[ns](1), float32(9), int64(3), object(2)
memory usage: 334.0 KB


In [310]:
weather_dataset.head()

Unnamed: 0,Condition,DewPt,Fog,Hail,Humidity,Precipitation,Pressure,Rain,Snow,Temp,...,Timestamp,Tornado,Visibility,WindChill,WindDirD,WindDirE,WindSpeed,Weekday,Minute,Hour
24,Scattered Clouds,8.0,True,True,77.0,,1022.0,True,True,11.0,...,2016-05-16 07:00:00,True,13.0,,20.0,NNE,9.3,0,0,7
25,Partly Cloudy,7.0,True,True,76.0,,1022.0,True,True,11.0,...,2016-05-16 07:20:00,True,10.0,,30.0,NNE,9.3,0,20,7
26,Unknown,7.0,True,True,72.0,,1022.0,True,True,12.0,...,2016-05-16 07:50:00,True,10.0,,20.0,NNE,7.4,0,50,7
27,Scattered Clouds,7.0,True,True,64.0,,1022.0,True,True,12.0,...,2016-05-16 08:00:00,True,18.0,,20.0,NNE,9.3,0,0,8
28,Unknown,6.0,True,True,67.0,,1022.0,True,True,12.0,...,2016-05-16 08:20:00,True,10.0,,350.0,North,7.4,0,20,8


In [311]:
weather_dataset.describe()

Unnamed: 0,DewPt,Humidity,Precipitation,Pressure,Temp,Visibility,WindChill,WindDirD,WindSpeed,Weekday,Minute,Hour
count,1334.0,1334.0,0.0,1334.0,1334.0,1331.0,0.0,1334.0,1334.0,1334.0,1334.0,1334.0
mean,10.311094,68.200897,,1015.959534,16.035233,14.900301,,152.721146,13.319413,3.001499,23.470765,14.503748
std,3.038621,16.045976,,5.265908,3.459475,9.886402,,125.337108,5.407516,1.998874,20.529933,4.619464
min,-2.0,21.0,,1002.0,9.0,3.7,,0.0,1.9,0.0,0.0,7.0
25%,8.0,56.0,,1013.0,13.0,10.0,,30.0,9.3,1.0,0.0,10.0
50%,10.0,72.0,,1017.0,16.0,10.0,,120.0,13.0,3.0,20.0,15.0
75%,12.0,82.0,,1020.0,18.0,15.0,,260.0,16.700001,5.0,50.0,19.0
max,19.0,100.0,,1024.0,25.0,50.0,,360.0,33.299999,6.0,50.0,22.0


In [312]:
weather_dataset.apply(lambda x:x.nunique())

Condition          18
DewPt              20
Fog                 1
Hail                1
Humidity           75
Precipitation       0
Pressure           23
Rain                1
Snow                1
Temp               17
Thunder             1
Timestamp        1334
Tornado             1
Visibility         35
WindChill           0
WindDirD           37
WindDirE           17
WindSpeed          17
Weekday             7
Minute              3
Hour               16
dtype: int64

In [313]:
weather_dataset.isnull().sum()

Condition          48
DewPt               0
Fog                 0
Hail                0
Humidity            0
Precipitation    1334
Pressure            0
Rain                0
Snow                0
Temp                0
Thunder             0
Timestamp           0
Tornado             0
Visibility          3
WindChill        1334
WindDirD            0
WindDirE            0
WindSpeed           0
Weekday             0
Minute              0
Hour                0
dtype: int64

## Build