# Merge Datasets

## Set Up

In [188]:
%matplotlib inline

import logging
import itertools
import json
import os
import pickle
import folium
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from src.data.parse_dataset import parse_dir, parse_json_files, get_file_list
from IPython.display import Image
from datetime import date

# Configure the root logger (getLogger() without a name) so that messages
# at INFO level and above are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [189]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes instead of being left to the garbage collector.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project's own parsing step.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Load the three pre-parsed datasets produced by the earlier pipeline stages.
stations_dataset = _load_pickle('data/parsed/stations_dataset_final.p')
readings_dataset = _load_pickle('data/parsed/readings_dataset_final.p')
weather_dataset = _load_pickle('data/parsed/weather_dataset_final.p')

In [190]:
# Show (rows, columns) of each dataset. The call form of print with a single
# argument behaves the same on Python 2 and Python 3, unlike the
# Python 2-only print statement.
print(readings_dataset.shape)
print(stations_dataset.shape)
print(weather_dataset.shape)

(739335, 6)
(775, 13)
(1670, 15)


## Merge Readings and Weather

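For each reading, use binary search to find the weather observation whose timestamp is closest to the reading's timestamp. Binary search only gives the correct result if the weather timestamps are sorted in ascending order.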

In [191]:
def binarySearch(data, val):
    """Return the position of the element of ``data`` that is closest to ``val``.

    Parameters
    ----------
    data : pandas.Series
        Values sorted in ascending order. Elements are read positionally
        through ``data.iat``, so the Series' index labels are irrelevant.
    val : scalar
        Value to look for. It must be comparable with the elements of
        ``data`` and support ``abs(element - val)``.

    Returns
    -------
    int
        Zero-based position (not index label) of the closest element. An
        exact match is returned immediately. When two candidates are equally
        close, the one probed first is kept. ``0`` is returned when ``data``
        is empty, so callers must not index an empty ``data`` with it.
    """

    lo, hi = 0, len(data) - 1
    best_ind = lo
    while lo <= hi:
        # Floor division keeps ``mid`` an integer on Python 3 as well; with
        # ``/`` it becomes a float there and ``data.iat[mid]`` raises.
        mid = lo + (hi - lo) // 2
        current = data.iat[mid]
        if current < val:
            lo = mid + 1
        elif current > val:
            hi = mid - 1
        else:
            best_ind = mid
            break
        # The search probes both neighbours of the insertion point, so
        # remembering the closest probe so far yields the nearest element.
        if abs(current - val) < abs(data.iat[best_ind] - val):
            best_ind = mid
    return best_ind

In [192]:
# Look the weather timestamp column and its index up once, instead of once
# per reading inside the callback.
# NOTE(review): binarySearch relies on these timestamps being sorted in
# ascending order -- confirm this holds for weather_dataset.
weather_timestamps = weather_dataset['Timestamp']
weather_index = weather_timestamps.index

# For every reading, store the index label of the weather row whose
# timestamp is closest to the reading's timestamp.
readings_dataset['WeatherIdx'] = readings_dataset['Timestamp'].apply(
    lambda val: weather_index[binarySearch(weather_timestamps, val)]
)

In [193]:
# Attach to each reading the weather row whose index label was stored in
# 'WeatherIdx'. Both frames carry a 'Timestamp' column, so the merge suffixes
# them: 'Timestamp_x' is the reading's, 'Timestamp_y' the weather row's.
readings_weather = pd.merge(readings_dataset, weather_dataset, right_index=True, left_on='WeatherIdx')

# Absolute gap, in seconds, between a reading and its matched weather row.
# ``np`` is used directly because the ``pd.np`` alias is deprecated, and the
# vectorised ``.abs()`` replaces a per-row ``math.fabs`` call.
readings_weather['DifferenceS'] = (
    (readings_weather['Timestamp_x'] - readings_weather['Timestamp_y']) / np.timedelta64(1, 's')
).abs()

In [194]:
# Narrow frame with just the two timestamps and the gap between them, used
# below to inspect how close the matched weather rows are to the readings.
readings_weather_view = readings_weather.loc[:, ['Timestamp_x', 'Timestamp_y', 'DifferenceS']]

In [200]:
# Spot-check ten matched rows. The fixed seed makes the displayed sample the
# same on every run of the notebook.
readings_weather_view.sample(10, random_state=42)

Unnamed: 0,Timestamp_x,Timestamp_y,DifferenceS
604214,2016-06-13 17:28:17.703,2016-06-13 17:20:00,497.703
48405,2016-05-18 08:21:29.533,2016-05-18 08:20:00,89.533
17581,2016-05-16 18:06:46.650,2016-05-16 18:00:00,406.65
503521,2016-06-08 18:27:47.903,2016-06-08 18:20:00,467.903
650202,2016-06-15 17:53:29.147,2016-06-15 17:50:00,209.147
375393,2016-06-02 20:04:31.460,2016-06-02 20:00:00,271.46
265198,2016-05-27 18:28:37.773,2016-05-27 18:20:00,517.773
295902,2016-05-29 13:13:17.800,2016-05-29 13:20:00,402.2
700258,2016-06-17 19:34:12.770,2016-06-17 19:20:00,852.77
388478,2016-06-03 14:53:00.173,2016-06-03 14:50:00,180.173


In [201]:
# Summary statistics of the view, to check the distribution of the absolute
# reading-to-weather time gap in seconds ('DifferenceS').
readings_weather_view.describe()

Unnamed: 0,DifferenceS
count,739335.0
mean,351.707642
std,235.590068
min,0.013
25%,149.393
50%,306.127
75%,497.223
max,2392.833


In [197]:
# Keep the reading's timestamp under its original name and discard the
# helper columns that were only needed for the merge. The result is
# reassigned instead of mutated in place, and already-missing columns are
# ignored, so this cell can be re-run without raising.
readings_weather = (
    readings_weather
    .rename(columns={'Timestamp_x': 'Timestamp'})
    .drop(['Timestamp_y', 'WeatherIdx'], axis=1, errors='ignore')
)

In [198]:
# Print the column names, non-null counts and dtypes of the merged frame.
readings_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 739335 entries, 0 to 739334
Data columns (total 21 columns):
Id                 739335 non-null object
Timestamp          739335 non-null datetime64[ns]
NbBikes            739335 non-null uint16
NbDocks            739335 non-null uint16
NbEmptyDocks       739335 non-null uint16
NbUnusableDocks    739335 non-null uint16
DewPt              739335 non-null float32
Fog                739335 non-null bool
Hail               739335 non-null bool
Humidity           739335 non-null float32
Pressure           739335 non-null float32
Rain               739335 non-null bool
Snow               739335 non-null bool
Temp               739335 non-null float32
Thunder            739335 non-null bool
Tornado            739335 non-null bool
Visibility         739335 non-null float32
WindDirD           739335 non-null float32
WindDirE           739335 non-null object
WindSpeed          739335 non-null float32
DifferenceS        739335 non-null float64
dtypes: bool(6), datetime64[ns](1), float32(7), float64(1), object(2), uint16(4)

In [202]:
# Persist the merged dataset. The ``with`` block guarantees the file is
# flushed and closed once the dump completes, even if pickling raises.
with open("data/parsed/readings_weather_dataset_final.p", "wb") as output_file:
    pickle.dump(readings_weather, output_file)