# Merge Datasets

## Set Up

In [13]:
%matplotlib inline

import logging
import itertools
import json
import os
import pickle
import folium
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from src.data.parse_dataset import parse_dir, parse_json_files, get_file_list
from IPython.display import Image
from datetime import date

# Grab the root logger and set its level to INFO for the rest of the notebook.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [14]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced by a trusted step of this project.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Load the three previously parsed datasets from disk.
stations_dataset = _load_pickle('data/parsed/stations_dataset_final.p')
readings_dataset = _load_pickle('data/parsed/readings_dataset_final.p')
weather_dataset = _load_pickle('data/parsed/weather_dataset_final.p')

In [15]:
# Report the shape of each dataset. The parenthesised single-argument form
# prints the same tuple on Python 2 and is also valid syntax on Python 3.
print(readings_dataset.shape)
print(stations_dataset.shape)
print(weather_dataset.shape)

(1483149, 6)
(779, 13)
(3008, 16)


## Merge Readings and Weather

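For each reading, use binary search over the weather timestamps to find the weather observation whose timestamp is closest to the reading's timestamp. Binary search requires the weather timestamps to be sorted in ascending order.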

In [16]:
def binarySearch(data, val):
    """Return the position of the element of ``data`` that is closest to ``val``.

    Parameters
    ----------
    data : pandas.Series
        Values sorted in ascending order. Elements are read positionally
        through ``data.iat``.
    val : scalar
        Value to look for. It must support ``<``, ``>`` and subtraction
        against the elements of ``data`` (e.g. numbers or timestamps).

    Returns
    -------
    int
        Zero-based position (not index label) of the closest element among
        those visited by the search. An exact match is returned immediately.
        When two candidates are equally close, the one seen first is kept.
        For empty ``data`` the loop never runs and 0 is returned, which is
        not a valid position, so callers should not pass empty data.
    """
    lo, hi = 0, len(data) - 1
    best_ind = lo
    while lo <= hi:
        # Floor division keeps ``mid`` an integer on Python 3 as well as on
        # Python 2; true division would hand a float to ``.iat``.
        mid = lo + (hi - lo) // 2
        candidate = data.iat[mid]
        if candidate < val:
            lo = mid + 1
        elif candidate > val:
            hi = mid - 1
        else:
            # Exact match: nothing can be closer.
            return mid
        # Remember the probe if it is strictly closer to val than the best so far.
        if abs(candidate - val) < abs(data.iat[best_ind] - val):
            best_ind = mid
    return best_ind

In [17]:
# Look the weather timestamp column up once instead of twice per reading.
# binarySearch assumes these timestamps are sorted in ascending order.
weather_timestamps = weather_dataset['Timestamp']

# For every reading, store the index label of the closest weather observation.
readings_dataset['WeatherIdx'] = readings_dataset['Timestamp'].apply(
    lambda val: weather_timestamps.index[binarySearch(weather_timestamps, val)]
)

In [18]:
# Attach the matched weather row to every reading. Both frames have a
# 'Timestamp' column, so the merge exposes them as Timestamp_x (reading)
# and Timestamp_y (weather).
readings_weather = pd.merge(readings_dataset, weather_dataset, right_index=True, left_on='WeatherIdx')

# Absolute gap, in seconds, between each reading and its matched weather
# observation. Uses the already-imported numpy (``pd.np`` is a deprecated
# alias) and a vectorised ``abs`` instead of a per-row ``math.fabs``.
readings_weather['DifferenceS'] = (
    (readings_weather['Timestamp_x'] - readings_weather['Timestamp_y']) / np.timedelta64(1, 's')
).abs()

In [19]:
# Narrow view used to inspect how far each reading is from its matched
# weather timestamp.
view_columns = ['Timestamp_x', 'Timestamp_y', 'DifferenceS']
readings_weather_view = readings_weather[view_columns]

In [30]:
# Show only the ten readings furthest from their matched weather observation
# instead of rendering the whole sorted frame.
readings_weather_view.sort_values(by=['DifferenceS'], ascending=False).head(10)

Unnamed: 0,Timestamp_x,Timestamp_y,DifferenceS
1483148,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483135,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483123,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483124,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483125,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483126,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483127,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483128,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483129,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023
1483130,2016-06-26 23:56:49.023,2016-06-26 22:50:00,4009.023


In [21]:
# Summary statistics for the view; in the recorded output only the
# DifferenceS column (gap in seconds) is summarised.
readings_weather_view.describe()

Unnamed: 0,DifferenceS
count,1483149.0
mean,350.9571
std,237.9013
min,0.013
25%,149.393
50%,301.337
75%,502.953
max,4009.023


In [22]:
# Keep the reading's timestamp as the canonical 'Timestamp' column and discard
# the helper columns that were only needed for the merge.
readings_weather = (
    readings_weather
    .rename(columns={'Timestamp_x': 'Timestamp'})
    .drop(['Timestamp_y', 'WeatherIdx'], axis=1)
)

In [23]:
# Print the column names, non-null counts and dtypes of the merged frame.
readings_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1483149 entries, 0 to 1483148
Data columns (total 22 columns):
Id                 1483149 non-null object
Timestamp          1483149 non-null datetime64[ns]
NbBikes            1483149 non-null uint16
NbDocks            1483149 non-null uint16
NbEmptyDocks       1483149 non-null uint16
NbUnusableDocks    1483149 non-null uint16
Condition          1439017 non-null object
DewPt              1483149 non-null float32
Fog                1483149 non-null bool
Hail               1483149 non-null bool
Humidity           1483149 non-null float32
Pressure           1483149 non-null float32
Rain               1483149 non-null bool
Snow               1483149 non-null bool
Temp               1483149 non-null float32
Thunder            1483149 non-null bool
Tornado            1483149 non-null bool
Visibility         1483149 non-null float32
WindDirD           1483149 non-null float32
WindDirE           1483149 non-null object
WindSpeed          148314

In [24]:
# Persist the merged dataset. The context manager closes the file, so the
# pickle is flushed to disk even if dumping raises part-way through.
with open("data/parsed/readings_weather_dataset_final.p", "wb") as output_file:
    pickle.dump(readings_weather, output_file)