In [6]:
import os; os.sys.path.append(os.path.dirname(os.path.abspath('.'))) # for relative imports
from utils.nab_data import NABData
import numpy as np
import pandas as pd

In [54]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict, OrderedDict
from natural.number import ordinal
from natural.date import compress
from dateutil.parser import parse

class NABData(object):
    data_dir = os.path.join(os.path.dirname(os.path.abspath('.')), 'data')
    
    def __init__(self):
        self._load()
    
    def _dir_iter(self):
        for root, dirs, files in os.walk(self.data_dir):
            for filename in filter(lambda x: x.find('csv') != -1, files):
                yield root.split('/')[-1], filename
                
    @staticmethod
    def _format_timeseries(df):
        df['timestamp'] = df['timestamp'].map(parse)
        return df.set_index('timestamp').sort_index()
    
    @staticmethod
    def _filename2key(filename):
        return filename[:filename.find('.csv')]
                
    def _load(self):
        self.files = defaultdict(dict)
        cache_miss = False
        hdf_path = self._hdf_path()
        for folder, filename in self._dir_iter():
            try:
                self.files[folder][filename] = pd.read_hdf(hdf_path, os.path.join(folder, self._filename2key(filename)))
            except (KeyError, IOError):
                self.files[folder][filename] = self._format_timeseries(pd.read_csv(os.path.join(self.data_dir, folder, filename)))
                cache_miss = True
        if cache_miss: 
            self._save_hdf()
            
    def _hdf_path(self):
        return os.path.join(self.data_dir, 'cache.h5')
    
    def _save_hdf(self):
        hdf_path = self._hdf_path()
        for category, filename, df in self._files_iter():
            key = os.path.join(category, self._filename2key(filename))
            df.to_hdf(hdf_path, key)
            
    def __getitem__(self, items):
        return self.files[items]
    
    def _files_iter(self):
        for cat_key in self.files.keys():
            for filename in self.files[cat_key].keys():
                yield (cat_key, filename, self.files[cat_key][filename])
    
    def summary(self):
        data = []
        cols = ['category', 'file', 'length', 'features', 'period', 'periods_vary', 'min', 
                'max', 'mean', 'std', '25th_percentile', '50th_percentile', '75th_percentile']
        for category, filename, df in self._files_iter():
            row = dict([('category', category), ('file', filename)])
            row['length'] = len(df)
            row['features'] = len(df.columns)
            row['period'] = compress(df.index[1]-df.index[0])
            row['periods_vary'] = int(np.unique(np.diff(df.index)).shape == (1,))
            row['min'] = df['value'].min()
            row['max'] = df['value'].max()
            row['mean'] = df['value'].mean()
            row['std'] = df['value'].std()
            for q in [25, 50, 75]:
                row['{}_percentile'.format(ordinal(q))] = np.percentile(df['value'], q)
            data.append(row)
        return self._round_float_cols(pd.DataFrame(data)[cols])
    
    @staticmethod
    def _round_float_cols(df, digits=2):
        for col in df.columns: 
            if df[col].dtype in [np.float64, np.float32]:
                df[col] = np.round(df[col], digits)
        return df
        
            
            
            

In [11]:
df = pd.read_csv('/home/jstrong/src/NAB/data/realKnownCause/nyc_taxi.csv')
df.head()

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [45]:
#data = NABData()
np.unique(np.diff(data['realKnownCause']['nyc_taxi.csv'].index)).shape

realKnownCause


(1,)

In [55]:
data = NABData()
data.summary()



Unnamed: 0,category,file,length,features,period,periods_vary,min,max,mean,std,25th_percentile,50th_percentile,75th_percentile
0,realTweets,Twitter_volume_IBM.csv,15893,1,5m,1,0.0,139.0,4.39,5.5,1.0,3.0,6.0
1,realTweets,Twitter_volume_GOOG.csv,15842,1,5m,1,0.0,465.0,20.74,18.56,11.0,16.0,26.0
2,realTweets,Twitter_volume_FB.csv,15833,1,5m,1,0.0,1258.0,17.81,19.74,9.0,14.0,22.0
3,realTweets,Twitter_volume_CRM.csv,15902,1,5m,1,0.0,209.0,3.35,4.61,1.0,2.0,5.0
4,realTweets,Twitter_volume_KO.csv,15851,1,5m,1,0.0,2241.0,11.4,24.8,5.0,8.0,13.0
5,realTweets,Twitter_volume_AMZN.csv,15831,1,5m,1,0.0,1673.0,53.3,30.55,36.0,50.0,65.0
6,realTweets,Twitter_volume_UPS.csv,15866,1,5m,1,0.0,231.0,5.46,21.57,0.0,2.0,4.0
7,realTweets,Twitter_volume_AAPL.csv,15902,1,5m,1,0.0,13479.0,85.55,321.05,29.0,47.0,76.0
8,realTweets,Twitter_volume_CVS.csv,15853,1,5m,1,0.0,50.0,0.36,1.09,0.0,0.0,0.0
9,realTweets,Twitter_volume_PFE.csv,15858,1,5m,1,0.0,36.0,0.87,1.46,0.0,0.0,1.0


In [36]:
data.files.keys()

['realTweets',
 'realKnownCause',
 'artificialNoAnomaly',
 'realAWSCloudwatch',
 'realAdExchange',
 'artificialWithAnomaly',
 'realTraffic']

In [15]:
data = NABData()
data.load()

------ data
------------ README.md
-------- realTweets
-------------- Twitter_volume_IBM.csv
-------------- Twitter_volume_CVS.csv
-------------- Twitter_volume_AMZN.csv
-------------- Twitter_volume_UPS.csv
-------------- Twitter_volume_CRM.csv
-------------- Twitter_volume_FB.csv
-------------- Twitter_volume_AAPL.csv
-------------- Twitter_volume_GOOG.csv
-------------- Twitter_volume_PFE.csv
-------------- Twitter_volume_KO.csv
-------- artificialNoAnomaly
-------------- art_noisy.csv
-------------- art_daily_small_noise.csv
-------------- art_flatline.csv
-------------- art_daily_perfect_square_wave.csv
-------------- art_daily_no_noise.csv
-------- realAdExchange
-------------- exchange-4_cpm_results.csv
-------------- exchange-4_cpc_results.csv
-------------- exchange-3_cpc_results.csv
-------------- exchange-2_cpm_results.csv
-------------- exchange-3_cpm_results.csv
-------------- exchange-2_cpc_results.csv
-------- artificialWithAnomaly
-------------- art_daily_nojump.csv
---