# data conversions

In [1]:
pd.set_option('max_rows',12)
pd.set_option('max_seq_items',4)

# dates

In [2]:
idx = pd.date_range('20130101',freq='h',periods=10000)
idx

DatetimeIndex(['2013-01-01 00:00:00', '2013-01-01 01:00:00', 
               ...
               '2014-02-21 14:00:00', '2014-02-21 15:00:00'], dtype='datetime64[ns]', length=10000, freq='H', tz=None)

In [3]:
strings = idx.format(date_format='%Y%m')
strings

['201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',
 '201301',

In [4]:
def slow_format(strings):
    return [ pd.to_datetime(s,format='%Y%m') for s in strings ]
def reg_format(strings):
    return pd.to_datetime(strings,format='%Y%m')
def fast_format(strings):
    return pd.to_datetime(np.array(strings).astype(int)*100+1,
                          format='%Y%m%d')

In [5]:
results = [slow_format(strings), reg_format(strings), fast_format(strings)]

from itertools import imap, permutations
np.array(list(imap(lambda x: x[0] == x[1], permutations(results, 2)))).all()

True

In [6]:
%timeit slow_format(strings)

1 loops, best of 3: 511 ms per loop


In [7]:
%timeit reg_format(strings)

10 loops, best of 3: 26.9 ms per loop


In [8]:
%timeit fast_format(strings)

100 loops, best of 3: 9.8 ms per loop


# csv and dates

In [3]:
fn = 'data/csv_and_dates.csv'
N = 1e4
df = DataFrame({'A' : np.random.randn(N)}, 
               index=pd.date_range('20130101',freq='s',periods=N))
df.index = df.index.format(date_format='%Y/%b/%d %H:%M:%S')
df.index.name ='dates'
df.to_csv(fn)

In [4]:
!head 'data/csv_and_dates.csv'

dates,A
2013/Jan/01 00:00:00,0.468215295057
2013/Jan/01 00:00:01,-0.0513482047399
2013/Jan/01 00:00:02,0.17859298163
2013/Jan/01 00:00:03,0.664731485816
2013/Jan/01 00:00:04,-0.658816012947
2013/Jan/01 00:00:05,1.23941053276
2013/Jan/01 00:00:06,0.297235802582
2013/Jan/01 00:00:07,-0.480389057802
2013/Jan/01 00:00:08,1.69436861954


In [5]:
def read_dp():
    dp = lambda s: datetime.datetime.strptime(s,'%Y/%b/%d %H:%M:%S')
    return pd.read_csv(fn,
                       date_parser=dp,
                       index_col='dates')
def read_reg():
    return pd.read_csv(fn,parse_dates=['dates'],
                       index_col='dates')
def read_infer():
    return pd.read_csv(fn,parse_dates=['dates'],
                       infer_datetime_format=True,
                       index_col='dates')
def read_post_convert():
    df = pd.read_csv(fn,
                     index_col='dates')
    df.index = pd.to_datetime(df.index,
                              format='%Y/%b/%d %H:%M:%S')
    return df

In [21]:
results = [read_dp(),read_reg(),read_infer(),read_post_convert()]

from itertools import imap, permutations
np.array(list(imap(lambda x: x[0].equals(x[1]), permutations(results, 2)))).all()

True

In [22]:
%timeit read_dp()

10 loops, best of 3: 160 ms per loop


In [23]:
%timeit read_reg()

1 loops, best of 3: 962 ms per loop


In [24]:
%timeit read_infer()

10 loops, best of 3: 86.3 ms per loop


In [25]:
%timeit read_post_convert()

10 loops, best of 3: 80.9 ms per loop
