In [7]:
import pandas as pd
import numpy as np

In [2]:
# Load the sample CSV into a DataFrame and display it.
df1 = pd.read_csv(filepath_or_buffer='example1.csv')
df1

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [3]:
# Serialization: persist the DataFrame to disk so it can be reloaded later.
# pandas' pickle writer produces a binary file (unlike the text-based CSV).
df1.to_pickle(path='example1pickle')

In [4]:
# Read the pickled DataFrame back in and display it to confirm the round trip.
# Note: only unpickle files you trust -- this one was written by this notebook.
df2 = pd.read_pickle('example1pickle')
df2

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# Time reading the same small frame from CSV versus from pickle.
%timeit df3 = pd.read_csv('example1.csv')
%timeit df4 = pd.read_pickle('example1pickle')

# On this 3-row example the two timings come out close to each other, so the
# comparison is not very informative; a larger dataset would presumably be
# needed to see a meaningful difference.

774 µs ± 9.78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
719 µs ± 7.26 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
# HDF5 format
# Build a 100x10 frame of standard-normal samples to use as the payload for
# the HDF5 examples. The generator is seeded locally so the data (and every
# output derived from it) is identical on each Restart & Run All, without
# touching NumPy's global random state.
df5 = pd.DataFrame(np.random.RandomState(42).randn(1000).reshape(100, 10))
df5.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.048268,-0.967911,-1.343073,0.210339,-1.368793,-0.203314,-0.888114,0.572396,0.228675,-0.176464
1,-1.021363,2.717552,0.548198,0.743561,0.080105,-0.233037,0.681825,-0.277019,0.47836,0.430508
2,0.614111,0.181113,1.010557,-0.507362,-0.241034,-0.96066,1.554324,-0.32536,-1.148021,0.181351
3,1.165002,-1.317819,1.840129,1.026588,0.458006,-0.130291,-0.30604,-1.372339,-0.160801,1.161025
4,-1.427526,-0.667999,-2.173282,-1.476079,-0.3054,0.475612,1.327212,-1.241973,-0.585748,0.994934


In [14]:
# Open an HDF5-backed store at 'hdfexample.h5'; `store` is the handle used to
# read from and write to that file.
# NOTE: presumably the stored data lives in the file on disk rather than in
# memory -- confirm against the pandas HDFStore documentation.
store = pd.HDFStore('hdfexample.h5')

# Save the random frame in the store under the key 'obj1'.
store.put('obj1', df5)

In [16]:
# A single column of the frame can be stored under its own key as well.
store.put('obj1_col', df5[1])

In [18]:
# The same HDF file now holds more than one object ('obj1' and 'obj1_col');
# evaluating the store displays its summary.
store

<class 'pandas.io.pytables.HDFStore'>
File path: hdfexample.h5

In [19]:
# Writing with format='table' stores the frame using a table schema, which is
# what allows it to be queried afterwards.
store.put(key='obj2', value=df5, format='table')

In [24]:
# Query the table-format object for the rows whose index is between 10 and 15.
store.select(key='obj2', where=['index >= 10 and index <= 15'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
10,0.899943,-2.827862,1.262223,1.159096,0.266037,-1.24285,-0.282857,0.191495,0.333977,0.754115
11,2.079771,-0.487494,-0.114317,0.166797,0.587891,-0.924623,-0.211763,0.147954,-2.870981,1.039374
12,0.048,1.60839,0.514393,0.938207,0.118521,0.76814,1.90915,-1.493587,-0.19487,0.276667
13,0.255644,0.957506,1.096865,0.139253,-2.157379,0.787281,-0.934788,0.359257,1.181489,-0.568167
14,0.818727,-1.722839,0.073789,-0.218033,1.084804,-2.409867,0.493685,1.30499,-1.60722,-0.44504
15,0.381446,-0.956655,0.089791,0.150048,1.17203,-0.708701,1.845058,-0.518008,-0.193112,-0.29131


In [26]:
# Replace the existing column labels with the letters 'a' through 'j'.
df5.columns = list('abcdefghij')

df5.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1.048268,-0.967911,-1.343073,0.210339,-1.368793,-0.203314,-0.888114,0.572396,0.228675,-0.176464
1,-1.021363,2.717552,0.548198,0.743561,0.080105,-0.233037,0.681825,-0.277019,0.47836,0.430508
2,0.614111,0.181113,1.010557,-0.507362,-0.241034,-0.96066,1.554324,-0.32536,-1.148021,0.181351
3,1.165002,-1.317819,1.840129,1.026588,0.458006,-0.130291,-0.30604,-1.372339,-0.160801,1.161025
4,-1.427526,-0.667999,-2.173282,-1.476079,-0.3054,0.475612,1.327212,-1.241973,-0.585748,0.994934


In [28]:
# Store the relabelled frame as another queryable table, under the key 'obj3'.
store.put(key='obj3', value=df5, format='table')

In [34]:
# `select_column` can only extract the index or columns that were declared as
# data columns when the table was written. 'obj3' was stored without
# `data_columns`, so asking `select_column` for 'a' raises instead of
# returning the column. Go through the regular table query instead, which can
# project any stored column, then pull the Series out of the result.
store.select('obj3', columns=['a'])['a']

In [35]:
# Finished with the HDF store: close the handle.
store.close()

In [36]:
# Good ol' Excel: wrap the workbook in an ExcelFile object so its sheets can
# be read from it.
excel_file = pd.ExcelFile('excel1.xlsx')

In [37]:
# Then just parse the sheet you're looking for.
pd.read_excel(excel_file, sheet_name='Sheet1')

Unnamed: 0,'a','b','c'
0,'1','2','3'
1,'5','6','7'
2,'9','10','11'


In [40]:
# Going the other direction (DataFrame -> Excel) requires a writer object
# bound to the destination workbook.

excel_writer = pd.ExcelWriter(path='excel2.xlsx')

In [42]:
# Write the frame to the sheet 'Sheet2'. Using the writer as a context manager
# closes it (which saves the workbook) even if `to_excel` raises, so the
# handle is never left open on failure.
with excel_writer:
    df5.to_excel(excel_writer, sheet_name='Sheet2')