# Importing Data

In [None]:
import numpy as np
import pandas as pd
# Keep rendered frames compact: show at most 6 rows and 8 columns.
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 8)
# Last expression of the cell, so the installed pandas version is displayed.
pd.__version__

We often have input data in a variety of formats.

- CSV
- Excel
- SQL
- JSON
- HDF5
- pickle
- msgpack
- Stata
- BigQuery

This is a subset of the data from beeradvocate.com, via [Stanford](https://snap.stanford.edu/data/web-RateBeer.html). It's strangely formatted.

This dataset is no longer available!

<p style="font-size:20px; font-family:Courier">
beer/name: Sausa Weizen<br>
beer/beerId: 47986<br>
beer/brewerId: 10325<br>
beer/ABV: 5.00<br>
beer/style: Hefeweizen<br>
review/appearance: 2.5<br>
review/aroma: 2<br>
review/time: 1234817823<br>
review/profileName: stcules<br>
review/text: A lot of foam. But a lot.	In the smell some banana, and then lactic and tart. Not a good start.	Quite dark orange in color, with a lively carbonation (now visible, under the foam).	Again tending to lactic sourness.	Same for the taste. With some yeast and banana.<br>
<br>
beer/name: Red Moon<br>
beer/beerId: 48213<br>
beer/brewerId: 10325<br>
beer/ABV: 6.20<br>
 ...<br>
</p>


# CSV

http://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files

In [None]:
# Load the beer reviews: the first column becomes the index and `time` is parsed as dates.
df = pd.read_csv(
    'data/beer2.csv.gz',
    encoding='utf-8',
    parse_dates=['time'],
    index_col=0,
)

In [None]:
# Show the loaded frame (the display options set in the first cell cap it at 6 rows and 8 columns).
df

In [None]:
# Print a summary of the frame's columns, dtypes and non-null counts.
df.info()

In [None]:
# Look up one beer name by index label, as an example of the unicode (non-ASCII) text in this dataset.
df.loc[50,'beer_name']

In [None]:
# Write an uncompressed UTF-8 CSV copy without the index column; the Timings section reads this file back.
df.to_csv('data/beer.csv', index=False, encoding='utf-8')

# Excel

http://pandas.pydata.org/pandas-docs/stable/io.html#excel-files

In [None]:
# Write the frame to an Excel workbook without the index column.
# No `encoding` argument: recent pandas no longer accepts it in `to_excel`.
# NOTE(review): writing the legacy `.xls` format depends on the `xlwt` writer, which newer pandas
# releases appear to have dropped. Confirm the target pandas version before relying on this path.
df.to_excel('data/beer.xls', index=False)

In [None]:
# Read the first sheet of the workbook back as a single DataFrame.
# `sheet_name` is the keyword `read_excel` recognises for choosing a sheet; passing the integer 0
# (rather than a list) returns a DataFrame instead of a dict of DataFrames.
data = pd.read_excel('data/beer.xls', sheet_name=0)

# SQL

http://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries

In [None]:
from sqlalchemy import create_engine
# Delete any database file left over from an earlier run (shell command; `-f` ignores a missing file).
!rm -f data/beer.sqlite
# SQLAlchemy engine pointing at a SQLite file in the data directory.
engine = create_engine('sqlite:///data/beer.sqlite')

In [None]:
# Write the frame to a `beer` table. `if_exists='replace'` keeps this cell re-runnable:
# with the default ('fail') a second execution raises because the table already exists.
df.to_sql('beer', engine, if_exists='replace')

In [None]:
# Passing a bare table name (not a query) reads the `beer` table back through the engine.
data = pd.read_sql('beer', engine)

# JSON

http://pandas.pydata.org/pandas-docs/stable/io.html#json

In [None]:
# Serialize the frame to a JSON file using pandas' default orientation.
df.to_json('data/beer.json')

In [None]:
# Load the JSON file written in the previous cell back into a DataFrame.
data = pd.read_json('data/beer.json')

# HDF

http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables

In [None]:
# Store the frame under key 'df' using the HDF5 "fixed" format, opening the file in write mode.
df.to_hdf(
    'data/beer_mixed.hdf',
    key='df',
    mode='w',
    format='fixed',
    encoding='utf-8',
)

In [None]:
# Rows whose review text is missing
df.loc[df['text'].isnull()]

In [None]:
# Load the object stored under key 'df' from the HDF5 file written above.
data = pd.read_hdf('data/beer_mixed.hdf', key='df', encoding='utf-8')

In [None]:
# Summary statistics of review-text length: the strings vary wildly in size.
df['text'].str.len().describe()

# Timings

In [None]:
%timeit pd.read_excel('data/beer.xls', sheetnames=[0])

In [None]:
timeit pd.read_sql('beer', engine)

In [None]:
# Time parsing the JSON file written in the JSON section.
%timeit pd.read_json('data/beer.json')

In [None]:
# Time parsing the uncompressed CSV written in the CSV section, including date parsing of `time`.
%timeit pd.read_csv('data/beer.csv', parse_dates=['time'])

In [None]:
%timeit pd.read_hdf('data/beer_mixed.hdf','df')

In [None]:
# Write pickle and msgpack copies of the frame; the following cells time reading them back.
# NOTE(review): `to_msgpack` appears to have been removed from pandas in 1.0 — confirm the
# pandas version this notebook targets.
df.to_pickle('data/beer.pkl')
df.to_msgpack('data/beer.msgpack',encoding='utf-8')

In [None]:
# Time loading the pickle file written in the previous cell.
%timeit pd.read_pickle('data/beer.pkl')

In [None]:
# Time loading the msgpack file.
# NOTE(review): `read_msgpack` appears to have been removed from pandas in 1.0 — confirm the
# pandas version this notebook targets.
%timeit pd.read_msgpack('data/beer.msgpack', encoding='utf-8')

# Storing Text vs Data
http://matthewrocklin.com/blog/work/2015/03/16/Fast-Serialization/

# Operating on Large Data

In [None]:
# With `chunksize` set, `read_csv` yields the file in pieces of up to 10000 rows
# rather than returning one frame.
chunks = pd.read_csv(
    'data/beer2.csv.gz',
    chunksize=10000,
    parse_dates=['time'],
    index_col=0,
)
# Report each chunk's position and row count.
for i, chunk in enumerate(chunks):
    print("%d -> %d" % (i, len(chunk)))

# Using Odo
http://odo.readthedocs.org/

# Questions

- which formats provide good fidelity
  - hdf5, pickle, msgpack
  
- which formats can you query
  - hdf5, sql
  
- which formats can you iterate
  - csv, hdf5, sql
  
- which formats provide better interoperability
  - csv, json, excel
  
- which formats can you transmit over the wire
  - json, msgpack
  
- which formats have better compression
  - hdf5, pickle, msgpack
  
- which formats allow multiple datasets in the same file
  - hdf5, msgpack