# Examining the Data

In [11]:
import numpy as np
import pandas as pd
# Keep rendered frames compact: show at most 8 rows and 8 columns.
pd.set_option('display.max_rows', 8)
pd.set_option('display.max_columns', 8)

In [12]:
# Load the beer reviews stored under key 'df' in the HDF5 file.
# drop=True renumbers the rows 0..n-1 without copying the old index into a
# new 'index' column; the saved outputs (13 columns, starting with `abv`)
# show no such column.
df = pd.read_hdf('data/beer.hdf', 'df').reset_index(drop=True)
df

Unnamed: 0,abv,beer_id,brewer_id,beer_name,...,profile_name,review_taste,text,time
0,7.0,2511,287,Bell's Cherry Stout,...,blaheath,4.5,Batch 8144\tPitch black in color with a 1/2 f...,2009-10-05 21:31:48
1,5.7,19736,9790,Duck-Rabbit Porter,...,GJ40,4.0,Sampled from a 12oz bottle in a standard pint...,2009-10-05 21:32:09
2,4.8,11098,3182,Fürstenberg Premium Pilsener,...,biegaman,3.5,Haystack yellow with an energetic group of bu...,2009-10-05 21:32:13
3,9.5,28577,3818,Unearthly (Imperial India Pale Ale),...,nick76,4.0,"The aroma has pine, wood, citrus, caramel, an...",2009-10-05 21:32:37
...,...,...,...,...,...,...,...,...,...
99996,7.5,15881,694,Tröegs Nugget Nectar,...,MisterDeeds,4.0,Pouring from a 12 oz bottle into a perfect pi...,2010-03-07 01:30:35
99997,6.7,5722,30,New World Porter,...,parris,3.5,"Poured into a pint glass. Color is dark, dark...",2010-03-07 01:32:46
99998,7.0,829,266,Jamaica Sunset India Pale Ale,...,northyorksammy,4.0,I think this IPA somewhat undervalued. It had...,2010-03-07 01:33:29
99999,6.5,28494,590,Unplugged Cherry Stout,...,mothman,4.5,Poured into darkness snifter.\t\tPours hardly...,2010-03-07 01:34:05


In [10]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 13 columns):
abv                  96949 non-null float64
beer_id              100000 non-null int64
brewer_id            100000 non-null int64
beer_name            100000 non-null object
beer_style           100000 non-null object
review_appearance    100000 non-null float64
review_aroma         100000 non-null float64
review_overall       100000 non-null float64
review_palate        100000 non-null float64
profile_name         100000 non-null object
review_taste         100000 non-null float64
text                 99972 non-null object
time                 100000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(6), int64(2), object(4)
memory usage: 10.7+ MB


# Boolean indexing

Boolean indexing works like a `WHERE` clause in SQL. The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed.

In [5]:
# Compare every `abv` value against 5; the result is the kind of boolean
# mask used for indexing in the cells that follow.
# NOTE(review): the saved AttributeError output looks stale (execution
# count 5 is lower than the load cell's 12) — re-run on a fresh kernel to confirm.
df.abv < 5

AttributeError: 'DataFrame' object has no attribute 'abv'

In [None]:
# Pass the mask straight to `[]` to keep only the rows where abv < 5;
# head() then shows the first few of them.
df[df.abv < 5].head()

Notice that we just used `[]` there. We can pass the boolean indexer in to `.loc` as well.

In [None]:
# With .loc the rows come from the boolean mask and the columns from the label list.
df.loc[df.abv < 5, ['beer_style', 'review_overall']]

Again, you can get complicated: combine masks with `&` (and) and `|` (or), wrapping each comparison in parentheses.

In [None]:
# Keep a review when it is either (abv below 5 AND written after 2009-06)
# or has an overall rating of at least 4.5.
# Each comparison is parenthesised because & and | bind tighter than < and >=.
df[
    ((df.abv < 5) & (df.time > pd.Timestamp('2009-06')))
    | (df.review_overall >= 4.5)
]

# Creating a boolean indexer with the contents of a column

Select just the rows where the `beer_style` contains IPA. 

In [None]:
# .str.contains checks each beer_style string for 'IPA' (the pattern is
# interpreted as a regular expression by default) and returns a boolean mask.
df[df.beer_style.str.contains('IPA')]

This is quite powerful. Any method that returns a boolean array is potentially an indexer.

# isin

Useful for seeing if a value is contained in a collection.

In [None]:
# Count reviews per beer. value_counts() sorts the most-reviewed beer first,
# and the index of the result holds the beer ids.
beer_ids = df['beer_id'].value_counts()
beer_ids

In [None]:
# Keep only the reviews of the three most-reviewed beers: take the first
# three ids from the (count-sorted) index and test membership with isin.
df[df.beer_id.isin(beer_ids.index[:3])]

# Positional Indexing

In [None]:
# First five rows by position.
df.iloc[:5]

In [None]:
# A list of integer positions selects exactly those rows (all columns).
df.iloc[[2,5,10]]

In [None]:
# Rows at positions 2, 5 and 10; columns at positions 0-2 (the slice end is exclusive).
df.iloc[[2,5,10],0:3]

In [None]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. Here the row keys
# are labels of the integer index and the column keys are names, so the
# label-based `.loc` is the direct replacement.
df.loc[[2, 5, 10], ['beer_id', 'time']]

# Hierarchical Indexing

One of the most powerful and most complicated features of pandas.
Lets you represent high-dimensional datasets in a table.

In [None]:
# Move reviewer, beer and timestamp into a three-level row index, then sort
# the index so label-based selection on its levels works.
reviews = (
    df.set_index(['profile_name', 'beer_id', 'time'])
      .sort_index()
)
reviews.head()

### Why is this useful?

In [None]:
# Count reviews per reviewer from the 'profile_name' index level and keep the
# five most active; value_counts() already orders by descending count.
top_reviewers = reviews.index.get_level_values('profile_name').value_counts().iloc[:5]
top_reviewers

In [None]:
# Select every review written by the top reviewers. The row key is a tuple
# with one entry per index level (slice(None) means "everything" on that
# level) and the column indexer is spelled out, so pandas cannot mistake
# the per-level entries for extra axes.
reviews.loc[(top_reviewers.index, slice(None), slice(None)), :]

### Specifying multi-axis indexers

In [None]:
# pd.IndexSlice lets a MultiIndex key be written with `:` instead of slice(None).
idx = pd.IndexSlice

In [None]:
# Row key, one entry per index level: the top reviewers, beer ids 92 and
# 54919, any time. The second indexer picks the two columns to show.
reviews.loc[idx[top_reviewers.index, [92,54919], :], ['beer_name','beer_style']]

In [None]:
# Filter on index levels with a query string; `@` refers to a Python
# variable from the surrounding scope. Then keep the two columns of interest.
(
    reviews
    .query('profile_name in @top_reviewers.index & beer_id in [92,54919]')
    .loc[:, ['beer_name', 'beer_style']]
)