# Examining the Data

In [None]:
import numpy as np
import pandas as pd
# Keep rendered frames compact: at most 6 rows, 8 columns, 100 characters wide.
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 8)
pd.set_option('display.width', 100)

In [None]:
# Read the object stored under the key 'df' in the HDF5 file, then display it.
df = pd.read_hdf('data/beer.hdf', key='df')
df

In [None]:
# Print a summary of the frame (index, columns, dtypes, memory usage).
df.info()

# Text Data & .str accessor

http://pandas.pydata.org/pandas-docs/stable/text.html

In [None]:
# The column the .str examples below operate on.
df['beer_style']

In [None]:
# Element-wise string length via the .str accessor.
df['beer_style'].str.len()

In [None]:
# Boolean mask: True where the style mentions "American" or "american".
# The pattern is a regular expression. Inside a character class `|` is a
# literal character, not alternation, so the class is written `[Aa]`
# (the former `[A|a]` would also have matched "|merican").
df.beer_style.str.contains('[Aa]merican')

# Datetime Data & .dt accessor

http://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-date-components

In [None]:
# The column the .dt examples below operate on.
df['time']

In [None]:
# Date component of each timestamp via the .dt accessor.
df['time'].dt.date

In [None]:
# Hour component of each timestamp via the .dt accessor.
df['time'].dt.hour

# Categoricals & .cat accessor

http://pandas.pydata.org/pandas-docs/stable/categorical.html

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.select_dtypes(include='object').describe()

In [None]:
# dtype / memory summary of the single-column frame, before conversion.
df.loc[:, ['beer_style']].info()

In [None]:
# Replace the column with its categorical (unordered, inferred categories) version.
df['beer_style'] = df.beer_style.astype(pd.CategoricalDtype())

In [None]:
# Same summary again, now that the column has been converted to category.
df.loc[:, ['beer_style']].info()

In [None]:
# Integer code of each row's category.
df['beer_style'].cat.codes

In [None]:
# The distinct category labels.
df['beer_style'].cat.categories

In [None]:
# Whether the categories carry an ordering.
df['beer_style'].cat.ordered

# Indexing

http://pandas.pydata.org/pandas-docs/stable/indexing.html

# Boolean indexing

Like a where clause in SQL. The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed.

In [None]:
# Boolean mask, one entry per row: is abv below 5?
df['abv'].lt(5)

In [None]:
# Keep only the rows where the mask is True.
df[df['abv'].lt(5)]

Notice that we just used `[]` there. We can pass the boolean indexer into `.loc` as well.

In [None]:
# Boolean mask for the rows, explicit label list for the columns.
df.loc[df['abv'].lt(5), ['beer_style', 'review_overall']]

Again, you can get complicated:

In [None]:
# (abv below 5 AND reviewed after 2009-06) OR overall rating of at least 4.5.
df[
    (df['abv'].lt(5) & df['time'].gt(pd.Timestamp('2009-06')))
    | df['review_overall'].ge(4.5)
]

# isin

Select just the rows where the `beer_style` contains IPA. 

In [None]:
# First attempt: ask the .cat accessor for a substring match.
# The .cat accessor has no `contains` method, so this raises AttributeError.
# The error is caught and printed so the failure is still shown but the
# notebook keeps running under Restart & Run All.
try:
    df[df.beer_style.cat.contains('IPA')]
except AttributeError as err:
    print('AttributeError: {}'.format(err))

In [None]:
# Cast the categorical back to object so the .str accessor can do the match.
df[df['beer_style'].astype(object).str.contains('IPA')]

### Is there another / better way?

In [None]:
# Match against the category labels themselves rather than every row:
# start from all labels, then keep those containing 'IPA'.
cats = df['beer_style'].cat.categories
cats = cats[cats.str.contains('IPA')]
cats

In [None]:
# Row mask: is the row's style one of the matching categories?
df['beer_style'].isin(cats)

In [None]:
# Use the membership mask as the row indexer.
df[df['beer_style'].isin(cats)]

This is quite powerful. Any method that returns a boolean array is potentially an indexer.

In [None]:
# Number of rows per beer_id; the result is indexed by the beer_id values.
beer_ids = df['beer_id'].value_counts()
beer_ids

In [None]:
# Rows belonging to the first three entries of `beer_ids`.
# `beer_ids` is labelled by beer_id values, so the slice is written with
# .iloc to make it explicitly positional instead of relying on how a bare
# `[]` slice is interpreted.
df[df.beer_id.isin(beer_ids.iloc[:3].index)]

# Positional Indexing

In [None]:
# First five rows (the default for head).
df.head()

In [None]:
# Rows at integer positions 2, 5 and 10; all columns.
df.iloc[[2, 5, 10], :]

In [None]:
# Same row positions, restricted to the first three columns by position.
df.iloc[[2, 5, 10], :3]

# Label-Based Indexing

In [None]:
# Rows whose index *labels* are 2, 5 and 10; columns selected by name.
df.loc[
    [2, 5, 10],
    ['beer_id', 'time'],
]

In [None]:
# Boolean row mask plus column labels in a single .loc call.
# `beer_ids` is labelled by beer_id values, so its first three entries are
# taken with .iloc to make the slice explicitly positional.
df.loc[df.beer_id.isin(beer_ids.iloc[:3].index), ['beer_id', 'time']]

# Questions
- why do we use ``.loc``
- why do we care about uniqueness
- why do we use ``.ix`` (note: ``.ix`` has been removed from current pandas; use ``.loc`` / ``.iloc`` instead)

In [None]:
# True when no index label is repeated.
not df.index.has_duplicates

In [None]:
# Would beer_id make a unique index?
df.set_index(keys='beer_id').index.is_unique

# Hierarchical Indexing

One of the most powerful and most complicated features of pandas.
Lets you represent high-dimensional datasets in a table.

In [None]:
# Build a three-level row index (profile_name, beer_id, time) and sort it.
reviews = df.set_index(keys=['profile_name', 'beer_id', 'time']).sort_index()
reviews.head(5)

# Why is this useful

In [None]:
# True when no (profile_name, beer_id, time) combination is repeated.
not reviews.index.has_duplicates

In [None]:
# Count rows per profile_name (taken from the index level) and keep the
# first five entries of the resulting counts.
top_reviewers = (
    reviews.index.get_level_values('profile_name').value_counts().iloc[:5]
)
top_reviewers

In [None]:
# Tighten the display further from here on: at most 4 rows per frame.
pd.options.display.max_rows = 4
# All rows for the top reviewers, every beer_id and every time.
# The three-level row key is wrapped in pd.IndexSlice and the column axis is
# given explicitly, so the slices cannot be mistaken for column indexers.
reviews.loc[pd.IndexSlice[top_reviewers.index, :, :], :]

In [None]:
# A list holding one complete (profile_name, beer_id, time) key returns a frame.
reviews.loc[[
    ('BeerFMAndy', 100, pd.Timestamp('2009-10-22 03:39:21')),
]]

### Specifying multi-axis indexers

In [None]:
# Short alias for pd.IndexSlice, used below to write per-level MultiIndex slicers.
idx = pd.IndexSlice

In [None]:
# Rows: top reviewers x beer_id in {92, 54919} x any time. Columns: by name.
reviews.loc[
    idx[top_reviewers.index, [92, 54919], :],
    ['beer_name', 'beer_style']
]

In [None]:
# The same selection expressed as a query over the index level names.
# `@top_reviewers.index` refers to the Python variable. The expression is
# built from adjacent string literals, not a backslash continuation, so the
# source indentation does not end up inside the query text.
(reviews
        .query('profile_name in @top_reviewers.index and '
               'beer_id in [92, 54919]')
        [['beer_name', 'beer_style']]
 )

In [None]:
# Any reviewer, beer_id in {92, 54919}, any time; two named columns.
reviews.loc[
    idx[:, [92, 54919], :],
    ['beer_name', 'beer_style']
]