# Chapter 5: Boolean Indexing
## Recipes
* [Calculating boolean statistics](#Calculating-boolean-statistics)
* [Constructing multiple boolean conditions](#Constructing-multiple-boolean-conditions)
* [Filtering with boolean indexing](#Filtering-with-boolean-indexing)
* [Replicating boolean indexing with index selection](#Replicating-boolean-indexing-with-index-selection)
* [Selecting with unique and sorted indexes](#Selecting-with-unique-and-sorted-indexes)
* [Gaining perspective on stock prices](#Gaining-perspective-on-stock-prices)
* [Translating SQL WHERE clauses](#Translating-SQL-WHERE-clauses)
* [Determining the normality of stock market returns](#Determining-the-normality-of-stock-market-returns)
* [Improving readability of boolean indexing with the query method](#Improving-readability-of-boolean-indexing-with-the-query-method)
* [Preserving Series with the where method](#Preserving-Series-with-the-where-method)
* [Masking DataFrame rows](#Masking-DataFrame-rows)
* [Selecting with booleans, integer location and labels](#Selecting-with-booleans,-integer-location-and-labels)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Calculating boolean statistics

In [None]:
pd.options.display.max_columns = 50

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
movie.head()

In [None]:
movie_2_hours = movie['duration'] > 120
movie_2_hours.head(10)

In [None]:
movie_2_hours.sum()

In [None]:
movie_2_hours.mean()

In [None]:
movie_2_hours.describe()

In [None]:
movie['duration'].dropna().gt(120).mean()

## How it works...

In [None]:
movie_2_hours.value_counts(normalize=True)

## There's more...

In [None]:
actors = movie[['actor_1_facebook_likes', 'actor_2_facebook_likes']].dropna()
(actors['actor_1_facebook_likes'] > actors['actor_2_facebook_likes']).mean()

# Constructing multiple boolean conditions

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
movie.head()

In [None]:
criteria1 = movie.imdb_score > 8
criteria2 = movie.content_rating == 'PG-13'
criteria3 = (movie.title_year < 2000) | (movie.title_year >= 2010)

criteria2.head()

In [None]:
criteria_final = criteria1 & criteria2 & criteria3
criteria_final.head()

## There's more...

In [None]:
# NOTE: `|` binds more tightly than `<` and `>`, so without parentheses this is
# parsed as `movie.title_year < (2000 | movie.title_year) > 2009`, not as two
# comparisons joined by OR. Presumably left unparenthesized on purpose to show
# the pitfall (it is expected to raise rather than return a mask) -- wrap each
# comparison in parentheses to get the intended filter.
movie.title_year < 2000 | movie.title_year > 2009

# Filtering with boolean indexing

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')

crit_a1 = movie.imdb_score > 8
crit_a2 = movie.content_rating == 'PG-13'
crit_a3 = (movie.title_year < 2000) | (movie.title_year > 2009)
final_crit_a = crit_a1 & crit_a2 & crit_a3

In [None]:
crit_b1 = movie.imdb_score < 5
crit_b2 = movie.content_rating == 'R'
crit_b3 = (movie.title_year >= 2000) & (movie.title_year <= 2010)
final_crit_b = crit_b1 & crit_b2 & crit_b3

In [None]:
final_crit_all = final_crit_a | final_crit_b
final_crit_all.head()

In [None]:
movie[final_crit_all].head()

In [None]:
cols = ['imdb_score', 'content_rating', 'title_year']
movie_filtered = movie.loc[final_crit_all, cols]
movie_filtered.head(10)

## There's more...

In [None]:
# Same three conditions as final_crit_a, written as one parenthesized
# expression instead of being assembled from intermediate variables.
final_crit_a2 = (
    (movie.imdb_score > 8)
    & (movie.content_rating == 'PG-13')
    & ((movie.title_year < 2000) | (movie.title_year > 2009))
)
# True when both constructions yield the same boolean Series.
final_crit_a2.equals(final_crit_a)

# Replicating boolean indexing with index selection

In [None]:
college = pd.read_csv('data/college.csv')
college[college['STABBR'] == 'TX'].head()

In [None]:
college2 = college.set_index('STABBR')
college2.loc['TX'].head()

In [None]:
%timeit college[college['STABBR'] == 'TX']

In [None]:
%timeit college2.loc['TX']

In [None]:
%timeit college2 = college.set_index('STABBR')

## There's more...

In [None]:
states =['TX', 'CA', 'NY']
college[college['STABBR'].isin(states)]
college2.loc[states].head()

# Selecting with unique and sorted indexes

In [None]:
college = pd.read_csv('data/college.csv')
college2 = college.set_index('STABBR')

In [None]:
# Report whether the STABBR index is already in non-decreasing order.
# `Index.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0;
# `is_monotonic_increasing` is the explicit equivalent available in all versions.
college2.index.is_monotonic_increasing

In [None]:
# Sort the rows by index label, then confirm the sorted index reports as
# monotonic (non-decreasing). `Index.is_monotonic` was removed in pandas 2.0,
# so the explicit `is_monotonic_increasing` attribute is used instead.
college3 = college2.sort_index()
college3.index.is_monotonic_increasing

In [None]:
%timeit college[college['STABBR'] == 'TX']

In [None]:
%timeit college2.loc['TX']

In [None]:
%timeit college3.loc['TX']

In [None]:
college_unique = college.set_index('INSTNM')
college_unique.index.is_unique

In [None]:
college[college['INSTNM'] == 'Stanford University']

In [None]:
college_unique.loc['Stanford University']

In [None]:
%timeit college[college['INSTNM'] == 'Stanford University']

In [None]:
%timeit college_unique.loc['Stanford University']

## There's more...

In [None]:
college.index = college['CITY'] + ', ' + college['STABBR']
college = college.sort_index()
college.head()

In [None]:
college.loc['Miami, FL'].head()

In [None]:
%%timeit 
crit1 = college['CITY'] == 'Miami' 
crit2 = college['STABBR'] == 'FL'
college[crit1 & crit2]

In [None]:
%timeit college.loc['Miami, FL']

In [None]:
college[(college['CITY'] == 'Miami') & (college['STABBR'] == 'FL')].equals(college.loc['Miami, FL'])

# Gaining perspective on stock prices

In [None]:
slb = pd.read_csv('data/slb_stock.csv', index_col='Date', parse_dates=['Date'])
slb.head()

In [None]:
slb_close = slb['Close']
slb_summary = slb_close.describe(percentiles=[.1, .9])
slb_summary

In [None]:
upper_10 = slb_summary.loc['90%']
lower_10 = slb_summary.loc['10%']
criteria = (slb_close < lower_10) | (slb_close > upper_10)
slb_top_bottom_10 = slb_close[criteria]

In [None]:
slb_close.plot(color='black', figsize=(12,6))
slb_top_bottom_10.plot(marker='o', style=' ', ms=4, color='lightgray')

xmin = criteria.index[0]
xmax = criteria.index[-1]
plt.hlines(y=[lower_10, upper_10], xmin=xmin, xmax=xmax,color='black')

## There's more...

In [None]:
# Draw the closing-price line and the lower_10 / upper_10 levels as
# horizontal guide lines spanning xmin..xmax.
slb_close.plot(color='black', figsize=(12,6))
plt.hlines(y=[lower_10, upper_10], 
           xmin=xmin, xmax=xmax,color='lightgray')
# First pass: fill the area between lower_10 and the closing price in black
# across the whole x range (no `where` restriction).
plt.fill_between(x=criteria.index, y1=lower_10,
                 y2=slb_close.values, color='black')
# Second pass: repaint in light gray only where the close is below lower_10.
plt.fill_between(x=criteria.index,y1=lower_10,
                 y2=slb_close.values, where=slb_close < lower_10,
                 color='lightgray')
# Third pass: light-gray fill between upper_10 and the close, only where the
# close is above upper_10. The fills are layered, so call order matters.
plt.fill_between(x=criteria.index, y1=upper_10, 
                 y2=slb_close.values, where=slb_close > upper_10,
                 color='lightgray')

# Translating SQL WHERE clauses

In [None]:
employee = pd.read_csv('data/employee.csv')

In [None]:
employee.DEPARTMENT.value_counts().head()

In [None]:
employee.GENDER.value_counts()

In [None]:
employee.BASE_SALARY.describe().astype(int)

In [None]:
# One boolean Series per condition: department membership, gender, and an
# inclusive salary range.
depts = [
    'Houston Police Department-HPD',
    'Houston Fire Department (HFD)',
]
criteria_dept = employee.DEPARTMENT.isin(depts)
criteria_gender = employee.GENDER == 'Female'
criteria_sal = (
    (employee.BASE_SALARY >= 80000)
    & (employee.BASE_SALARY <= 120000)
)

In [None]:
criteria_final = criteria_dept & criteria_gender & criteria_sal

In [None]:
select_columns = ['UNIQUE_ID', 'DEPARTMENT', 'GENDER', 'BASE_SALARY']
employee.loc[criteria_final, select_columns].head()

## There's more...

In [None]:
criteria_sal = employee.BASE_SALARY.between(80000, 120000)

In [None]:
top_5_depts = employee.DEPARTMENT.value_counts().index[:5]
criteria = ~employee.DEPARTMENT.isin(top_5_depts)
employee[criteria].head()

# Determining the normality of stock market returns

In [None]:
amzn = pd.read_csv('data/amzn_stock.csv', index_col='Date', parse_dates=['Date'])
amzn.head()

In [None]:
amzn_daily_return = amzn.Close.pct_change()
amzn_daily_return.head()

In [None]:
amzn_daily_return = amzn_daily_return.dropna()
amzn_daily_return.hist(bins=20)

In [None]:
mean = amzn_daily_return.mean()  
std = amzn_daily_return.std()

In [None]:
abs_z_score = amzn_daily_return.sub(mean).abs().div(std)

In [None]:
pcts = [abs_z_score.lt(i).mean() for i in range(1,4)]
print('{:.3f} fall within 1 standard deviation. '
      '{:.3f} within 2 and {:.3f} within 3'.format(*pcts))

In [None]:
def test_return_normality(stock_data):
    """Plot daily returns and print the share within 1, 2 and 3 standard deviations.

    Parameters
    ----------
    stock_data
        Object indexable by ``'Close'`` whose result supports ``pct_change``
        (presumably a DataFrame of stock prices -- confirm against callers).

    Returns
    -------
    None
        The function draws a 20-bin histogram of the daily returns and prints
        one summary line; nothing is returned.
    """
    returns = stock_data['Close'].pct_change().dropna()
    returns.hist(bins=20)

    center = returns.mean()
    spread = returns.std()

    # Distance of every return from the mean, measured in standard deviations.
    z_abs = returns.sub(center).abs().div(spread)

    # Fraction of observations strictly inside 1, 2 and 3 standard deviations.
    within = []
    for k in (1, 2, 3):
        within.append(z_abs.lt(k).mean())

    template = ('{:.3f} fall within 1 standard deviation. '
                '{:.3f} within 2 and {:.3f} within 3')
    print(template.format(*within))

In [None]:
slb = pd.read_csv('data/slb_stock.csv', 
                  index_col='Date', parse_dates=['Date'])
test_return_normality(slb)

# Improving readability of boolean indexing with the query method

In [None]:
employee = pd.read_csv('data/employee.csv')
depts = ['Houston Police Department-HPD', 'Houston Fire Department (HFD)']
select_columns = ['UNIQUE_ID', 'DEPARTMENT', 'GENDER', 'BASE_SALARY']

In [None]:
# The whole filter as a single query string; `@depts` pulls in the Python
# list of department names, and the salary bounds use a chained comparison.
qs = (
    "DEPARTMENT in @depts "
    "and GENDER == 'Female' "
    "and 80000 <= BASE_SALARY <= 120000"
)

emp_filtered = employee.query(qs)
emp_filtered[select_columns].head()

## There's more...

In [None]:
top10_depts = employee.DEPARTMENT.value_counts().index[:10].tolist()
qs = "DEPARTMENT not in @top10_depts and GENDER == 'Female'"
employee_filtered2 = employee.query(qs)
employee_filtered2[['DEPARTMENT', 'GENDER']].head()

# Preserving Series with the where method

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
fb_likes = movie['actor_1_facebook_likes'].dropna()
fb_likes.head()

In [None]:
fb_likes.describe(percentiles=[.1, .25, .5, .75, .9]).astype(int)

In [None]:
fb_likes.describe(percentiles=[.1,.25,.5,.75,.9])

In [None]:
fb_likes.hist()

In [None]:
criteria_high = fb_likes < 20000
criteria_high.mean().round(2)

In [None]:
fb_likes.where(criteria_high).head()

In [None]:
fb_likes.where(criteria_high, other=20000).head()

In [None]:
# Keep values where the mask is True; elsewhere substitute the bound, first
# for the upper cap (20000) and then for the lower floor (300).
criteria_low = fb_likes > 300
fb_likes_cap = (
    fb_likes
    .where(criteria_high, other=20000)
    .where(criteria_low, other=300)
)
fb_likes_cap.head()

In [None]:
len(fb_likes), len(fb_likes_cap)

In [None]:
fb_likes_cap.hist()

In [None]:
fb_likes_cap2 = fb_likes.clip(lower=300, upper=20000)
fb_likes_cap2.equals(fb_likes_cap)

# Masking DataFrame rows

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
c1 = movie['title_year'] >= 2010
c2 = movie['title_year'].isnull()
criteria = c1 | c2

In [None]:
movie.mask(criteria).head()

In [None]:
movie_mask = movie.mask(criteria).dropna(how='all')
movie_mask.head()

In [None]:
movie_boolean = movie[movie['title_year'] < 2010]
movie_boolean.head()

In [None]:
movie_mask.equals(movie_boolean)

In [None]:
movie_mask.shape == movie_boolean.shape

In [None]:
movie_mask.dtypes == movie_boolean.dtypes

In [None]:
from pandas.testing import assert_frame_equal
assert_frame_equal(movie_boolean, movie_mask, check_dtype=False)

In [None]:
%timeit movie.mask(criteria).dropna(how='all')

In [None]:
%timeit movie[movie['title_year'] < 2010]

# Selecting with booleans, integer location and labels

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
c1 = movie['content_rating'] == 'G'
c2 = movie['imdb_score'] < 4
criteria = c1 & c2

In [None]:
movie_loc = movie.loc[criteria]
movie_loc.head()

In [None]:
movie_loc.equals(movie[criteria])

In [None]:
# NOTE: `criteria` is a boolean Series. `.iloc` is purely positional and is
# expected to reject a boolean Series as a mask -- presumably shown here on
# purpose. Passing the underlying array (`criteria.values`) is the form that
# `.iloc` accepts.
movie_iloc = movie.iloc[criteria]

In [None]:
movie_iloc = movie.iloc[criteria.values]

In [None]:
movie_iloc.equals(movie_loc)

In [None]:
movie.loc[criteria.values]

In [None]:
criteria_col = movie.dtypes == np.int64
criteria_col.head()

In [None]:
movie.loc[:, criteria_col].head()

In [None]:
movie.iloc[:, criteria_col.values].head()

In [None]:
cols = ['content_rating', 'imdb_score', 'title_year', 'gross']
movie.loc[criteria, cols].sort_values('imdb_score')

In [None]:
col_index = [movie.columns.get_loc(col) for col in cols]
col_index

In [None]:
movie.iloc[criteria.values, col_index].sort_values('imdb_score')

## How it works...

In [None]:
a = criteria.values
a[:5]

In [None]:
len(a), len(criteria)

## There's more...

In [None]:
# NOTE(review): the row mask has 3 entries and the column mask has 4. Recent
# pandas releases appear to require a boolean indexer to match the length of
# the axis it selects from, so this likely raises unless `movie` is exactly
# 3 rows by 4 columns -- confirm against the pandas version in use.
movie.loc[[True, False, True], [True, False, False, True]]