# Pandas Cookbook

## 5장. 불리언 인덱싱 

### 소개

In [1]:
import numpy as np
import pandas as pd

### 불리언 통계량 계산

In [10]:
movie = pd.read_csv('/Users/chosikc/Sites/PandasCookbook/movie.csv', index_col = 'movie_title')
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [11]:
movie_2_hours = movie['duration'] > 120
movie_2_hours.head()

movie_title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
Name: duration, dtype: bool

In [12]:
movie_2_hours.sum()

1039

In [13]:
movie_2_hours.mean()

0.2113506916192026

In [14]:
# 누락된 값을 찾아서 삭제할 필요가 있다.

movie['duration'].dropna().gt(120).mean()

0.21199755152009794

In [15]:
movie_2_hours.describe()

count      4916
unique        2
top       False
freq       3877
Name: duration, dtype: object

In [16]:
movie_2_hours.value_counts(normalize=True)

False    0.788649
True     0.211351
Name: duration, dtype: float64

In [18]:
actors = movie[['actor_1_facebook_likes', 'actor_2_facebook_likes']].dropna()
actors

Unnamed: 0_level_0,actor_1_facebook_likes,actor_2_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Avatar,1000.0,936.0
Pirates of the Caribbean: At World's End,40000.0,5000.0
Spectre,11000.0,393.0
The Dark Knight Rises,27000.0,23000.0
Star Wars: Episode VII - The Force Awakens,131.0,12.0
John Carter,640.0,632.0
Spider-Man 3,24000.0,11000.0
Tangled,799.0,553.0
Avengers: Age of Ultron,26000.0,21000.0
Harry Potter and the Half-Blood Prince,25000.0,11000.0


In [19]:
(actors['actor_1_facebook_likes'] > actors['actor_2_facebook_likes']).mean()

0.9777687130328371

### 다중 불리언 조건 구축

- 파이썬에서 불리언 표현은 내장 논리 연산자, and, or, not 을 사용
- 키워드는 pandas의 불리언 인덱싱과 작동하지 않음.
- 각 &, |, ~ 로 대체, 각 표현은 괄호로 묶어야 함.

In [20]:
movie = pd.read_csv('/Users/chosikc/Sites/PandasCookbook/movie.csv', index_col = 'movie_title')

In [21]:
criterial1 = movie.imdb_score > 8
criterial2 = movie.content_rating == 'PG-13'
criterial3 = ((movie.title_year < 2000) | (movie.title_year > 2009))

In [22]:
criterial2.head()

movie_title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
Name: content_rating, dtype: bool

In [25]:
criteria_final = criterial1 & criterial2 & criterial3 
criteria_final.head()

movie_title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
dtype: bool

In [26]:
5 < 10 and 3 > 4

False

movie.title_year < 2000 | movie.title_year > 2009 <br>
type error

- 파이썬은 0이 아닌 모든 정수는 True 로 간주한다. 
- 빈 문자열을 제외한 모든 문자열은 True 로 간주한다.
- 비어 있지 않은 모든 집합, 튜플, 딕셔너리, 리스트는 True 다.
- 빈 DataFrame, Series 는 True, False 없이 Error 처리 한다. 
- 객체 자체의 참을 검색하려면 Bool 함수를 사용한다.

### 불리언 인덱싱을 사용한 필터링

In [29]:
movie = pd.read_csv('/Users/chosikc/Sites/PandasCookbook/movie.csv', index_col = 'movie_title')
crit_a1 = movie.imdb_score > 8
crit_a2 = movie.content_rating == 'PG-13'
crit_a3 = (movie.title_year < 2000) | (movie.title_year > 2009)
final_crit_a = crit_a1 & crit_a2 & crit_a3

In [31]:
crit_b1 = movie.imdb_score < 5
crit_b2 = movie.content_rating == 'R'
crit_b3 = ((movie.title_year >= 2000) | (movie.title_year <= 2010))
final_crit_b = crit_b1 & crit_b2 & crit_b3

In [32]:
final_crit_all = final_crit_a | final_crit_b
final_crit_all.head()

movie_title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
dtype: bool

In [33]:
movie[final_crit_all].head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
The Avengers,Color,Joss Whedon,703.0,173.0,0.0,19000.0,Robert Downey Jr.,26000.0,623279547.0,Action|Adventure|Sci-Fi,...,1722.0,English,USA,PG-13,220000000.0,2012.0,21000.0,8.1,1.85,123000
Captain America: Civil War,Color,Anthony Russo,516.0,147.0,94.0,11000.0,Scarlett Johansson,21000.0,407197282.0,Action|Adventure|Sci-Fi,...,1022.0,English,USA,PG-13,250000000.0,2016.0,19000.0,8.2,2.35,72000
The Lovers,Color,Roland Joffé,10.0,109.0,596.0,283.0,Alice Englert,622.0,,Action|Adventure|Romance|Sci-Fi,...,15.0,English,Belgium,R,,2015.0,525.0,4.5,,677
Guardians of the Galaxy,Color,James Gunn,653.0,121.0,571.0,3000.0,Vin Diesel,14000.0,333130696.0,Action|Adventure|Sci-Fi,...,1097.0,English,USA,PG-13,170000000.0,2014.0,14000.0,8.1,2.35,96000


In [34]:
cols = ['imdb_score','content_rating','title_year']
movie_filtered = movie.loc[final_crit_all, cols]
movie_filtered.head(10)

Unnamed: 0_level_0,imdb_score,content_rating,title_year
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Dark Knight Rises,8.5,PG-13,2012.0
The Avengers,8.1,PG-13,2012.0
Captain America: Civil War,8.2,PG-13,2016.0
The Lovers,4.5,R,2015.0
Guardians of the Galaxy,8.1,PG-13,2014.0
Interstellar,8.6,PG-13,2014.0
Inception,8.8,PG-13,2010.0
The Martian,8.1,PG-13,2015.0
Town & Country,4.4,R,2001.0
Sex and the City 2,4.3,R,2010.0


In [35]:
final_crit_a2 = (movie.imdb_score > 8) & (movie.content_rating == 'PG-13') &  ((movie.title_year < 2000) | (movie.title_year > 2009))
final_crit_a2.equals(final_crit_a)

True

- 불리언 인덱싱에 대한 pandas의 공식 문서 : http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing
- 파이썬 객체가 참인지 검사하기 : https://docs.python.org/3/library/stdtypes.html#truth

### 인덱스를 사용한 불리언 인덱싱의 복제

### 고유한 정렬된 인덱스를 사용한 선택

### 주가 전망

### SQL WHERE 절 해석

### 주식 시장 수익률의 정규성 검정

### query 메서드를 사용한 불리언 인덱싱의 가독성 개선

### where 메서드를 사용한 Series 보존

### DataFrame 행 마스크

### 불리언, 정수 위치, 레이블을 이용한 선택