# CHAPTER 02. DataFrame 필수 연산

## DataFrame에서 복수 열 선택


In [74]:
import pandas as pd
movie = pd.read_csv("../data/movie.csv")
movie[['director_name', "actor_1_name"]].head(3)


Unnamed: 0,director_name,actor_1_name
0,James Cameron,CCH Pounder
1,Gore Verbinski,Johnny Depp
2,Sam Mendes,Christoph Waltz


### python list 사용하여 코드 가독성 높이기


In [75]:
columns = ["director_name", "actor_1_name"]
movie[columns].head(3)


Unnamed: 0,director_name,actor_1_name
0,James Cameron,CCH Pounder
1,Gore Verbinski,Johnny Depp
2,Sam Mendes,Christoph Waltz


## 메서드를 사용한 열 선택


### select_dtypes()


In [76]:
movie.select_dtypes(include=['int']).head(3)

Unnamed: 0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000


### filter()

In [77]:
movie.filter(like="facebook").head(3)


Unnamed: 0,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,actor_2_facebook_likes,movie_facebook_likes
0,0.0,855.0,1000.0,4834,936.0,33000
1,563.0,1000.0,40000.0,48350,5000.0,0
2,0.0,161.0,11000.0,11700,393.0,85000


In [78]:
movie.filter(regex="^dir").head(3)


Unnamed: 0,director_name,director_facebook_likes
0,James Cameron,0.0
1,Gore Verbinski,563.0
2,Sam Mendes,0.0


In [79]:
movie.filter(items=["director_name", "bacasdf"]).head(3)
# 인덱싱 연산자와 다르게 없는 칼럼이 있어도 KeyError 안내고 그냥 없이 출력함



Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes


## 열 이름 일목요연하게 정렬하기

- 비슷한거 모아서 열 순서 정렬하기
- 중요한거 먼저 나타내기


## 전체 DataFrame에 대한 연산

- 각 열에 대해 연산


In [80]:
movie.count().head(3) # Series


color                     4897
director_name             4814
num_critic_for_reviews    4867
dtype: int64

In [81]:
movie.max().head(3)


num_critic_for_reviews       813
duration                     511
director_facebook_likes    23000
dtype: object

In [82]:
movie.describe().head(3)


Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.924939,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.162547,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458


## DataFrame 메서드 체인으로 묶기


In [83]:
movie.isnull().head(3) #DataFrame


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [84]:
movie.isnull().sum().head(3) # Series


color                      19
director_name             102
num_critic_for_reviews     49
dtype: int64

In [85]:
movie.isnull().sum().sum()

2654

In [86]:
movie.isnull().any() #Series


color                         True
director_name                 True
num_critic_for_reviews        True
duration                      True
director_facebook_likes       True
actor_3_facebook_likes        True
actor_2_name                  True
actor_1_facebook_likes        True
gross                         True
genres                       False
actor_1_name                  True
movie_title                  False
num_voted_users              False
cast_total_facebook_likes    False
actor_3_name                  True
facenumber_in_poster          True
plot_keywords                 True
movie_imdb_link              False
num_user_for_reviews          True
language                      True
country                       True
content_rating                True
budget                        True
title_year                    True
actor_2_facebook_likes        True
imdb_score                   False
aspect_ratio                  True
movie_facebook_likes         False
dtype: bool

In [87]:
movie.isnull().any().any()


True

## DataFrame에서 연산자 이용

- dataframe 의 데이터 타입이 동질이어야 가능.


In [94]:
college = pd.read_csv("../data/college.csv",index_col = "INSTNM")

college_ugds = college.filter(like = "UGDS_")
college_ugds.head(3)


Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715


### 소수점 아래 세번째 자리에서 반올림 하기


In [101]:
((college_ugds + 0.00501) // 0.01 / 100).head(3)


Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27


## 누락값 비교

- pandas 에서 데이터 누락 값은 numpy 의 nan 객체로 나타낸다.
- np.nan 은 == 연산을 통해 False 만 나온다. (자기 자신 포함)
- Series 와 DataFrame 에서 누락 값을 찾을 때 == np.nan 과 같은 식으로 하면 안된다.(eq 도 안됨) (isnull() 사용)
- 두 객체 비교 할때는 equals 를 쓰자.


In [117]:
import numpy as np

(college_ugds == np.nan).sum().sum()


0

In [115]:
college_ugds.isnull().sum().sum()

5949

## DataFrame 연산의 방향 바꾸기

- DataFrame 메서드는 axis 매개변수 있음. 디폴트 = index,
- axis = columns 로 하면 열 방향으로 연산할 수 있다.(각 행의 열들을 연산)


In [120]:
college_ugds.count().head(3)


UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
dtype: int64

In [121]:
college_ugds.count(axis="columns").head(3)

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
dtype: int64

## 대학 캠퍼스의 다양성 지수 발견

### 9개 인종 중에 차지 비율이 15퍼가 넘는 인종의 수를 대학별로 나타내기



In [146]:
college_ugds.dropna(how="all").ge(.15).sum(axis=1).sort_values(ascending=False).head()

INSTNM
Regency Beauty Institute-Austin          5
Central Texas Beauty College-Temple      5
Sullivan and Cogliano Training Center    4
Ambria College of Nursing                4
Berkeley College-New York                4
dtype: int64