In [5]:
import numpy as np
import pandas as pd

In [6]:
## Object-dtype Series mixing a number, a string and two kinds of missing value (np.nan and None).
data = pd.Series(data=[1, np.nan, "hello", None])

In [7]:
## Boolean mask that is True wherever the value is missing (NaN or None).
data.isna()

0    False
1     True
2    False
3     True
dtype: bool

In [8]:
## Keep only the non-missing entries by indexing with a boolean mask.
data[data.notna()]

0        1
2    hello
dtype: object

In [9]:
## Drop the missing entries (NaN and None). The result is only displayed, not assigned,
## so the name `data` is not rebound here.
data.dropna()

0        1
2    hello
dtype: object

In [10]:
## 3x3 frame with one NaN in column 0 and one in column 1; column 2 is fully populated.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)

In [11]:
## Display the frame built above.
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [12]:
## With default arguments, dropna() removes every row that contains at least one NaN;
## here only row 1 is complete.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [13]:
## Drop along the column axis instead: any column containing a NaN is removed,
## which leaves only column 2.
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,6


In [14]:
## Add a column labelled 3 filled entirely with NaN. This mutates `df` in place,
## so the later cells see the four-column frame.
df[3] = np.nan

In [15]:
## Display df again; it now carries the extra all-NaN column 3.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [16]:
## how="all" drops a column only when every value in it is NaN, so just column 3 is removed.
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [17]:
## thresh=3 keeps only the rows that have at least three non-NaN values;
## row 1 is the only one that qualifies.
df.dropna(axis="rows", thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


 - 널 값 채우기

In [19]:
## Float Series labelled a-e; both np.nan and None are stored as NaN.
data = pd.Series(
    data=[1, np.nan, 2, None, 3],
    index=["a", "b", "c", "d", "e"],
)

In [20]:
## Display the Series, including its NaN entries.
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [22]:
## Replace every NaN with the constant 0.
data.fillna(value=0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [24]:
## forward-fill: propagate the previous valid value into each NaN.
## Series.ffill() is used because the `method=` argument of fillna() is deprecated in recent pandas.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [25]:
## back-fill: propagate the next valid value backwards into each NaN.
## Series.bfill() is used because the `method=` argument of fillna() is deprecated in recent pandas.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [26]:
## Display df (still including the all-NaN column 3) before filling it.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [27]:
## Forward-fill along each row (axis=1): a NaN takes the value from the column to its left.
## A NaN in the first column (row 2) has nothing to copy from and stays NaN.
## DataFrame.ffill() is used because the `method=` argument of fillna() is deprecated in recent pandas.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


 - 계층적 인덱싱(hierarchical indexing)

In [28]:
import pandas as pd
import numpy as np

In [31]:
## The naive (bad) way: represent each (state, year) key as a plain Python tuple.
index = [
    (state, year)
    for state in ("California", "New York", "Texas")
    for year in (2000, 2010)
]

In [32]:
## Display the list of (state, year) tuples.
index

[('California', 2000),
 ('California', 2010),
 ('New York', 2000),
 ('New York', 2010),
 ('Texas', 2000),
 ('Texas', 2010)]

In [33]:
## Population counts, listed in the same order as the (state, year) tuples in `index`.
populations = [
    33871648,  # California, 2000
    37253956,  # California, 2010
    18976457,  # New York, 2000
    19378102,  # New York, 2010
    20851820,  # Texas, 2000
    25145561,  # Texas, 2010
]

In [35]:
## Display the raw list of population counts.
populations

[33871648, 37253956, 18976457, 19378102, 20851820, 25145561]

In [36]:
## Pair each population count with its (state, year) tuple label.
pop = pd.Series(data=populations, index=index)

In [37]:
## Display the Series; at this point each label is a single (state, year) tuple.
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [38]:
## Label-based slice between two tuple labels; both endpoints are included in the result.
pop[("California", 2010) : ("Texas", 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [39]:
## Data munging needed to select the 2010 values: with tuple labels, the keys
## have to be filtered by hand on their second element.
pop[[label for label in pop.index if label[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

 - Pandas MultiIndex

In [40]:
## Build a MultiIndex from the list of (state, year) tuples.
## The name `index` is rebound from the plain list to the MultiIndex.
index = pd.MultiIndex.from_tuples(index)

In [41]:
## Display the MultiIndex to inspect its internal representation.
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [None]:
## Note that the MultiIndex contains multiple levels of indexing -- here the state names and the years --
## as well as, for each data point, multiple labels that encode those levels.

In [42]:
## Re-label the tuple-indexed Series with the MultiIndex so it gets a hierarchical index.
pop = pop.reindex(index=index)

In [43]:
## Display the re-indexed Series; repeated outer (state) labels are shown only once.
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [44]:
## Partial indexing on the second level: select the year 2010 for every state.
## The result is indexed by state only.
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

 - MultiIndex : 추가 차원

In [45]:
## Pivot the inner (year) index level into columns, giving a states-by-years DataFrame.
pop_df = pop.unstack()

In [46]:
## Display the unstacked frame: one row per state, one column per year.
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [47]:
## Inverse operation: stack the year columns back into an inner index level,
## recovering the hierarchically indexed Series.
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [48]:
## Two-column frame on pop's (state, year) index: total population and under-18 population.
## Note that this rebinds `pop_df`, replacing the unstacked frame from the earlier cell.
pop_df = pd.DataFrame(
    {
        "total": pop,
        "under18": [
            9267089, 9284094,  # California 2000, 2010
            4687374, 4318033,  # New York 2000, 2010
            5906301, 6879014,  # Texas 2000, 2010
        ],
    }
)

In [49]:
## Display the frame: rows keyed by (state, year), columns `total` and `under18`.
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [50]:
## Fraction of the population under 18 for each (state, year) pair.
f_u18 = pop_df["under18"].div(pop_df["total"])
## Pivot the year level into columns so the fractions read as a states-by-years table.
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568
