## Handling Missing Data
아래와 같은 함수를 통해 결측치를 확인하고, 제거하거나 다른 값으로 대치할 수 있다.
```python
df.dropna(axis=0, how='any')
df.fillna(0)
df.fillna('string')
df.isnull().sum()
df.notnull().sum()
```

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample frame for the missing-data examples: three rows, columns A-D,
# with NaN scattered through A, B and C.
df = pd.DataFrame(
    data=[
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [3]:
# Drop every column that contains at least one NaN (only D survives here).
df.dropna(axis=1, how='any')

Unnamed: 0,D
0,0
1,1
2,5


In [4]:
# Drop only the rows whose values are all NaN; every row here has a value
# in D, so no row is removed.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [5]:
# Replace every NaN with 0.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5


In [6]:
# Per-column fill values: NaN in A becomes 0, in B 1, in C 2 and in D 3.
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
df.fillna(value=values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5


In [7]:
# Count the missing values in each column.
df.isnull().sum()

A    2
B    1
C    3
D    0
dtype: int64

In [8]:
# Count the non-missing values in each column.
df.notnull().sum()

A    1
B    2
C    0
D    3
dtype: int64

## Reshaping Data
아래와 같은 함수를 사용하여 DataFrame을 Reshape할 수 있다.
적절히 활용하여 원하는 형태로 데이터를 바꾸어보자.
```python
pd.melt()
df.groupby()
df.pivot()

```

In [9]:
# Small wide-format frame for the reshaping examples: one label column (A)
# and two numeric columns (B, C), with rows labelled 0, 1, 2.
df = pd.DataFrame(
    {
        'A': ['a', 'b', 'c'],
        'B': [1, 3, 5],
        'C': [2, 4, 6],
    },
    index=[0, 1, 2],
)
df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [10]:
# Unpivot column B into long format, keeping A as the identifier column.
pd.melt(df, id_vars=['A'], value_vars=['B'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5


In [11]:
# Unpivot both B and C, and rebind df to the long-format result
# (the wide frame is no longer available under this name).
df = pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [12]:
# Sum `value` within each `variable` group. The numeric column is selected
# explicitly: on pandas >= 2.0 a bare .sum() would also aggregate the string
# column A (concatenating it), which is not the result shown below.
df.groupby("variable")[["value"]].sum()

Unnamed: 0_level_0,value
variable,Unnamed: 1_level_1
B,9
C,12


In [13]:
# Display the current long-format df.
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [14]:
# Spread `variable` into columns. No index is given, so the existing row
# labels are kept and each row holds one value, with NaN in the other column.
df.pivot(columns='variable', values='value')

variable,B,C
0,1.0,
1,3.0,
2,5.0,
3,,2.0
4,,4.0
5,,6.0


In [15]:
# Long-format sample for pivot: each (foo, bar) pair appears once and
# carries one baz value.
df = pd.DataFrame(
    {
        'foo': ['one'] * 3 + ['two'] * 3,
        'bar': list('ABC') * 2,
        'baz': list(range(1, 7)),
    }
)
df

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,B,2
2,one,C,3
3,two,A,4
4,two,B,5
5,two,C,6


In [16]:
# Pivot to wide form: one row per foo, one column per bar, cells taken from baz.
df2 = df.pivot(index='foo', columns='bar', values='baz')
df2

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [17]:
# Pivot on bar without an index, then select the baz block. The original row
# labels are kept, so each row has a single value and NaN elsewhere.
df.pivot(columns='bar')['baz']

bar,A,B,C
0,1.0,,
1,,2.0,
2,,,3.0
3,4.0,,
4,,5.0,
5,,,6.0


In [18]:
# Display df2 (the foo x bar pivot table) again.
df2

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [19]:
# Move the foo index back into an ordinary column.
df3 = df2.reset_index()
df3

bar,foo,A,B,C
0,one,1,2,3
1,two,4,5,6


In [20]:
# Melt the wide table back to long form and sort by foo, then bar. The
# variable column comes out named 'bar' (df3's column-index name) rather
# than the default 'variable'.
df3.melt(id_vars=['foo'], value_vars=['A', 'B', 'C']).sort_values(by=['foo', 'bar'])

Unnamed: 0,foo,bar,value
0,one,A,1
2,one,B,2
4,one,C,3
1,two,A,4
3,two,B,5
5,two,C,6


## Concat & Merge

concat과 merge 함수를 사용하여 두 개의 DataFrame을 하나로 합칠 수 있다.

In [21]:
# First example Series: two string values on the default 0..1 index.
s1 = pd.Series(list('ab'))
s1

0    a
1    b
dtype: object

In [22]:
# Second example Series: two string values on the default 0..1 index.
s2 = pd.Series(list('cd'))
s2

0    c
1    d
dtype: object

In [23]:
# Stack the two Series and renumber the result 0..3 instead of keeping
# each Series' own labels.
pd.concat([s1, s2], ignore_index=True)

0    a
1    b
2    c
3    d
dtype: object

In [24]:
# Concatenate under a two-level index: the outer level ('Series name') labels
# the source Series, the inner level ('Row ID') keeps each Series' own labels.
pd.concat(
    [s1, s2],
    keys=['s1', 's2'],
    names=['Series name', 'Row ID'],
)

Series name  Row ID
s1           0         a
             1         b
s2           0         c
             1         d
dtype: object

In [25]:
# First frame for the concat examples: a letter column and a number column.
df1 = pd.DataFrame({'letter': ['a', 'b'], 'number': [1, 2]})
df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [26]:
# Second frame with the same two columns as df1.
df2 = pd.DataFrame({'letter': ['c', 'd'], 'number': [3, 4]})
df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [27]:
# Stack df1 and df2 vertically; each frame keeps its own row labels,
# so 0 and 1 appear twice in the result.
pd.concat([df1,df2])

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [28]:
# Like df2, but with an extra `animal` column that df1 does not have.
df3 = pd.DataFrame(
    {
        'letter': ['c', 'd'],
        'number': [3, 4],
        'animal': ['cat', 'dog'],
    }
)
df3

Unnamed: 0,letter,number,animal
0,c,3,cat
1,d,4,dog


In [29]:
# Concatenate frames with different columns: the result has the union of the
# columns, and df1's rows get NaN for `animal`. sort=False leaves the columns
# unsorted.
pd.concat([df1, df3], sort=False)

Unnamed: 0,letter,number,animal
0,a,1,
1,b,2,
0,c,3,cat
1,d,4,dog


In [30]:
# join="inner" keeps only the columns present in both frames (letter, number).
pd.concat([df1, df3], join="inner")

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [31]:
# Left-hand frame for the merge examples: key column x1 plus a numeric x2.
adf = pd.DataFrame(
    [("A", 1), ("B", 2), ("C", 3)],
    columns=["x1", "x2"],
)
adf

Unnamed: 0,x1,x2
0,A,1
1,B,2
2,C,3


In [32]:
# Right-hand frame for the merge examples: key column x1 plus a flag column x3.
# It shares keys A and B with adf and adds D.
bdf = pd.DataFrame(
    [("A", "T"), ("B", "F"), ("D", "T")],
    columns=["x1", "x3"],
)
bdf

Unnamed: 0,x1,x3
0,A,T
1,B,F
2,D,T


In [33]:
# Left join on x1: keep every row of adf; x3 is NaN where bdf has no match (C).
pd.merge(adf, bdf, how='left', on='x1')

Unnamed: 0,x1,x2,x3
0,A,1,T
1,B,2,F
2,C,3,


In [34]:
# Right join on x1: keep every row of bdf; x2 is NaN where adf has no match (D).
pd.merge(adf, bdf, how='right', on='x1')

Unnamed: 0,x1,x2,x3
0,A,1.0,T
1,B,2.0,F
2,D,,T


In [35]:
# Inner join on x1: keep only the keys present in both frames (A and B).
pd.merge(adf, bdf, how='inner', on='x1')

Unnamed: 0,x1,x2,x3
0,A,1,T
1,B,2,F


In [36]:
# Outer join on x1: keep the keys from either frame, with NaN filling the gaps.
pd.merge(adf, bdf, how='outer', on='x1')

Unnamed: 0,x1,x2,x3
0,A,1.0,T
1,B,2.0,F
2,C,3.0,
3,D,,T


## Groupby & Cum

groupby와 shift, 그리고 cumsum·cummax·cummin·cumprod 같은 누적(cum) 함수를 활용하여 다양한 연산이 가능하다.
```python
df.groupby(col)['target'].size()
df.groupby([col1, col2])['target'].mean()
df.shift(n)
df.cumsum()
df.cummax()
df.cummin()
df.cumprod()
```

In [37]:
# Load seaborn's "mpg" example dataset into df and preview the first three rows.
import seaborn as sns
df = sns.load_dataset("mpg")
df.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite


In [38]:
# Number of rows in each origin group.
df.groupby(by="origin")['cylinders'].size()

origin
europe     70
japan      79
usa       249
Name: cylinders, dtype: int64

In [39]:
# The same per-origin counts via value_counts(), listed from most to least frequent.
df["origin"].value_counts()

usa       249
japan      79
europe     70
Name: origin, dtype: int64

In [40]:
# Mean cylinder count for every (model_year, origin) pair, returned as a
# one-column DataFrame rather than a Series.
mean_cylinders = df.groupby(['model_year', 'origin'])['cylinders'].mean()
mean_cylinders.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,cylinders
model_year,origin,Unnamed: 2_level_1
70,europe,4.0
70,japan,4.0
70,usa,7.636364
71,europe,4.0
71,japan,4.0
71,usa,6.2
72,europe,4.0
72,japan,3.8
72,usa,6.888889
73,europe,4.0


In [41]:
# Small integer frame for the shift / cumulative examples, with rows
# labelled 1..3 and columns a, b, c.
df2 = pd.DataFrame(
    {
        'a': [4, 5, 6],
        'b': [7, 11, 9],
        'c': [10, 8, 12],
    },
    index=[1, 2, 3],
)

df2

Unnamed: 0,a,b,c
1,4,7,10
2,5,11,8
3,6,9,12


In [42]:
# Shift every column down by one row; the first row has nothing to take
# its place and becomes NaN.
df2.shift(1)

Unnamed: 0,a,b,c
1,,,
2,4.0,7.0,10.0
3,5.0,11.0,8.0


In [43]:
# Shift column b down by two rows, leaving NaN in the first two positions.
df2['b'].shift(2)

1    NaN
2    NaN
3    7.0
Name: b, dtype: float64

In [44]:
# Running total down each column.
df2.cumsum()

Unnamed: 0,a,b,c
1,4,7,10
2,9,18,18
3,15,27,30


In [45]:
# Running maximum down each column.
df2.cummax()

Unnamed: 0,a,b,c
1,4,7,10
2,5,11,10
3,6,11,12


In [46]:
# Running minimum down each column.
df2.cummin()

Unnamed: 0,a,b,c
1,4,7,10
2,4,7,8
3,4,7,8


In [47]:
# Running product down each column.
df2.cumprod()

Unnamed: 0,a,b,c
1,4,7,10
2,20,77,80
3,120,693,960
