# Pandas 정리
Pandas의 공식 문서를 보고 정리합니다.

- [유저 가이드](https://pandas.pydata.org/docs/user_guide/index.html)
- [API 레퍼런스](https://pandas.pydata.org/docs/reference/index.html)



### 10 minutes to pandas 따라하기


In [233]:
import numpy as np
import pandas as pd

# 넘파이와 시리즈 호환
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [234]:
dates = pd.date_range('20130101', periods=6)
dates

# 넘파이와 Dataframe 호환
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df


Unnamed: 0,A,B,C,D
2013-01-01,1.492914,-0.829124,0.53443,-1.779962
2013-01-02,0.250533,0.0763,-2.134205,0.382448
2013-01-03,0.207836,-0.236692,1.831408,1.260429
2013-01-04,-0.031868,1.290931,0.033075,0.619626
2013-01-05,-0.489236,0.917038,0.075151,0.212301
2013-01-06,1.091738,-0.621264,-1.082289,1.825291


In [235]:
dates = pd.date_range('20130101', periods=6)
dates

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df


Unnamed: 0,A,B,C,D
2013-01-01,-2.494151,-0.422089,-0.00937,-1.874347
2013-01-02,1.609916,-1.913429,1.132821,0.856051
2013-01-03,2.052436,-0.202747,-0.455613,0.204076
2013-01-04,0.774299,-0.27726,0.589272,-0.049582
2013-01-05,0.766346,0.784104,-1.191702,0.464242
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955


In [236]:
# A DataFrame can be built directly from a dict: each key becomes a column
# name and each value supplies that column's data.
df2 = pd.DataFrame(
    {
        # Scalar float, repeated on every row.
        'A': 1.,
        # Single timestamp, repeated on every row.
        'B': pd.Timestamp('20130102'),
        # Series with an explicit 0..3 index and float32 dtype.
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        # NumPy array of four int32 values.
        'D': np.array([3] * 4, dtype='int32'),
        # Categorical column with the categories "test" and "train".
        'E': pd.Categorical(["test", "train", "test", "train"]),
        # Scalar string, repeated on every row.
        'F': 'foo',
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [237]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [238]:
# 위에서부터 몇개
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-2.494151,-0.422089,-0.00937,-1.874347
2013-01-02,1.609916,-1.913429,1.132821,0.856051


In [239]:
# 아래서부터 몇개
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,0.766346,0.784104,-1.191702,0.464242
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955


In [240]:
# 인덱스를 출력해줌
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [241]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [242]:
# NumPy Array로 변경해준다
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [243]:
# DataFrame의 요약 통계를 빠르게 보여준다
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.455996,-0.416003,-0.074818,-0.087086
std,1.610284,0.865987,0.835378,0.94607
min,-2.494151,-1.913429,-1.191702,-1.874347
25%,0.211935,-0.453971,-0.49964,-0.104612
50%,0.770322,-0.349674,-0.232491,0.077247
75%,1.401011,-0.221376,0.439612,0.399201
max,2.052436,0.784104,1.132821,0.856051


In [244]:
# 전치행렬 (넘파이와 같음)
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-2.494151,1.609916,2.052436,0.774299,0.766346,0.027132
B,-0.422089,-1.913429,-0.202747,-0.27726,0.784104,-0.464599
C,-0.00937,1.132821,-0.455613,0.589272,-1.191702,-0.514315
D,-1.874347,0.856051,0.204076,-0.049582,0.464242,-0.122955


In [245]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.874347,-0.00937,-0.422089,-2.494151
2013-01-02,0.856051,1.132821,-1.913429,1.609916
2013-01-03,0.204076,-0.455613,-0.202747,2.052436
2013-01-04,-0.049582,0.589272,-0.27726,0.774299
2013-01-05,0.464242,-1.191702,0.784104,0.766346
2013-01-06,-0.122955,-0.514315,-0.464599,0.027132


In [246]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-02,1.609916,-1.913429,1.132821,0.856051
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955
2013-01-01,-2.494151,-0.422089,-0.00937,-1.874347
2013-01-04,0.774299,-0.27726,0.589272,-0.049582
2013-01-03,2.052436,-0.202747,-0.455613,0.204076
2013-01-05,0.766346,0.784104,-1.191702,0.464242


Selection

In [247]:
df['A']

2013-01-01   -2.494151
2013-01-02    1.609916
2013-01-03    2.052436
2013-01-04    0.774299
2013-01-05    0.766346
2013-01-06    0.027132
Freq: D, Name: A, dtype: float64

In [248]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-2.494151,-0.422089,-0.00937,-1.874347
2013-01-02,1.609916,-1.913429,1.132821,0.856051
2013-01-03,2.052436,-0.202747,-0.455613,0.204076


In [249]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.609916,-1.913429,1.132821,0.856051
2013-01-03,2.052436,-0.202747,-0.455613,0.204076
2013-01-04,0.774299,-0.27726,0.589272,-0.049582


In [250]:
# 아래에 loc에 대한 설명이 나와있다. 필요한 데이터만 추출할 때 유용하게 사용할 수 있을 듯 하다
# https://m.blog.naver.com/wideeyed/221964700554
# 아래는 시리즈
df.loc[dates[0]]

A   -2.494151
B   -0.422089
C   -0.009370
D   -1.874347
Name: 2013-01-01 00:00:00, dtype: float64

In [251]:
# 아래는 데이터프레임
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2013-01-01,-2.494151,-0.422089,-0.00937,-1.874347


In [252]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-2.494151,-0.422089
2013-01-02,1.609916,-1.913429
2013-01-03,2.052436,-0.202747
2013-01-04,0.774299,-0.27726
2013-01-05,0.766346,0.784104
2013-01-06,0.027132,-0.464599


In [253]:
df.loc['20130102':'20130104', ['A','B']]

Unnamed: 0,A,B
2013-01-02,1.609916,-1.913429
2013-01-03,2.052436,-0.202747
2013-01-04,0.774299,-0.27726


In [254]:
df.loc['20130102', ['A', 'B']]

A    1.609916
B   -1.913429
Name: 2013-01-02 00:00:00, dtype: float64

In [255]:
# 행(시리즈)에서 A 열의 값을 추출
# 즉, 스칼라 값 하나를 얻는다
df.loc[dates[0], 'A']

-2.4941508019659793

In [256]:
# 스칼라는 at 메소드를 통해 구하면 더 빠르게 구할 수 있다
df.at[dates[0], 'A']

-2.4941508019659793

In [257]:
# Selection By Position
# 숫자를 이용해 선택하기
df.iloc[3]

A    0.774299
B   -0.277260
C    0.589272
D   -0.049582
Name: 2013-01-04 00:00:00, dtype: float64

In [258]:
# loc과 마찬가지로 단일 값을 넣으면 시리즈가 나오고
# 리스트를 넣으면 DataFrame이 나옴
df.iloc[[3]]


Unnamed: 0,A,B,C,D
2013-01-04,0.774299,-0.27726,0.589272,-0.049582


In [259]:
# 정수로 슬라이싱
df.iloc[3:5,0:2]


Unnamed: 0,A,B
2013-01-04,0.774299,-0.27726
2013-01-05,0.766346,0.784104


In [260]:
# 특정 위치 가져오기
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.609916,1.132821
2013-01-03,2.052436,-0.455613
2013-01-05,0.766346,-1.191702


In [261]:
# 행을 명시적으로 슬라이싱 (열은 전체)
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,1.609916,-1.913429,1.132821,0.856051
2013-01-03,2.052436,-0.202747,-0.455613,0.204076


In [262]:
# 열을 명시적으로 슬라이싱 (행은 전체)
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.422089,-0.00937
2013-01-02,-1.913429,1.132821
2013-01-03,-0.202747,-0.455613
2013-01-04,-0.27726,0.589272
2013-01-05,0.784104,-1.191702
2013-01-06,-0.464599,-0.514315


In [263]:
# 스칼라(값) 얻기
df.iloc[1,1]

-1.913429056089032

In [264]:
# 위치 기반 스칼라 접근 역시 iat이 더 빠름
df.iat[1,1]

-1.913429056089032

In [265]:
# 조건주기, 열 A에 대한
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,1.609916,-1.913429,1.132821,0.856051
2013-01-03,2.052436,-0.202747,-0.455613,0.204076
2013-01-04,0.774299,-0.27726,0.589272,-0.049582
2013-01-05,0.766346,0.784104,-1.191702,0.464242
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955


In [266]:
# 전체 데이터프레임에 대한 조건주기
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,1.609916,,1.132821,0.856051
2013-01-03,2.052436,,,0.204076
2013-01-04,0.774299,,0.589272,
2013-01-05,0.766346,0.784104,,0.464242
2013-01-06,0.027132,,,


In [267]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'five']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-2.494151,-0.422089,-0.00937,-1.874347,one
2013-01-02,1.609916,-1.913429,1.132821,0.856051,one
2013-01-03,2.052436,-0.202747,-0.455613,0.204076,two
2013-01-04,0.774299,-0.27726,0.589272,-0.049582,three
2013-01-05,0.766346,0.784104,-1.191702,0.464242,four
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955,five


In [268]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,2.052436,-0.202747,-0.455613,0.204076,two
2013-01-05,0.766346,0.784104,-1.191702,0.464242,four


In [269]:
# 인덱스가 있는 시리즈를 만든다. 아래에서 df['F']에 대입하면 df의 인덱스 기준으로 자동 정렬(align)된다
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [270]:
df['F'] = s1

In [271]:
# label로 값 설정
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.422089,-0.00937,-1.874347,
2013-01-02,1.609916,-1.913429,1.132821,0.856051,1.0
2013-01-03,2.052436,-0.202747,-0.455613,0.204076,2.0
2013-01-04,0.774299,-0.27726,0.589272,-0.049582,3.0
2013-01-05,0.766346,0.784104,-1.191702,0.464242,4.0
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955,5.0


In [272]:
# 위치로 값 설정
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.00937,-1.874347,
2013-01-02,1.609916,-1.913429,1.132821,0.856051,1.0
2013-01-03,2.052436,-0.202747,-0.455613,0.204076,2.0
2013-01-04,0.774299,-0.27726,0.589272,-0.049582,3.0
2013-01-05,0.766346,0.784104,-1.191702,0.464242,4.0
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955,5.0


In [273]:
# iat 메소드가 아닌 iloc 메소드로도 값 대입 가능
df.iloc[0, 2] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,-1.874347,
2013-01-02,1.609916,-1.913429,1.132821,0.856051,1.0
2013-01-03,2.052436,-0.202747,-0.455613,0.204076,2.0
2013-01-04,0.774299,-0.27726,0.589272,-0.049582,3.0
2013-01-05,0.766346,0.784104,-1.191702,0.464242,4.0
2013-01-06,0.027132,-0.464599,-0.514315,-0.122955,5.0


In [274]:
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,5,
2013-01-02,1.609916,-1.913429,1.132821,5,1.0
2013-01-03,2.052436,-0.202747,-0.455613,5,2.0
2013-01-04,0.774299,-0.27726,0.589272,5,3.0
2013-01-05,0.766346,0.784104,-1.191702,5,4.0
2013-01-06,0.027132,-0.464599,-0.514315,5,5.0


In [275]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,-5,
2013-01-02,-1.609916,-1.913429,-1.132821,-5,-1.0
2013-01-03,-2.052436,-0.202747,-0.455613,-5,-2.0
2013-01-04,-0.774299,-0.27726,-0.589272,-5,-3.0
2013-01-05,-0.766346,-0.784104,-1.191702,-5,-4.0
2013-01-06,-0.027132,-0.464599,-0.514315,-5,-5.0


In [276]:
# 결측치는 np.nan으로 표현된다. 계산에는 포함되지 않는다.
# Reindexing을 통해 인덱스를 변경 가능하다
df1 = df.reindex(index=dates[0:4], columns=['A', 'B', 'C', 'D'] + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.0,5,1.0
2013-01-02,1.609916,-1.913429,1.132821,5,1.0
2013-01-03,2.052436,-0.202747,-0.455613,5,
2013-01-04,0.774299,-0.27726,0.589272,5,


In [277]:
# 결측치가 있는 행을 지운다
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.0,5,1.0
2013-01-02,1.609916,-1.913429,1.132821,5,1.0


In [278]:
# 결측치를 채운다
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.0,5,1.0
2013-01-02,1.609916,-1.913429,1.132821,5,1.0
2013-01-03,2.052436,-0.202747,-0.455613,5,5.0
2013-01-04,0.774299,-0.27726,0.589272,5,5.0


In [279]:
# 불리언 DataFrame을 얻는다. 값이 결측치(NaN)인 위치는 True가 됨
pd.isna(df1)


Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


In [280]:
# Operation

# mean 메소드, 파라미터로 axis를 받으며 기본값은 axis=0 (열별 평균)
df.mean() # Series

A    0.871688
B   -0.345655
C   -0.073256
D    5.000000
F    3.000000
dtype: float64

In [281]:
df.mean(axis=1)

2013-01-01    1.250000
2013-01-02    1.365861
2013-01-03    1.678815
2013-01-04    1.817262
2013-01-05    1.871750
2013-01-06    1.809644
Freq: D, dtype: float64

In [282]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [283]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,5,
2013-01-02,1.609916,-1.913429,1.132821,5,1.0
2013-01-03,2.052436,-0.202747,-0.455613,5,2.0
2013-01-04,0.774299,-0.27726,0.589272,5,3.0
2013-01-05,0.766346,0.784104,-1.191702,5,4.0
2013-01-06,0.027132,-0.464599,-0.514315,5,5.0


In [284]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,1.052436,-1.202747,-1.455613,4.0,1.0
2013-01-04,-2.225701,-3.27726,-2.410728,2.0,0.0
2013-01-05,-4.233654,-4.215896,-6.191702,0.0,-1.0
2013-01-06,,,,,


In [285]:
# Apply 메소드, 데이터에 함수를 적용한다
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,5,
2013-01-02,1.609916,-1.913429,1.132821,10,1.0
2013-01-03,3.662351,-2.116177,0.677208,15,3.0
2013-01-04,4.43665,-2.393436,1.26648,20,6.0
2013-01-05,5.202996,-1.609332,0.074778,25,10.0
2013-01-06,5.230128,-2.073931,-0.439537,30,15.0


In [286]:
df.apply(lambda x : x.max() - x.min())


A    2.052436
B    2.697533
C    2.324523
D    0.000000
F    4.000000
dtype: float64

In [287]:
# 히스토그래밍
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    0
1    5
2    6
3    6
4    5
5    5
6    5
7    3
8    3
9    0
dtype: int64

In [288]:
s.value_counts()

5    4
6    2
3    2
0    2
dtype: int64

In [289]:
# String method
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [290]:
def highlight_max(s):
    """Return CSS strings that paint the maximum of a Series yellow.

    Intended as a column-wise callback for ``df.style.apply``.

    Args:
        s: The Series to inspect.

    Returns:
        A list with one entry per element of ``s``:
        ``'background-color: yellow'`` where the element equals ``s.max()``
        and ``''`` everywhere else. Every element tied for the maximum is
        highlighted.
    """
    highlight = 'background-color: yellow'
    styles = []
    # Element-wise comparison against the maximum gives one flag per element.
    for hit in s == s.max():
        styles.append(highlight if hit else '')
    return styles

df.style.apply(highlight_max)

2013-01-01    0.000000
2013-01-02    1.609916
2013-01-03    2.052436
2013-01-04    0.774299
2013-01-05    0.766346
2013-01-06    0.027132
Freq: D, Name: A, dtype: float64
['', '', 'background-color: yellow', '', '', '']
2013-01-01    0.000000
2013-01-02   -1.913429
2013-01-03   -0.202747
2013-01-04   -0.277260
2013-01-05    0.784104
2013-01-06   -0.464599
Freq: D, Name: B, dtype: float64
['', '', '', '', 'background-color: yellow', '']
2013-01-01    0.000000
2013-01-02    1.132821
2013-01-03   -0.455613
2013-01-04    0.589272
2013-01-05   -1.191702
2013-01-06   -0.514315
Freq: D, Name: C, dtype: float64
['', 'background-color: yellow', '', '', '', '']
2013-01-01    5
2013-01-02    5
2013-01-03    5
2013-01-04    5
2013-01-05    5
2013-01-06    5
Freq: D, Name: D, dtype: int64
['background-color: yellow', 'background-color: yellow', 'background-color: yellow', 'background-color: yellow', 'background-color: yellow', 'background-color: yellow']
2013-01-01    NaN
2013-01-02    1.0
2013-01-

Unnamed: 0,A,B,C,D,F
2013-01-01 00:00:00,0.0,0.0,0.0,5,
2013-01-02 00:00:00,1.609916,-1.913429,1.132821,5,1.0
2013-01-03 00:00:00,2.052436,-0.202747,-0.455613,5,2.0
2013-01-04 00:00:00,0.774299,-0.27726,0.589272,5,3.0
2013-01-05 00:00:00,0.766346,0.784104,-1.191702,5,4.0
2013-01-06 00:00:00,0.027132,-0.464599,-0.514315,5,5.0
