# Pandas 정리
Pandas의 공식 문서를 보고 정리합니다.

- [유저 가이드](https://pandas.pydata.org/docs/user_guide/index.html)
- [API 레퍼런스](https://pandas.pydata.org/docs/reference/index.html)



### 10 minutes to pandas 따라하기


In [895]:
import numpy as np
import pandas as pd

# A Series can be built from a plain list that mixes in NumPy values; the
# np.nan entry makes pandas store the whole Series as float64.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [896]:
# Six consecutive daily timestamps starting at 2013-01-01; used as row labels.
dates = pd.date_range(start='20130101', periods=6)
dates

# A DataFrame accepts a NumPy array directly: a 6x4 block of standard-normal
# samples, labelled by the dates (rows) and by the letters A-D (columns).
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    columns=['A', 'B', 'C', 'D'],
    index=dates,
)
df


Unnamed: 0,A,B,C,D
2013-01-01,0.520497,2.042585,-0.285834,0.046392
2013-01-02,-0.989264,-0.248715,-1.104441,1.003459
2013-01-03,-0.186095,-0.160784,0.069718,-1.584559
2013-01-04,0.1443,-0.169195,0.07671,-0.873241
2013-01-05,-0.761776,-0.06954,-1.52161,0.165734
2013-01-06,0.839372,0.50246,-1.098452,-0.810864


In [897]:
# Same construction as the previous cell, run again: because the values come
# from np.random.randn, this call draws a new random sample for df.
dates = pd.date_range('20130101', periods=6)
dates

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df


Unnamed: 0,A,B,C,D
2013-01-01,1.662418,0.132887,0.1559,0.624545
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003
2013-01-03,0.838332,0.494697,-0.526519,-1.483515
2013-01-04,0.255709,1.896188,-0.48649,-0.70772
2013-01-05,0.780017,-1.054972,-0.876066,0.630361
2013-01-06,-1.280487,0.979442,-0.728822,-0.782057


In [898]:
# A dict converts straight into a DataFrame: each key becomes a column, and
# the scalar values (A, B, F) are repeated to match the four-element columns.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [899]:
# dtypes lists the data type of every column; the columns of one DataFrame
# can each have a different dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [900]:
# head(n) returns the first n rows.
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,1.662418,0.132887,0.1559,0.624545
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003


In [901]:
# tail(n) returns the last n rows.
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,0.780017,-1.054972,-0.876066,0.630361
2013-01-06,-1.280487,0.979442,-0.728822,-0.782057


In [902]:
# index holds the row labels of the DataFrame.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [903]:
# columns holds the column labels of the DataFrame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [904]:
# to_numpy() converts the DataFrame to a NumPy array. df2 mixes several
# column dtypes, so the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [905]:
# describe() gives a quick statistical summary per column
# (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.334218,0.444831,-0.452904,-0.389731
std,1.017106,0.978945,0.366438,0.845431
min,-1.280487,-1.054972,-0.876066,-1.483515
25%,-0.124082,0.154851,-0.678247,-0.763473
50%,0.517863,0.35772,-0.506504,-0.663862
75%,0.823753,0.858256,-0.313193,0.313408
max,1.662418,1.896188,0.1559,0.630361


In [906]:
# Transpose (same as in NumPy): rows and columns are swapped.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.662418,-0.250679,0.838332,0.255709,0.780017,-1.280487
B,0.132887,0.220744,0.494697,1.896188,-1.054972,0.979442
C,0.1559,-0.255427,-0.526519,-0.48649,-0.876066,-0.728822
D,0.624545,-0.620003,-1.483515,-0.70772,0.630361,-0.782057


In [907]:
# Sort along axis=1, i.e. by column label, in descending order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.624545,0.1559,0.132887,1.662418
2013-01-02,-0.620003,-0.255427,0.220744,-0.250679
2013-01-03,-1.483515,-0.526519,0.494697,0.838332
2013-01-04,-0.70772,-0.48649,1.896188,0.255709
2013-01-05,0.630361,-0.876066,-1.054972,0.780017
2013-01-06,-0.782057,-0.728822,0.979442,-1.280487


In [908]:
# Sort the rows by the values in column B (ascending).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-05,0.780017,-1.054972,-0.876066,0.630361
2013-01-01,1.662418,0.132887,0.1559,0.624545
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003
2013-01-03,0.838332,0.494697,-0.526519,-1.483515
2013-01-06,-1.280487,0.979442,-0.728822,-0.782057
2013-01-04,0.255709,1.896188,-0.48649,-0.70772


#### Selection

In [909]:
# Selecting a single column by label returns a Series.
df['A']

2013-01-01    1.662418
2013-01-02   -0.250679
2013-01-03    0.838332
2013-01-04    0.255709
2013-01-05    0.780017
2013-01-06   -1.280487
Freq: D, Name: A, dtype: float64

In [910]:
# Slicing with [] selects rows by position; here the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.662418,0.132887,0.1559,0.624545
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003
2013-01-03,0.838332,0.494697,-0.526519,-1.483515


In [911]:
# Rows can also be sliced by index label; with labels both endpoints are
# included in the result.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003
2013-01-03,0.838332,0.494697,-0.526519,-1.483515
2013-01-04,0.255709,1.896188,-0.48649,-0.70772


In [912]:
# loc selects by label, which looks handy for extracting only the data needed.
# Further explanation of loc: https://m.blog.naver.com/wideeyed/221964700554
# A single row label returns that row as a Series.
df.loc[dates[0]]

A    1.662418
B    0.132887
C    0.155900
D    0.624545
Name: 2013-01-01 00:00:00, dtype: float64

In [913]:
# A list of labels returns a DataFrame, even when it selects a single row.
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2013-01-01,1.662418,0.132887,0.1559,0.624545


In [914]:
# All rows (:), restricted to columns A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,1.662418,0.132887
2013-01-02,-0.250679,0.220744
2013-01-03,0.838332,0.494697
2013-01-04,0.255709,1.896188
2013-01-05,0.780017,-1.054972
2013-01-06,-1.280487,0.979442


In [915]:
# Label slice on the rows (both endpoints included) combined with a column list.
df.loc['20130102':'20130104', ['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.250679,0.220744
2013-01-03,0.838332,0.494697
2013-01-04,0.255709,1.896188


In [916]:
# A single row label with a column list reduces the result to a Series.
df.loc['20130102', ['A', 'B']]

A   -0.250679
B    0.220744
Name: 2013-01-02 00:00:00, dtype: float64

In [917]:
# Pick column A out of the row selected by dates[0]:
# a single row label plus a single column label returns a scalar.
df.loc[dates[0], 'A']

1.662418498031618

In [918]:
# For scalar access, at returns the same value as loc and is faster.
df.at[dates[0], 'A']

1.662418498031618

In [919]:
# Selection by position
# iloc selects by integer position; a single integer returns that row as a Series.
df.iloc[3]

A    0.255709
B    1.896188
C   -0.486490
D   -0.707720
Name: 2013-01-04 00:00:00, dtype: float64

In [920]:
# Same rule as with loc: a bare integer yields a Series,
# while a list of integers yields a DataFrame.
df.iloc[[3]]


Unnamed: 0,A,B,C,D
2013-01-04,0.255709,1.896188,-0.48649,-0.70772


In [921]:
# Integer slicing: rows 3-4 and columns 0-1 (the end position is excluded).
df.iloc[3:5,0:2]


Unnamed: 0,A,B
2013-01-04,0.255709,1.896188
2013-01-05,0.780017,-1.054972


In [922]:
# Pick specific positions with integer lists: rows 1, 2, 4 and columns 0, 2.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.250679,-0.255427
2013-01-03,0.838332,-0.526519
2013-01-05,0.780017,-0.876066


In [923]:
# Explicit row slice; ":" keeps every column.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003
2013-01-03,0.838332,0.494697,-0.526519,-1.483515


In [924]:
# Explicit column slice; ":" keeps every row.
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,0.132887,0.1559
2013-01-02,0.220744,-0.255427
2013-01-03,0.494697,-0.526519
2013-01-04,1.896188,-0.48649
2013-01-05,-1.054972,-0.876066
2013-01-06,0.979442,-0.728822


In [925]:
# Get a scalar value by position (row 1, column 1).
df.iloc[1,1]

0.2207435817342603

In [926]:
# As with at vs. loc, iat is the faster accessor for a single scalar by position.
df.iat[1,1]

0.2207435817342603

In [927]:
# Boolean indexing on one column: keep only the rows where A is positive.
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.662418,0.132887,0.1559,0.624545
2013-01-03,0.838332,0.494697,-0.526519,-1.483515
2013-01-04,0.255709,1.896188,-0.48649,-0.70772
2013-01-05,0.780017,-1.054972,-0.876066,0.630361


In [928]:
# Condition applied to the whole DataFrame: the shape is kept and every
# value that fails the condition is shown as missing (NaN).
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.662418,0.132887,0.1559,0.624545
2013-01-02,,0.220744,,
2013-01-03,0.838332,0.494697,,
2013-01-04,0.255709,1.896188,,
2013-01-05,0.780017,,,0.630361
2013-01-06,,0.979442,,


In [929]:
# Work on a copy of df, then add a string column E to the copy.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'five']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.662418,0.132887,0.1559,0.624545,one
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003,one
2013-01-03,0.838332,0.494697,-0.526519,-1.483515,two
2013-01-04,0.255709,1.896188,-0.48649,-0.70772,three
2013-01-05,0.780017,-1.054972,-0.876066,0.630361,four
2013-01-06,-1.280487,0.979442,-0.728822,-0.782057,five


In [930]:
# isin() builds a boolean mask: keep rows whose E value is 'two' or 'four'.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.838332,0.494697,-0.526519,-1.483515,two
2013-01-05,0.780017,-1.054972,-0.876066,0.630361,four


In [931]:
# A Series with its own date index (2013-01-02 .. 2013-01-07). When such a
# Series is assigned to a DataFrame column, values are aligned by index label.
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [932]:
# Add s1 as column F. Values are matched on the index, so 2013-01-01 (absent
# from s1) becomes NaN and s1's 2013-01-07 entry does not appear in df.
df['F'] = s1

In [933]:
# Set a value by label.
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.132887,0.1559,0.624545,
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003,1.0
2013-01-03,0.838332,0.494697,-0.526519,-1.483515,2.0
2013-01-04,0.255709,1.896188,-0.48649,-0.70772,3.0
2013-01-05,0.780017,-1.054972,-0.876066,0.630361,4.0
2013-01-06,-1.280487,0.979442,-0.728822,-0.782057,5.0


In [934]:
# Set a value by position (row 0, column 1).
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.1559,0.624545,
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003,1.0
2013-01-03,0.838332,0.494697,-0.526519,-1.483515,2.0
2013-01-04,0.255709,1.896188,-0.48649,-0.70772,3.0
2013-01-05,0.780017,-1.054972,-0.876066,0.630361,4.0
2013-01-06,-1.280487,0.979442,-0.728822,-0.782057,5.0


In [935]:
# Assignment also works through iloc, not only through at/iat.
df.iloc[0, 2] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,0.624545,
2013-01-02,-0.250679,0.220744,-0.255427,-0.620003,1.0
2013-01-03,0.838332,0.494697,-0.526519,-1.483515,2.0
2013-01-04,0.255709,1.896188,-0.48649,-0.70772,3.0
2013-01-05,0.780017,-1.054972,-0.876066,0.630361,4.0
2013-01-06,-1.280487,0.979442,-0.728822,-0.782057,5.0


In [936]:
# Assign a NumPy array to a whole column: every row of D becomes 5.
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,5,
2013-01-02,-0.250679,0.220744,-0.255427,5,1.0
2013-01-03,0.838332,0.494697,-0.526519,5,2.0
2013-01-04,0.255709,1.896188,-0.48649,5,3.0
2013-01-05,0.780017,-1.054972,-0.876066,5,4.0
2013-01-06,-1.280487,0.979442,-0.728822,5,5.0


In [937]:
# Assignment through a boolean mask: on a copy of df, every positive value is
# replaced by its negation, so no positive value remains in df2.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,-5,
2013-01-02,-0.250679,-0.220744,-0.255427,-5,-1.0
2013-01-03,-0.838332,-0.494697,-0.526519,-5,-2.0
2013-01-04,-0.255709,-1.896188,-0.48649,-5,-3.0
2013-01-05,-0.780017,-1.054972,-0.876066,-5,-4.0
2013-01-06,-1.280487,-0.979442,-0.728822,-5,-5.0


In [938]:
# Missing data is represented by np.nan and is not included in computations.
# reindex changes the labels along an axis: keep the first four dates and the
# columns A-D, and add a new column E (empty at first). The result is assigned
# to df1.
df1 = df.reindex(index=dates[0:4], columns=['A', 'B', 'C', 'D'] + ['E'])
# Fill E for the first two dates only; the remaining rows stay missing.
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.0,5,1.0
2013-01-02,-0.250679,0.220744,-0.255427,5,1.0
2013-01-03,0.838332,0.494697,-0.526519,5,
2013-01-04,0.255709,1.896188,-0.48649,5,


In [939]:
# Drop every row that contains at least one missing value.
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.0,5,1.0
2013-01-02,-0.250679,0.220744,-0.255427,5,1.0


In [940]:
# Fill missing values with 5.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.0,5,1.0
2013-01-02,-0.250679,0.220744,-0.255427,5,1.0
2013-01-03,0.838332,0.494697,-0.526519,5,5.0
2013-01-04,0.255709,1.896188,-0.48649,5,5.0


In [941]:
# Boolean DataFrame of the same shape: True wherever the value is missing.
pd.isna(df1)


Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


In [942]:
# Operations

# mean() takes an axis parameter (default axis=0), giving one mean per column.
df.mean() # returns a Series

A    0.057148
B    0.422683
C   -0.478887
D    5.000000
F    3.000000
dtype: float64

In [943]:
# axis=1 computes the mean across the columns, giving one value per row.
df.mean(axis=1)

2013-01-01    1.250000
2013-01-02    1.142927
2013-01-03    1.561302
2013-01-04    1.933082
2013-01-05    1.569796
2013-01-06    1.794026
Freq: D, dtype: float64

In [944]:
# shift(2) moves the values two positions down the date index; the first two
# slots become NaN and the last two original values fall off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [945]:
# Show the current df for reference before operating on it.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,5,
2013-01-02,-0.250679,0.220744,-0.255427,5,1.0
2013-01-03,0.838332,0.494697,-0.526519,5,2.0
2013-01-04,0.255709,1.896188,-0.48649,5,3.0
2013-01-05,0.780017,-1.054972,-0.876066,5,4.0
2013-01-06,-1.280487,0.979442,-0.728822,5,5.0


In [946]:
# Subtract s from every column, aligning on the row index; rows where s is
# NaN come out as NaN in every column.
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.161668,-0.505303,-1.526519,4.0,1.0
2013-01-04,-2.744291,-1.103812,-3.48649,2.0,0.0
2013-01-05,-4.219983,-6.054972,-5.876066,0.0,-1.0
2013-01-06,,,,,


In [947]:
# apply() runs a function on the data; here np.cumsum gives the running total
# down each column.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.0,5,
2013-01-02,-0.250679,0.220744,-0.255427,10,1.0
2013-01-03,0.587652,0.715441,-0.781946,15,3.0
2013-01-04,0.843361,2.611629,-1.268436,20,6.0
2013-01-05,1.623378,1.556657,-2.144502,25,10.0
2013-01-06,0.342891,2.536099,-2.873324,30,15.0


In [948]:
# apply() with a lambda: the range (max - min) of each column.
df.apply(lambda x : x.max() - x.min())


A    2.118819
B    2.951160
C    0.876066
D    0.000000
F    4.000000
dtype: float64

In [949]:
# Histogramming: start from a Series of 10 random integers drawn from 0..6.
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    6
1    0
2    4
3    3
4    5
5    2
6    5
7    4
8    2
9    3
dtype: int64

In [950]:
# value_counts() counts how many times each distinct value occurs.
s.value_counts()

5    2
4    2
3    2
2    2
6    1
0    1
dtype: int64

In [951]:
# String methods: the .str accessor applies a string operation to every
# element; the NaN entry stays NaN.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object