# Pandas 정리
Pandas의 공식 문서를 보고 정리합니다.

- [유저 가이드](https://pandas.pydata.org/docs/user_guide/index.html)
- [API 레퍼런스](https://pandas.pydata.org/docs/reference/index.html)



### 10 minutes to pandas 따라하기


In [61]:
import numpy as np
import pandas as pd

# A Series interoperates with NumPy: it is built here from a plain list in
# which np.nan marks a missing entry, and the values are stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [62]:
# Six consecutive daily timestamps starting at 2013-01-01; used as the row index.
dates = pd.date_range(start='20130101', periods=6)
dates

# A DataFrame interoperates with NumPy: the data is a 6x4 array of random
# values, rows are labelled by the dates and columns are labelled 'A'..'D'.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df


Unnamed: 0,A,B,C,D
2013-01-01,-0.279949,1.238156,0.882792,0.244953
2013-01-02,1.579157,2.030816,0.486763,0.127359
2013-01-03,-0.468228,0.50611,-0.914738,-2.195081
2013-01-04,0.639859,-0.084386,2.123412,1.162978
2013-01-05,-0.903124,0.495533,0.16649,0.210592
2013-01-06,0.84057,-2.833395,1.480018,-1.078449


In [63]:
# Rebuild the date index: six consecutive days starting at 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

# Rebuild df with freshly drawn random values (6 rows x 4 columns 'A'..'D').
# Because the values are random, every run of this cell produces a new table.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df


Unnamed: 0,A,B,C,D
2013-01-01,-0.555139,0.07848,-1.524571,1.361642
2013-01-02,-1.687085,0.695406,0.706722,-0.760823
2013-01-03,0.332463,2.115827,-0.914653,-0.490239
2013-01-04,-0.014927,-1.635122,0.640063,0.537327
2013-01-05,1.319945,-0.096198,0.110449,1.411064
2013-01-06,0.947225,-0.482907,0.541461,-0.244462


In [64]:
# A DataFrame can be created directly from a dict: every key becomes a column.
# Scalar entries ('A', 'B', 'F') are repeated for each of the 4 rows defined by
# the array-like entries ('C', 'D', 'E'), and each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [65]:
# Show the dtype of every column; each column of df2 has its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [66]:
# First rows of the frame: head(2) returns the top 2 rows.
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-0.555139,0.07848,-1.524571,1.361642
2013-01-02,-1.687085,0.695406,0.706722,-0.760823


In [67]:
# Last rows of the frame: tail(2) returns the bottom 2 rows.
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,1.319945,-0.096198,0.110449,1.411064
2013-01-06,0.947225,-0.482907,0.541461,-0.244462


In [68]:
# Show the row index (here the DatetimeIndex built with date_range).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [69]:
# Show the column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [70]:
# Convert the DataFrame to a NumPy array.
# df2 mixes several column dtypes, so the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [71]:
# Quick statistical summary of the DataFrame
# (count, mean, std, min, quartiles and max per column).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.05708,0.112581,-0.073422,0.302418
std,1.084908,1.24952,0.931996,0.944891
min,-1.687085,-1.635122,-1.524571,-0.760823
25%,-0.420086,-0.386229,-0.658378,-0.428794
50%,0.158768,-0.008859,0.325955,0.146433
75%,0.793534,0.541174,0.615413,1.155563
max,1.319945,2.115827,0.706722,1.411064


In [72]:
# Transpose (same as in NumPy): rows become columns and vice versa.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.555139,-1.687085,0.332463,-0.014927,1.319945,0.947225
B,0.07848,0.695406,2.115827,-1.635122,-0.096198,-0.482907
C,-1.524571,0.706722,-0.914653,0.640063,0.110449,0.541461
D,1.361642,-0.760823,-0.490239,0.537327,1.411064,-0.244462


In [73]:
# Sort by axis labels: axis=1 sorts the column labels, and ascending=False
# puts them in descending order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.361642,-1.524571,0.07848,-0.555139
2013-01-02,-0.760823,0.706722,0.695406,-1.687085
2013-01-03,-0.490239,-0.914653,2.115827,0.332463
2013-01-04,0.537327,0.640063,-1.635122,-0.014927
2013-01-05,1.411064,0.110449,-0.096198,1.319945
2013-01-06,-0.244462,0.541461,-0.482907,0.947225


In [74]:
# Sort the rows by the values in column 'B' (smallest first).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,-0.014927,-1.635122,0.640063,0.537327
2013-01-06,0.947225,-0.482907,0.541461,-0.244462
2013-01-05,1.319945,-0.096198,0.110449,1.411064
2013-01-01,-0.555139,0.07848,-1.524571,1.361642
2013-01-02,-1.687085,0.695406,0.706722,-0.760823
2013-01-03,0.332463,2.115827,-0.914653,-0.490239


### Selection

In [75]:
# Select a single column by name; the result is a Series.
df['A']

2013-01-01   -0.555139
2013-01-02   -1.687085
2013-01-03    0.332463
2013-01-04   -0.014927
2013-01-05    1.319945
2013-01-06    0.947225
Freq: D, Name: A, dtype: float64

In [76]:
# Slicing with [] selects rows by position: here the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.555139,0.07848,-1.524571,1.361642
2013-01-02,-1.687085,0.695406,0.706722,-0.760823
2013-01-03,0.332463,2.115827,-0.914653,-0.490239


In [77]:
# Slicing with index labels also selects rows; both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-1.687085,0.695406,0.706722,-0.760823
2013-01-03,0.332463,2.115827,-0.914653,-0.490239
2013-01-04,-0.014927,-1.635122,0.640063,0.537327


In [78]:
# The blog post linked below explains loc; it looks useful for extracting
# only the data that is needed.
# https://m.blog.naver.com/wideeyed/221964700554
# Selecting a single row label returns a Series.
df.loc[dates[0]]

A   -0.555139
B    0.078480
C   -1.524571
D    1.361642
Name: 2013-01-01 00:00:00, dtype: float64

In [79]:
# Passing a list of row labels (even a single one) returns a DataFrame.
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2013-01-01,-0.555139,0.07848,-1.524571,1.361642


In [80]:
# Select by label on both axes: all rows (:), columns 'A' and 'B' only.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.555139,0.07848
2013-01-02,-1.687085,0.695406
2013-01-03,0.332463,2.115827
2013-01-04,-0.014927,-1.635122
2013-01-05,1.319945,-0.096198
2013-01-06,0.947225,-0.482907


In [81]:
# Label slice of rows (both endpoints included) restricted to columns 'A' and 'B'.
df.loc['20130102':'20130104', ['A','B']]

Unnamed: 0,A,B
2013-01-02,-1.687085,0.695406
2013-01-03,0.332463,2.115827
2013-01-04,-0.014927,-1.635122


In [82]:
# A single row label with a list of columns reduces the result to a Series.
df.loc['20130102', ['A', 'B']]

A   -1.687085
B    0.695406
Name: 2013-01-02 00:00:00, dtype: float64

In [83]:
# Extract 'A' from the first row: a row label plus a column label
# returns a single scalar value.
df.loc[dates[0], 'A']

-0.5551387762896798

In [84]:
# at: label-based access to a single scalar; same value as df.loc[dates[0], 'A'].
df.at[dates[0], 'A']

-0.5551387762896798