# 10 minutes to pandas

출처: https://pandas.pydata.org/docs/user_guide/10min.html

In [1]:
import numpy as np
import pandas as pd

## Object creation
### Creating a Series

In [2]:
# Build a Series from a list. The np.nan entry makes the values float64,
# and pandas supplies a default integer index (0..5).
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### Creating a DataFrame

In [3]:
# Six consecutive daily timestamps starting at 2013-01-01; used below as the row index.
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# np.random.randn(6, 4) draws a 6x4 array of random samples from the standard
# normal (Gaussian) distribution. Rows are labelled by `dates`, columns by A-D.
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns = list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.379564,-0.455298,-0.292994,-1.293324
2013-01-02,-0.147718,0.24918,-0.014028,1.408846
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332
2013-01-04,-0.954953,-1.434796,0.455755,-0.425356
2013-01-05,0.617664,0.476167,0.042961,-0.186218
2013-01-06,-0.627526,1.277547,-1.965151,-0.748369


In [5]:
# Build a DataFrame from per-column specifications of mixed kinds: the scalar
# entries (A, B, F) are repeated on every row, while the Series, array and
# Categorical entries (C, D, E) each supply four values.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3] * 4, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# dtypes is an attribute (no parentheses) listing the dtype of each column.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
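# attribute: a value exposed on an object and read without parentheses (e.g. df2.dtypes), unlike a method, which is called with ().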

## Viewing data

### view the TOP/BOTTOM of the frame 

In [8]:
# head() returns rows from the top of the frame:
# 5 by default; pass a number to change how many.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,1.379564,-0.455298,-0.292994,-1.293324
2013-01-02,-0.147718,0.24918,-0.014028,1.408846
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332
2013-01-04,-0.954953,-1.434796,0.455755,-0.425356
2013-01-05,0.617664,0.476167,0.042961,-0.186218


In [9]:
# tail(n) returns the last n rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.954953,-1.434796,0.455755,-0.425356
2013-01-05,0.617664,0.476167,0.042961,-0.186218
2013-01-06,-0.627526,1.277547,-1.965151,-0.748369


### Index and Columns

index : 행 목록

In [10]:
df.index  # an attribute, so unlike a method call it takes no parentheses

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

columns : 열 목록

In [11]:
df.columns  # attribute holding the column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

### summary of data

In [12]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.03271,-0.045696,-0.234007,-0.355459
std,0.874125,0.929958,0.890542,0.944905
min,-0.954953,-1.434796,-1.965151,-1.293324
25%,-0.586467,-0.438217,-0.223253,-0.853341
50%,-0.305504,-0.068898,0.014467,-0.586863
75%,0.426319,0.41942,0.2878,-0.246003
max,1.379564,1.277547,0.455755,1.408846


### Transposing (행/열 전치)

In [13]:
df.T # attribute: the transpose, i.e. rows and columns swapped

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.379564,-0.147718,-0.46329,-0.954953,0.617664,-0.627526
B,-0.455298,0.24918,-0.386975,-1.434796,0.476167,1.277547
C,-0.292994,-0.014028,0.369412,0.455755,0.042961,-1.965151
D,-1.293324,1.408846,-0.888332,-0.425356,-0.186218,-0.748369


### Sorting

by axis : 행/열 목록을 기준으로 정렬

In [14]:
df.sort_index(axis =1, ascending=False)  # sort the column labels in descending order
# axis chooses which labels are sorted: axis=0 is the index (the default), axis=1 is the columns

Unnamed: 0,D,C,B,A
2013-01-01,-1.293324,-0.292994,-0.455298,1.379564
2013-01-02,1.408846,-0.014028,0.24918,-0.147718
2013-01-03,-0.888332,0.369412,-0.386975,-0.46329
2013-01-04,-0.425356,0.455755,-1.434796,-0.954953
2013-01-05,-0.186218,0.042961,0.476167,0.617664
2013-01-06,-0.748369,-1.965151,1.277547,-0.627526


by values : 값을 기준으로 행/열 방향으로 오름/내림차순 정렬

In [20]:
df.sort_values(by="B", axis=0)
# by=<column label> with axis=0 (or "index"): the rows are reordered by that column's values
# by=<row label> with axis=1 (or "columns"): the columns are reordered by that row's values

Unnamed: 0,A,B,C,D
2013-01-04,-0.954953,-1.434796,0.455755,-0.425356
2013-01-01,1.379564,-0.455298,-0.292994,-1.293324
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332
2013-01-02,-0.147718,0.24918,-0.014028,1.408846
2013-01-05,0.617664,0.476167,0.042961,-0.186218
2013-01-06,-0.627526,1.277547,-1.965151,-0.748369


## Getting
### column

In [21]:
# Single column: df["column_name"] returns that column as a Series.
df["A"]

2013-01-01    1.379564
2013-01-02   -0.147718
2013-01-03   -0.463290
2013-01-04   -0.954953
2013-01-05    0.617664
2013-01-06   -0.627526
Freq: D, Name: A, dtype: float64

In [22]:
# Attribute-style access, df.column_name, selects the same column.
df.A

2013-01-01    1.379564
2013-01-02   -0.147718
2013-01-03   -0.463290
2013-01-04   -0.954953
2013-01-05    0.617664
2013-01-06   -0.627526
Freq: D, Name: A, dtype: float64

In [23]:
# Multiple columns: df[list_of_column_names]
df[[ "A","B"]] # the outer brackets do the indexing; the inner brackets are the list of names

Unnamed: 0,A,B
2013-01-01,1.379564,-0.455298
2013-01-02,-0.147718,0.24918
2013-01-03,-0.46329,-0.386975
2013-01-04,-0.954953,-1.434796
2013-01-05,0.617664,0.476167
2013-01-06,-0.627526,1.277547


### row

In [24]:
# Slicing rows
df[0:3] # integer row positions can be used
# [start (included):stop (excluded)]

Unnamed: 0,A,B,C,D
2013-01-01,1.379564,-0.455298,-0.292994,-1.293324
2013-01-02,-0.147718,0.24918,-0.014028,1.408846
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332


In [25]:
# Slicing rows by label
df["20130102":"20130104"] # when slicing by label, both the start and the end are included

Unnamed: 0,A,B,C,D
2013-01-02,-0.147718,0.24918,-0.014028,1.408846
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332
2013-01-04,-0.954953,-1.434796,0.455755,-0.425356


## Selection by label

### .loc[ ] : 레이블을 이용한 행선택


In [26]:
# df.loc[row_label] returns that row as a Series indexed by the column names (not a list)
df.loc["2013-01-01"]


A    1.379564
B   -0.455298
C   -0.292994
D   -1.293324
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
# The first label of the index, as a Timestamp.
dates[0]

Timestamp('2013-01-01 00:00:00', freq='D')

In [28]:
# A Timestamp can be used as the row label as well.
df.loc[dates[0]]

A    1.379564
B   -0.455298
C   -0.292994
D   -1.293324
Name: 2013-01-01 00:00:00, dtype: float64

### .loc[행,열]

In [29]:
# Select several columns (":" keeps every row)
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,1.379564,-0.455298
2013-01-02,-0.147718,0.24918
2013-01-03,-0.46329,-0.386975
2013-01-04,-0.954953,-1.434796
2013-01-05,0.617664,0.476167
2013-01-06,-0.627526,1.277547


In [30]:
# Take part of the rows and columns with label slices; both endpoints are included
df.loc["20130102":"20130104", "A":"C"]

Unnamed: 0,A,B,C
2013-01-02,-0.147718,0.24918,-0.014028
2013-01-03,-0.46329,-0.386975,0.369412
2013-01-04,-0.954953,-1.434796,0.455755


In [31]:
# Pick several specific rows and columns with lists of labels
df.loc[["20130101","20130105"],["A","C"]]

Unnamed: 0,A,C
2013-01-01,1.379564,-0.292994
2013-01-05,0.617664,0.042961


In [32]:
# Selecting a single row label returns a Series (not a list)
df.loc["20130104",["A","B"]]

A   -0.954953
B   -1.434796
Name: 2013-01-04 00:00:00, dtype: float64

In [33]:
# Getting a scalar value
df.loc[dates[0],"A"]

1.3795642320277886

In [34]:
# Faster way to get a single scalar value
df.at[dates[0], "A"]

1.3795642320277886

## Selection by position

### .iloc[위치정수] : 행 번호를 통한 행 선택

In [35]:
df.iloc[3] # returns the row at integer position 3 as a Series (not a list)

A   -0.954953
B   -1.434796
C    0.455755
D   -0.425356
Name: 2013-01-04 00:00:00, dtype: float64

### .iloc[행번호, 열번호]

slicing

In [36]:
# Position slices exclude the stop value: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.954953,-1.434796
2013-01-05,0.617664,0.476167


In [37]:
# All columns
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.147718,0.24918,-0.014028,1.408846
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332


In [38]:
# All rows
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.455298,-0.292994
2013-01-02,0.24918,-0.014028
2013-01-03,-0.386975,0.369412
2013-01-04,-1.434796,0.455755
2013-01-05,0.476167,0.042961
2013-01-06,1.277547,-1.965151


list

In [39]:
# Lists of integer positions pick out specific rows and columns.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.147718,-0.014028
2013-01-03,-0.46329,0.369412
2013-01-05,0.617664,0.042961


scalar

In [40]:
# Scalar at row position 1, column position 1.
df.iloc[1,1]

0.24917954526989078

In [41]:
# Same scalar through .iat (the pandas user guide presents it as the fast scalar accessor).
df.iat[1,1]

0.24917954526989078

## Boolean indexing

### df[값 조건]

In [42]:
df[df["A"]>0]  # keeps the rows whose column A value meets the condition, with all of their columns

Unnamed: 0,A,B,C,D
2013-01-01,1.379564,-0.455298,-0.292994,-1.293324
2013-01-05,0.617664,0.476167,0.042961,-0.186218


In [43]:
df[df>0] # tests every value; values failing the condition become NaN (the result is still a DataFrame)

Unnamed: 0,A,B,C,D
2013-01-01,1.379564,,,
2013-01-02,,0.24918,,1.408846
2013-01-03,,,0.369412,
2013-01-04,,,0.455755,
2013-01-05,0.617664,0.476167,0.042961,
2013-01-06,,1.277547,,


### isin() method

In [44]:
# Work on a copy of df and add a string column E to filter on.
df3=df.copy()
df3["E"]=["one", "one", "two", "three", "four", "three"]
df3

Unnamed: 0,A,B,C,D,E
2013-01-01,1.379564,-0.455298,-0.292994,-1.293324,one
2013-01-02,-0.147718,0.24918,-0.014028,1.408846,one
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332,two
2013-01-04,-0.954953,-1.434796,0.455755,-0.425356,three
2013-01-05,0.617664,0.476167,0.042961,-0.186218,four
2013-01-06,-0.627526,1.277547,-1.965151,-0.748369,three


In [48]:
# isin() keeps the rows whose E value is one of the listed values.
df3[df3["E"].isin(["two","four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332,two
2013-01-05,0.617664,0.476167,0.042961,-0.186218,four


## Setting

In [51]:
# Series of the integers 1-6, indexed by six consecutive days starting at 2013-01-02.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start="20130102", periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [55]:
# Add a new column. The values are aligned on the index, so 2013-01-01
# (absent from s1) becomes NaN and s1's 2013-01-07 entry is not added.
df["F"]=s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.379564,-0.455298,-0.292994,-1.293324,
2013-01-02,-0.147718,0.24918,-0.014028,1.408846,1.0
2013-01-03,-0.46329,-0.386975,0.369412,-0.888332,2.0
2013-01-04,-0.954953,-1.434796,0.455755,-0.425356,3.0
2013-01-05,0.617664,0.476167,0.042961,-0.186218,4.0
2013-01-06,-0.627526,1.277547,-1.965151,-0.748369,5.0


### by label

In [57]:
# Read a single value by row and column label.
df.at[dates[0],"A"]

1.3795642320277886

In [58]:
# Assign a single value by label, then read it back to confirm the change.
df.at[dates[0],"A"]=0
df.at[dates[0],"A"]

0.0