# 10 minutes to pandas

출처: https://pandas.pydata.org/docs/user_guide/10min.html

In [5]:
import numpy as np
import pandas as pd

## Object creation
### creating a series

In [7]:
# Build a Series from a plain list; pandas assigns a default integer index.
# The np.nan entry marks a missing value, and the result is stored as float64.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### Creating a DataFrame

In [8]:
# Six consecutive calendar days starting on 2013-01-01, used as a row index.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# np.random.randn(6, 4) returns a 6x4 array of random samples drawn from the
# standard normal (Gaussian) distribution; rows are labelled by `dates` and
# columns by the letters A-D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.3537,-0.113234,0.022683,0.560711
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326
2013-01-03,0.757383,0.388361,-0.541155,-1.433319
2013-01-04,0.009037,0.277653,1.231518,-0.689989
2013-01-05,-0.260682,1.849582,-1.143801,-0.339334
2013-01-06,0.384528,-0.717406,-0.189581,0.302414


In [13]:
# Build a DataFrame from a dict: each key becomes a column label.
# Scalar values ("A", "B", "F") are repeated for every row, while the
# Series / array / Categorical values supply one entry per row.
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
    },
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [14]:
# Column dtypes: each column of a DataFrame can hold a different type.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [None]:
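# What is an attribute? -> A value stored on an object and read without
# parentheses (e.g. df2.dtypes), unlike a method, which is called with ().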

## Viewing data

### view the TOP/BOTTOM of the frame 

In [18]:
# Show the first rows of the frame.
# Defaults to 5 rows; pass a number to change how many are returned.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.3537,-0.113234,0.022683,0.560711
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326
2013-01-03,0.757383,0.388361,-0.541155,-1.433319
2013-01-04,0.009037,0.277653,1.231518,-0.689989
2013-01-05,-0.260682,1.849582,-1.143801,-0.339334


In [21]:
# Show the last rows of the frame (here the last 3).
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.009037,0.277653,1.231518,-0.689989
2013-01-05,-0.260682,1.849582,-1.143801,-0.339334
2013-01-06,0.384528,-0.717406,-0.189581,0.302414


### Index and Columns

index : 행 목록

In [85]:
df.index  # an attribute, not a method, so it is accessed without parentheses

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

columns : 열 목록

In [86]:
df.columns  # attribute holding the column labels (no parentheses)

Index(['A', 'B', 'C', 'D'], dtype='object')

### summary of data

In [25]:
# Quick statistical summary of each column (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.130869,0.269597,-0.264453,-0.288974
std,0.418019,0.865425,0.856564,0.716728
min,-0.3537,-0.717406,-1.143801,-1.433319
25%,-0.193252,-0.101768,-0.860076,-0.602325
50%,0.128844,0.105141,-0.365368,-0.23683
75%,0.350558,0.360684,-0.030383,0.193229
max,0.757383,1.849582,1.231518,0.560711


### Transposing (행렬전환)

In [26]:
df.T # attribute returning the transpose (rows and columns swapped)

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.3537,0.24865,0.757383,0.009037,-0.260682,0.384528
B,-0.113234,-0.067372,0.388361,0.277653,1.849582,-0.717406
C,0.022683,-0.966383,-0.541155,1.231518,-1.143801,-0.189581
D,0.560711,-0.134326,-1.433319,-0.689989,-0.339334,0.302414


### Sorting

by axis : 행/열 목록을 기준으로 정렬

In [87]:
df.sort_index(axis =1, ascending=False)  # sort the columns by label in descending order
# axis: which axis to sort along / axis=0: index (default) / axis=1: columns

Unnamed: 0,D,C,B,A
2013-01-01,0.560711,0.022683,-0.113234,-0.3537
2013-01-02,-0.134326,-0.966383,-0.067372,0.24865
2013-01-03,-1.433319,-0.541155,0.388361,0.757383
2013-01-04,-0.689989,1.231518,0.277653,0.009037
2013-01-05,-0.339334,-1.143801,1.849582,-0.260682
2013-01-06,0.302414,-0.189581,-0.717406,0.384528


by values : 값 기준으로 행/열 방향으로 오름/내림차순 정렬

In [88]:
df.sort_values(by="B", axis=0)
# by=<column label>, axis=0 (or "index"): rows are reordered by the values in that column
# by=<row label>, axis=1 (or "columns"): columns are reordered by the values in that row

Unnamed: 0,A,B,C,D
2013-01-06,0.384528,-0.717406,-0.189581,0.302414
2013-01-01,-0.3537,-0.113234,0.022683,0.560711
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326
2013-01-04,0.009037,0.277653,1.231518,-0.689989
2013-01-03,0.757383,0.388361,-0.541155,-1.433319
2013-01-05,-0.260682,1.849582,-1.143801,-0.339334


## Getting
### column

In [50]:
# single column
# df["column label"] returns that column as a Series
df["A"]

2013-01-01   -0.353700
2013-01-02    0.248650
2013-01-03    0.757383
2013-01-04    0.009037
2013-01-05   -0.260682
2013-01-06    0.384528
Freq: D, Name: A, dtype: float64

In [45]:
# df.<column label>: attribute-style access to the same column
df.A

2013-01-01   -0.353700
2013-01-02    0.248650
2013-01-03    0.757383
2013-01-04    0.009037
2013-01-05   -0.260682
2013-01-06    0.384528
Freq: D, Name: A, dtype: float64

In [39]:
# multiple columns: df[list of column labels]
df[[ "A","B"]] # outer brackets: the indexing operator, inner brackets: the list of labels

Unnamed: 0,A,B
2013-01-01,-0.3537,-0.113234
2013-01-02,0.24865,-0.067372
2013-01-03,0.757383,0.388361
2013-01-04,0.009037,0.277653
2013-01-05,-0.260682,1.849582
2013-01-06,0.384528,-0.717406


### row

In [56]:
# slicing rows
df[0:3] # integer row positions are accepted
# [start (inclusive):stop (exclusive)]

Unnamed: 0,A,B,C,D
2013-01-01,-0.3537,-0.113234,0.022683,0.560711
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326
2013-01-03,0.757383,0.388361,-0.541155,-1.433319


In [48]:
# slicing rows by label
df["20130102":"20130104"] # when slicing by label, both the start and the end are included

Unnamed: 0,A,B,C,D
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326
2013-01-03,0.757383,0.388361,-0.541155,-1.433319
2013-01-04,0.009037,0.277653,1.231518,-0.689989


## Selection by label

### .loc[ ] : 레이블을 이용한 행선택


In [59]:
# df.loc[row label]: returns the row as a Series (not a list)
df.loc["2013-01-01"]


A   -0.353700
B   -0.113234
C    0.022683
D    0.560711
Name: 2013-01-01 00:00:00, dtype: float64

In [62]:
# The first element of dates is a Timestamp, so it can be used as a row label.
dates[0]

Timestamp('2013-01-01 00:00:00', freq='D')

In [63]:
# Same row as df.loc["2013-01-01"], selected with a Timestamp label instead of a string.
df.loc[dates[0]]

A   -0.353700
B   -0.113234
C    0.022683
D    0.560711
Name: 2013-01-01 00:00:00, dtype: float64

### .loc[행,열]

In [64]:
# select several columns (":" keeps every row)
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,-0.3537,-0.113234
2013-01-02,0.24865,-0.067372
2013-01-03,0.757383,0.388361
2013-01-04,0.009037,0.277653
2013-01-05,-0.260682,1.849582
2013-01-06,0.384528,-0.717406


In [68]:
# take a subset of rows and columns with label slices (both endpoints are included)
df.loc["20130102":"20130104", "A":"C"]

Unnamed: 0,A,B,C
2013-01-02,0.24865,-0.067372,-0.966383
2013-01-03,0.757383,0.388361,-0.541155
2013-01-04,0.009037,0.277653,1.231518


In [70]:
# pick several rows and columns with lists of labels
df.loc[["20130101","20130105"],["A","C"]]

Unnamed: 0,A,C
2013-01-01,-0.3537,0.022683
2013-01-05,-0.260682,-1.143801


In [71]:
# selecting a single row returns a Series (not a list)
df.loc["20130104",["A","B"]]

A    0.009037
B    0.277653
Name: 2013-01-04 00:00:00, dtype: float64

In [72]:
# getting a scalar value
df.loc[dates[0],"A"]

-0.35370002854079247

In [73]:
# faster way to get a scalar value (same result as df.loc[dates[0], "A"])
df.at[dates[0], "A"]

-0.35370002854079247

## Selection by position

### .iloc[위치정수] : 행 번호를 통한 행 선택

In [75]:
df.iloc[3] # returns the row at integer position 3 as a Series (not a list)

A    0.009037
B    0.277653
C    1.231518
D   -0.689989
Name: 2013-01-04 00:00:00, dtype: float64

### .iloc[행번호, 열번호]

slicing

In [77]:
# rows at positions 3-4 and columns at positions 0-1 (the stop position is excluded)
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.009037,0.277653
2013-01-05,-0.260682,1.849582


In [80]:
# all columns (rows at positions 1-2)
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326
2013-01-03,0.757383,0.388361,-0.541155,-1.433319


In [81]:
# all rows (columns at positions 1-2)
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.113234,0.022683
2013-01-02,-0.067372,-0.966383
2013-01-03,0.388361,-0.541155
2013-01-04,0.277653,1.231518
2013-01-05,1.849582,-1.143801
2013-01-06,-0.717406,-0.189581


list

In [78]:
# lists of integer positions: rows 1, 2, 4 and columns 0, 2
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.24865,-0.966383
2013-01-03,0.757383,-0.541155
2013-01-05,-0.260682,-1.143801


scalar

In [82]:
# scalar at row position 1, column position 1
df.iloc[1,1]

-0.06737201181114272

In [83]:
# .iat: scalar access by integer position; gives the same value as df.iloc[1,1]
df.iat[1,1]

-0.06737201181114272

## Boolean indexing

### df[값 조건]

In [89]:
df[df["A"]>0]  # keeps every column of the rows whose value in column A is greater than 0

Unnamed: 0,A,B,C,D
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326
2013-01-03,0.757383,0.388361,-0.541155,-1.433319
2013-01-04,0.009037,0.277653,1.231518,-0.689989
2013-01-06,0.384528,-0.717406,-0.189581,0.302414


In [91]:
df[df>0] # checks every value and keeps only those > 0; the others come back as missing (result is still a DataFrame)

Unnamed: 0,A,B,C,D
2013-01-01,,,0.022683,0.560711
2013-01-02,0.24865,,,
2013-01-03,0.757383,0.388361,,
2013-01-04,0.009037,0.277653,1.231518,
2013-01-05,,1.849582,,
2013-01-06,0.384528,,,0.302414


### isin() method

In [96]:
# Derive a new frame from df with an extra string column "E" (one label per
# row); df itself is not modified.
df3 = df.assign(E=["one", "one", "two", "three", "four", "three"])
df3

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.3537,-0.113234,0.022683,0.560711,one
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326,one
2013-01-03,0.757383,0.388361,-0.541155,-1.433319,two
2013-01-04,0.009037,0.277653,1.231518,-0.689989,three
2013-01-05,-0.260682,1.849582,-1.143801,-0.339334,four
2013-01-06,0.384528,-0.717406,-0.189581,0.302414,three


Unnamed: 0,A,B,C,D,E
2013-01-01,-0.3537,-0.113234,0.022683,0.560711,one
2013-01-02,0.24865,-0.067372,-0.966383,-0.134326,one
2013-01-03,0.757383,0.388361,-0.541155,-1.433319,two
2013-01-04,0.009037,0.277653,1.231518,-0.689989,three
2013-01-05,-0.260682,1.849582,-1.143801,-0.339334,four
2013-01-06,0.384528,-0.717406,-0.189581,0.302414,three
