# Pandas basic
- python에서 R만큼의 강력한 데이터 핸들링 성능을 제공하는 모듈
- 단일 프로세스에서는 최대 효율
- 코딩 가능하고 응용 가능한 엑셀로 받아들여도 됨
- 누군가 스테로이드를 맞은 엑셀로 표현함

# 01. Analysis Seoul CCTV

In [73]:
import pandas as pd
import numpy as np

In [74]:
pd.Series([1,3,5,7])

In [75]:
pd.Series([1,3,5,7], dtype = float64) #에러

NameError: name 'float64' is not defined

## 구글링으로 pandas 공식문서에 문법 확인

In [76]:
pd.Series([1,3,5,7], dtype = np.float64) 

0    1.0
1    3.0
2    5.0
3    7.0
dtype: float64

In [77]:
pd.Series([1,3,5,7], dtype = str)

0    1
1    3
2    5
3    7
dtype: object

In [78]:
pd.Series(np.array([1,2,3]))

0    1
1    2
2    3
dtype: int32

In [79]:
pd.Series({"Key" : "Value"})

Key    Value
dtype: object

In [80]:
data = pd.Series([1,2,3,4,"5"])
data

0    1
1    2
2    3
3    4
4    5
dtype: object

In [82]:
data % 2

TypeError: not all arguments converted during string formatting

In [83]:
data = pd.Series([1,2,3,4])
data

0    1
1    2
2    3
3    4
dtype: int64

In [84]:
data % 2

0    1
1    0
2    1
3    0
dtype: int64

### Series는 한 가지 데이터 타입을 가지며, index와 value로 구성된다.

#### Pandas의 데이터 형을 구성하는 기본은 'Series'이다.

In [85]:
s = pd.Series([1,3,5,np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

date_range() 함수

In [86]:
dates = pd.date_range("20130101", periods = 6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

#### Pandas에서 가장 많이 사용되는 데이터형은 'DataFrame' 이고
#### index(row느낌)와 columns를 지정하면 된다.

In [87]:
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-05,0.96067,-0.755891,0.811971,1.413437
2013-01-06,-0.724948,-2.687743,2.213685,-0.126405


In [88]:
#함수종류
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-05,0.96067,-0.755891,0.811971,1.413437


In [89]:
df.index #인덱스확인

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [90]:
df.columns #컬럼확인

Index(['A', 'B', 'C', 'D'], dtype='object')

In [91]:
df.values #값만확인

array([[ 0.04841525, -0.11500318, -0.30937985, -1.72035265],
       [-0.3041485 ,  1.10803581,  0.72918163,  0.20424348],
       [-0.38094969, -2.28311973, -0.18308235,  1.19875718],
       [ 0.56864032,  1.90270425, -1.02126963,  0.89971641],
       [ 0.96067045, -0.75589079,  0.8119708 ,  1.41343699],
       [-0.72494838, -2.68774297,  2.21368518, -0.12640515]])

In [92]:
df.info() #개괄적으로 DataFrame을 설명

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [93]:
#DataFrame 통계적 기본정보를 확인
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.027947,-0.471836,0.373518,0.311566
std,0.633573,1.81843,1.133714,1.155784
min,-0.724948,-2.687743,-1.02127,-1.720353
25%,-0.361749,-1.901312,-0.277805,-0.043743
50%,-0.127867,-0.435447,0.27305,0.55198
75%,0.438584,0.802276,0.791274,1.123997
max,0.96067,1.902704,2.213685,1.413437


In [94]:
#데이터 정렬
df.sort_values(by="B", ascending = False)

Unnamed: 0,A,B,C,D
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-05,0.96067,-0.755891,0.811971,1.413437
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-06,-0.724948,-2.687743,2.213685,-0.126405


In [95]:
#특정컬럼만 읽고싶을 때
df["A"]

2013-01-01    0.048415
2013-01-02   -0.304148
2013-01-03   -0.380950
2013-01-04    0.568640
2013-01-05    0.960670
2013-01-06   -0.724948
Freq: D, Name: A, dtype: float64

In [96]:
#[n:m] n부터 m-1까지
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757


In [97]:
#인덱스나 컬럼의 이름으로 슬라이스하는 경우는 끝도 포함한다.
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-04,0.56864,1.902704,-1.02127,0.899716


In [98]:
#location 함수
#콜론만 쓰면 모든 행
df.loc[:]

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-05,0.96067,-0.755891,0.811971,1.413437
2013-01-06,-0.724948,-2.687743,2.213685,-0.126405


In [99]:
df.loc[:, ["A", "B"]]
#모든 행중에 A, B 컬럼

Unnamed: 0,A,B
2013-01-01,0.048415,-0.115003
2013-01-02,-0.304148,1.108036
2013-01-03,-0.38095,-2.28312
2013-01-04,0.56864,1.902704
2013-01-05,0.96067,-0.755891
2013-01-06,-0.724948,-2.687743


In [100]:
df.loc["20130102", ["A", "B"]] #이름으로 지정

A   -0.304148
B    1.108036
Name: 2013-01-02 00:00:00, dtype: float64

In [101]:
df.iloc[3] #번호로 접근

A    0.568640
B    1.902704
C   -1.021270
D    0.899716
Name: 2013-01-04 00:00:00, dtype: float64

In [102]:
df.iloc[3:5, 0:2] #-->3,4행 0,1컬럼

Unnamed: 0,A,B
2013-01-04,0.56864,1.902704
2013-01-05,0.96067,-0.755891


In [103]:
df.iloc[[1,2,4],[0,2]] #1,2,4번 행 / 0,2번 컬럼

Unnamed: 0,A,C
2013-01-02,-0.304148,0.729182
2013-01-03,-0.38095,-0.183082
2013-01-05,0.96067,0.811971


df[condition]과 같이 사용하는 것이 일반적
Pandas의 버전에 따라 조금씩 허용되는 문법이 다르다
외부 소스코드를 쓸 때에는 Pandas의 버전을 확인하는 것이 필요하다.

In [104]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-05,0.96067,-0.755891,0.811971,1.413437
2013-01-06,-0.724948,-2.687743,2.213685,-0.126405


조건으로 조회

In [105]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-05,0.96067,-0.755891,0.811971,1.413437


In [106]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,,,
2013-01-02,,1.108036,0.729182,0.204243
2013-01-03,,,,1.198757
2013-01-04,0.56864,1.902704,,0.899716
2013-01-05,0.96067,,0.811971,1.413437
2013-01-06,,,2.213685,


### 컬럼 삽입

In [107]:
df["E"] = ["one", "one", "two", "three", "four", "three"]
df

Unnamed: 0,A,B,C,D,E
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353,one
2013-01-02,-0.304148,1.108036,0.729182,0.204243,one
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757,two
2013-01-04,0.56864,1.902704,-1.02127,0.899716,three
2013-01-05,0.96067,-0.755891,0.811971,1.413437,four
2013-01-06,-0.724948,-2.687743,2.213685,-0.126405,three


### isin() 함수 -  boolean 타입으로 반환

In [108]:
df["E"].isin(["two", "four"])
# When the DataFrame has a column named "E", df["E"] gives the Series for that column

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

### df[] - Series가 True인 경우 'df'의 해당 행이 결과에 포함됩니다. Series가 False이면 df의 해당 행이 제외됩니다.

In [109]:
df[df["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757,two
2013-01-05,0.96067,-0.755891,0.811971,1.413437,four


### 특정 컬럼 제거

In [110]:
del df["E"]
df

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-05,0.96067,-0.755891,0.811971,1.413437
2013-01-06,-0.724948,-2.687743,2.213685,-0.126405


### 각 컬럼 누적합

In [111]:
# NOTE: apply() returns a new DataFrame and does not modify df in place.
# The result is not assigned here and only the last expression (`df`) is
# displayed, so the table shown below is the original df, not the cumulative sum.
df.apply(np.cumsum)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.048415,-0.115003,-0.30938,-1.720353
2013-01-02,-0.304148,1.108036,0.729182,0.204243
2013-01-03,-0.38095,-2.28312,-0.183082,1.198757
2013-01-04,0.56864,1.902704,-1.02127,0.899716
2013-01-05,0.96067,-0.755891,0.811971,1.413437
2013-01-06,-0.724948,-2.687743,2.213685,-0.126405


## DataFrame
- pd.Series()
    - index, value
- pd.DataFrame()
    - index, value, column

In [112]:
pd.DataFrame() # shift + tab - docstring확인가능

In [117]:
#표준정규분포에서 샘플링한 난수 생성
data = np.random.randn(6, 4) #6행 4열
data

array([[-1.12511448,  2.1318122 ,  0.40621616, -0.08692747],
       [-0.61688788, -1.23084707,  0.77109504,  0.89557834],
       [ 1.74789708, -0.11659216, -1.05132155,  0.88231797],
       [-0.61010164,  0.19451338, -0.75112913, -0.99945916],
       [ 2.02813018,  0.8878463 ,  0.31608133,  1.07460061],
       [ 0.21641445, -0.68536973,  0.3550045 , -0.35887858]])

In [119]:
df = pd.DataFrame(data, index=dates, columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-02,-0.616888,-1.230847,0.771095,0.895578
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-06,0.216414,-0.68537,0.355004,-0.358879


### 데이터 프레임 정보탐색
- df.head()

In [120]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-02,-0.616888,-1.230847,0.771095,0.895578
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-05,2.02813,0.887846,0.316081,1.074601


In [121]:
df.tail() #데이터의 수도 파악가능할 수 있다.

Unnamed: 0,A,B,C,D
2013-01-02,-0.616888,-1.230847,0.771095,0.895578
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-06,0.216414,-0.68537,0.355004,-0.358879


In [122]:
df.index # unlike head() and tail(), which are methods, index is an attribute (accessed without parentheses)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [123]:
df.columns # list-like: the column labels come back as a pandas Index

Index(['A', 'B', 'C', 'D'], dtype='object')

- df.info() : 데이터 프레임의 기본 정보 확인

In [124]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 412.0 bytes


- df.describe() : 데이터 프레임의 기술통계 정보 확인

In [125]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.27339,0.196894,0.007658,0.234539
std,1.325556,1.194691,0.72865,0.84149
min,-1.125114,-1.230847,-1.051322,-0.999459
25%,-0.615191,-0.543175,-0.484327,-0.290891
50%,-0.196844,0.038961,0.335543,0.397695
75%,1.365026,0.714513,0.393413,0.892263
max,2.02813,2.131812,0.771095,1.074601


### 데이터 정렬
- sort_values()
- 특정 컬럼(열)을 기준으로 데이터를 정렬

In [127]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-02,-0.616888,-1.230847,0.771095,0.895578
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-01,-1.125114,2.131812,0.406216,-0.086927


In [128]:
df.sort_values(by="B", ascending=False, inplace=True)

In [129]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


### 데이터 선택

In [130]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [131]:
# 한 개 컬럼 선택
df["A"]

2013-01-01   -1.125114
2013-01-05    2.028130
2013-01-04   -0.610102
2013-01-03    1.747897
2013-01-06    0.216414
2013-01-02   -0.616888
Name: A, dtype: float64

In [132]:
type(df["A"])

pandas.core.series.Series

In [133]:
df.A # attribute-style access; per the original note, this works when the column name is a string, not when it is a number

2013-01-01   -1.125114
2013-01-05    2.028130
2013-01-04   -0.610102
2013-01-03    1.747897
2013-01-06    0.216414
2013-01-02   -0.616888
Name: A, dtype: float64

In [134]:
#두 개 이상 컬럼 선택
df[["A", "B"]] #리스트 형태로 대괄호 써주어야 함

Unnamed: 0,A,B
2013-01-01,-1.125114,2.131812
2013-01-05,2.02813,0.887846
2013-01-04,-0.610102,0.194513
2013-01-03,1.747897,-0.116592
2013-01-06,0.216414,-0.68537
2013-01-02,-0.616888,-1.230847


### offset index
- [n:m] : n부터 m-1 까지
- 인덱스나 컬럼의 이름으로 slice 하는 경우는 끝을 포함합니다.

In [135]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459


In [137]:
df["20130101":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


- loc : location
- index 이름으로 특정 행, 열을 선택합니다

In [138]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [139]:
df.loc[:, ["A", "B"]] #index는 전부 column은 A B만 가져오기

Unnamed: 0,A,B
2013-01-01,-1.125114,2.131812
2013-01-05,2.02813,0.887846
2013-01-04,-0.610102,0.194513
2013-01-03,1.747897,-0.116592
2013-01-06,0.216414,-0.68537
2013-01-02,-0.616888,-1.230847


In [140]:
df.loc["20130102":"20130104", ["A", "D"]]

Unnamed: 0,A,D
2013-01-04,-0.610102,-0.999459
2013-01-03,1.747897,0.882318
2013-01-02,-0.616888,0.895578


In [141]:
df.loc["20130102":"20130104", "A":"D"]

Unnamed: 0,A,B,C,D
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [142]:
df.loc["20130102", ["A", "B"]]

A   -0.616888
B   -1.230847
Name: 2013-01-02 00:00:00, dtype: float64

- iloc : integer location
    - 컴퓨터가 인식하는 인덱스 값으로 선택

In [143]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [144]:
df.iloc[3]

A    1.747897
B   -0.116592
C   -1.051322
D    0.882318
Name: 2013-01-03 00:00:00, dtype: float64

In [145]:
df.iloc[3, 2]

-1.0513215450651867

In [146]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-03,1.747897,-0.116592
2013-01-06,0.216414,-0.68537


In [147]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-05,2.02813,0.316081
2013-01-04,-0.610102,-0.751129
2013-01-06,0.216414,0.355004


In [148]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,2.131812,0.406216
2013-01-05,0.887846,0.316081
2013-01-04,0.194513,-0.751129
2013-01-03,-0.116592,-1.051322
2013-01-06,-0.68537,0.355004
2013-01-02,-1.230847,0.771095


### condition

In [149]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [150]:
# A 컬럼에서 양수만 선택
df["A"] > 0

2013-01-01    False
2013-01-05     True
2013-01-04    False
2013-01-03     True
2013-01-06     True
2013-01-02    False
Name: A, dtype: bool

In [151]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879


In [152]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,2.131812,0.406216,
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,,0.194513,,
2013-01-03,1.747897,,,0.882318
2013-01-06,0.216414,,0.355004,
2013-01-02,,,0.771095,0.895578


- NaN : Not a Number

### 컬럼 추가
- 기존 컬럼이 없으면 추가
- 기존 컬럼이 있으면 수정

In [153]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [160]:
df["E"] = ["one", "one", "two", "three", "four", "six"]
df

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.125114,2.131812,0.406216,-0.086927,one
2013-01-05,2.02813,0.887846,0.316081,1.074601,one
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459,two
2013-01-03,1.747897,-0.116592,-1.051322,0.882318,three
2013-01-06,0.216414,-0.68537,0.355004,-0.358879,four
2013-01-02,-0.616888,-1.230847,0.771095,0.895578,six


- isin() : 특정 요소가 있는지 확인

In [161]:
df["E"].isin(["two"])

2013-01-01    False
2013-01-05    False
2013-01-04     True
2013-01-03    False
2013-01-06    False
2013-01-02    False
Name: E, dtype: bool

In [162]:
df["E"].isin(["two", "five"])

2013-01-01    False
2013-01-05    False
2013-01-04     True
2013-01-03    False
2013-01-06    False
2013-01-02    False
Name: E, dtype: bool

In [163]:
df["E"].isin(["two", "five", "three"])

2013-01-01    False
2013-01-05    False
2013-01-04     True
2013-01-03     True
2013-01-06    False
2013-01-02    False
Name: E, dtype: bool

In [164]:
df[df["E"].isin(["two", "five", "three"])]

Unnamed: 0,A,B,C,D,E
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459,two
2013-01-03,1.747897,-0.116592,-1.051322,0.882318,three


### 특정 컬럼 제거
- del
- drop

In [166]:
del df["E"]
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [171]:
df.drop(["D"], axis = 1) # axis=0 drops rows (by index label) / axis=1 drops columns

Unnamed: 0,A,B,C
2013-01-01,-1.125114,2.131812,0.406216
2013-01-05,2.02813,0.887846,0.316081
2013-01-04,-0.610102,0.194513,-0.751129
2013-01-03,1.747897,-0.116592,-1.051322
2013-01-06,0.216414,-0.68537,0.355004
2013-01-02,-0.616888,-1.230847,0.771095


In [174]:
df.drop(["20130104"])

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


### apply()

In [175]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [176]:
df["A"].apply("sum")

1.6403377163832855

In [177]:
df["A"].apply("mean")

0.27338961939721423

In [178]:
df["A"].apply("min"), df["A"].apply("max")

(-1.125114482270331, 2.0281301802503413)

In [180]:
df[["A", "D"]].apply("sum")

A    1.640338
D    1.407232
dtype: float64

In [181]:
df["A"].apply(np.sum) # as the output shows, np.sum is applied to each element here, so the values come back unchanged (compare apply("sum") above)

2013-01-01   -1.125114
2013-01-05    2.028130
2013-01-04   -0.610102
2013-01-03    1.747897
2013-01-06    0.216414
2013-01-02   -0.616888
Name: A, dtype: float64

In [182]:
df["A"].apply(np.mean)

2013-01-01   -1.125114
2013-01-05    2.028130
2013-01-04   -0.610102
2013-01-03    1.747897
2013-01-06    0.216414
2013-01-02   -0.616888
Name: A, dtype: float64

In [183]:
df["A"].apply(np.std) # standard deviation; as the output shows, it is computed per element here, so every result is 0.0 rather than one value for the column

2013-01-01    0.0
2013-01-05    0.0
2013-01-04    0.0
2013-01-03    0.0
2013-01-06    0.0
2013-01-02    0.0
Name: A, dtype: float64

In [184]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.125114,2.131812,0.406216,-0.086927
2013-01-05,2.02813,0.887846,0.316081,1.074601
2013-01-04,-0.610102,0.194513,-0.751129,-0.999459
2013-01-03,1.747897,-0.116592,-1.051322,0.882318
2013-01-06,0.216414,-0.68537,0.355004,-0.358879
2013-01-02,-0.616888,-1.230847,0.771095,0.895578


In [187]:
def addsub(num):
    """Return a sign label for *num*.

    Gives "+" when ``num > 0`` holds and "-" in every other case,
    so zero and negative values both map to "-".
    """
    if num > 0:
        return "+"
    return "-"

In [188]:
df["A"].apply(addsub) #함수를 만들어 넣을 수도 있음

2013-01-01    -
2013-01-05    +
2013-01-04    -
2013-01-03    +
2013-01-06    +
2013-01-02    -
Name: A, dtype: object

In [189]:
df["A"].apply(lambda num : "+" if num > 0 else "-")

2013-01-01    -
2013-01-05    +
2013-01-04    -
2013-01-03    +
2013-01-06    +
2013-01-02    -
Name: A, dtype: object