In [2]:
import pandas as pd
import numpy as np

In [3]:
# Enable multiple outputs from a single cell: with "all", every expression
# statement in a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [15]:
# Render floats with three digits after the decimal point.
pd.set_option('display.float_format', "{:0.3f}".format)

In [5]:
# Show every column when displaying a DataFrame (None removes the limit).
# The fully qualified key is used because the short form 'max_columns' is
# matched as a pattern and is ambiguous on pandas >= 1.4, where it also
# matches 'styler.render.max_columns' and raises OptionError.
pd.set_option('display.max_columns', None)

# `Series` Data type

* 문자열로 인덱싱이 가능하다.

* 한가지 data type만 가질 수 있다.

In [2]:
import pandas as pd

In [3]:
# Without an explicit index, the default labels 0, 1, 2, 3, ... are used.
a = pd.Series(data=[1, 2, 3, 4])
a

0    1
1    2
2    3
3    4
dtype: int64

In [17]:
# Create a Series from a list, attaching explicit string labels as the index.
a = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
a

a    1
b    2
c    3
d    4
dtype: int64

In [5]:
# Create a Series from a dict: the keys become the index labels and the
# values become the data.
a = pd.Series(dict(a=1, b=2, c=3, d=4))
a

a    1
b    2
c    3
d    4
dtype: int64

In [6]:
# Pressing Shift + Tab shows the documentation of the function.
a.head()

a    1
b    2
c    3
d    4
dtype: int64

## `nan`과 관련된 함수

* 일반적으로 `count()`, `value_counts()` 등은 nan을 제외하고 연산을 수행하지만, `len()`과 `unique()`는 nan을 포함하여 연산한다.

In [7]:
import numpy as np

In [9]:
# nan is defined in NumPy.
np.nan

nan

In [11]:
# When a nan is present, the Series dtype becomes float64.
a = pd.Series(data=[10, 9, 1, 2, 3, np.nan])
a

0    10.0
1     9.0
2     1.0
3     2.0
4     3.0
5     NaN
dtype: float64

In [12]:
# len() counts the nan entry as well.
len(a)

6

In [14]:
# count() does not include nan (check the Shift + Tab documentation).
a.count()

5

In [15]:
# unique() includes nan.
# Return type: ndarray
a.unique()

array([10.,  9.,  1.,  2.,  3., nan])

In [16]:
# value_counts() does not include nan.
# Return type: Series
a.value_counts()

10.0    1
9.0     1
1.0     1
2.0     1
3.0     1
dtype: int64

## Series 객체는 index를 기준으로 연산이 수행된다.

In [18]:
# Two Series carrying the same labels, listed in opposite order.
s3 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s4 = pd.Series(data=[4, 3, 2, 1], index=list('dcba'))

In [21]:
# Addition pairs values by index label, not by position
# (s4 lists the same labels in reverse order).
s3 + s4

a    2
b    4
c    6
d    8
dtype: int64

# DataFrame data type

* 다수의 Series 객체를 하나의 변수로 관리하기 위해 만든 자료형

* Series를 값으로 갖는 dict 형태이다.

    {'컬럼명1' : Series1, '컬럼명2' : Series2}

* 각 Series는 DataFrame의 column을 이룬다.

* DataFrame을 이루는 모든 Series(column)는 하나의 index를 공유한다.

    -> 생성 시 Series 간의 index가 서로 다르면, 데이터 손실을 최소화하기 위해 index의 합집합으로 병합되고 값이 없는 자리는 NaN으로 채워진다.

## DataFrame을 만드는 다양한 방법들

In [23]:
# Method 1: create from a nested list or from an ndarray.
a = pd.DataFrame(data=[[1, 2], [3, 4]])
a
b = pd.DataFrame(data=np.array([[1, 2], [3, 4]]))
b

Unnamed: 0,0,1
0,1,2
1,3,4


Unnamed: 0,0,1
0,1,2
1,3,4


In [28]:
# Method 2: supply the data row by row (not used very often).
a = pd.DataFrame(
    [pd.Series(np.arange(first, first + 5)) for first in (1, 6)]
)
a
b = pd.DataFrame(
    [np.arange(first, first + 5) for first in (1, 6)]
)
b

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10


Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10


In [31]:
# Method 3: pass row labels (index) and column names explicitly.
a = pd.DataFrame(data=[[1, 2], [3, 4]], index=[0, 1], columns=list('ab'))
a

Unnamed: 0,a,b
0,1,2
1,3,4


In [32]:
# Method 4: create from a dict; each key is a column name and each value
# holds that column's data.
a = pd.DataFrame(dict(a=[1, 2], b=[3, 4]))
a

Unnamed: 0,a,b
0,1,3
1,2,4


In [33]:
# Even for a DataFrame holding a single value per column, each dict value
# is given as an iterable rather than a bare scalar.
# Iterable data: list, np.array, pd.Series()
a = pd.DataFrame(dict(a=[1], b=[2]))
a

Unnamed: 0,a,b
0,1,2


## DataFrame 생성시, Series 간에 index를 기준으로 자동정렬한다.

In [13]:
# s1 and s2 use the default labels 0..4; s3 has explicit labels 1, 2, 10,
# so it only partially overlaps with the other two.
s1 = pd.Series(data=np.arange(1, 6))
s2 = pd.Series(data=np.arange(6, 11))
s3 = pd.Series(data=np.arange(12, 15), index=[1, 2, 10])
s1
s2
s3

0    1
1    2
2    3
3    4
4    5
dtype: int32

0     6
1     7
2     8
3     9
4    10
dtype: int32

1     12
2     13
10    14
dtype: int32

In [16]:
# Combine the three Series as columns c1, c2, c3. Their indexes differ, so
# the rows are aligned by label and missing positions show up as NaN.
df = pd.DataFrame(dict(c1=s1, c2=s2, c3=s3))
df

Unnamed: 0,c1,c2,c3
0,1.0,6.0,
1,2.0,7.0,12.0
2,3.0,8.0,13.0
3,4.0,9.0,
4,5.0,10.0,
10,,,14.0


## DataFrame에 새로운 column 추가하기

In [38]:
# Add a new column 'c4' from a Series labelled 2, 3, 4, 10.
df['c4'] = pd.Series(data=[1, 2, 3, 4], index=[2, 3, 4, 10])

## reindexing

* 새로운 index를 기반으로 index-value mapping을 유지한 채 재배열한다.

### index 자체를 바꾸는 것(index-value mapping이 깨짐)

In [6]:
# A Series with the default integer labels 0..4.
s = pd.Series(data=[1, 2, 3, 4, 5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [7]:
# Assigning to .index relabels the existing values in place, by position:
# the values stay where they are and only the labels change.
s.index = ['a', 'b', 'c', 'd', 'e']
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

### `set_index()`: 특정 column을 index로 만든다.

In [18]:
# (Re)assign column 'c4' from a Series labelled 0..4. The values are matched
# to df's rows by label, so row 10 has no matching label and is shown as NaN.
df['c4'] = pd.Series([1, 2, 3, 4, 5])
df

Unnamed: 0,c1,c2,c3,c4
0,1.0,6.0,,1.0
1,2.0,7.0,12.0,2.0
2,3.0,8.0,13.0,3.0
3,4.0,9.0,,4.0
4,5.0,10.0,,5.0
10,,,14.0,


In [20]:
# set_index() returns a DataFrame whose index is taken from column 'c4'
# (the result is only displayed here, not assigned).
df.set_index('c4')

Unnamed: 0_level_0,c1,c2,c3
c4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1.0,6.0,
2.0,2.0,7.0,12.0
3.0,3.0,8.0,13.0
4.0,4.0,9.0,
5.0,5.0,10.0,
,,,14.0


### reindex(기존 index label의 값은 그대로 유지하고, 기존에 없던 label의 값은 NaN으로 채움)

In [21]:
# Display s2 as it is before reindexing.
s2

0     6
1     7
2     8
3     9
4    10
dtype: int32

In [22]:
# Reindex s2 to the labels 1, 2, 5, 7: labels 1 and 2 keep their existing
# values, while labels 5 and 7 do not exist in s2 and come out as NaN
# (which is also why the result dtype is float64).
new = s2.reindex(
    [1, 2, 5, 7]
)
new

1   7.000
2   8.000
5     NaN
7     NaN
dtype: float64

In [23]:
# Overwrite the value stored under index label 2.
new[2] = 0
new

1   7.000
2   0.000
5     NaN
7     NaN
dtype: float64

### `reindex()`의 유용한 arguments

* `fill_value`

In [33]:
# Copy the Series object.
# NOTE(review): the recorded output of this cell (labels a, b, c) does not
# match the integer-labelled s2 defined earlier in this notebook; it matches
# the s2 that a later cell relabels to ['a', 'b', 'c'], which suggests the
# cells were executed out of order. Confirm with Restart & Run All.
copied = s2.copy()
copied

a    3
b    4
c    5
dtype: int64

In [35]:
# Build the string-labelled Series inside this cell instead of relying on s2:
# on a top-to-bottom run s2 still carries the integer labels 0..4 at this
# point, so reindexing it with 'a', 'b', 'd', 'e' would match nothing and
# produce only fill values.
labeled = pd.Series([3, 4, 5], index=['a', 'b', 'c'])

# fill_value supplies the value for labels that are absent from the original
# index ('d' and 'e'). Existing labels ('a', 'b') keep their data, and 'c' is
# dropped because it is not requested.
filled = labeled.reindex(
    ['a', 'b', 'd', 'e'],
    fill_value = 0
)
filled

a    3
b    4
d    0
e    0
dtype: int64

* `method`

In [37]:
# method="ffill" fills each newly created label with the value of the
# closest preceding existing label (0 -> 'red', 3 -> 'blue', 5 -> 'green').
s3 = pd.Series(data=['red', 'blue', 'green'], index=[0, 3, 5])
s3.reindex(np.arange(10), method="ffill")

0      red
1      red
2      red
3     blue
4     blue
5    green
6    green
7    green
8    green
9    green
dtype: object

### index 관련 자주 실수하는 사항

In [24]:
# s1 is labelled with integers, s2 with the strings '0', '1', '2'.
# They print alike but are different labels.
s1 = pd.Series(data=[0, 1, 2], index=[0, 1, 2])
s2 = pd.Series(data=[3, 4, 5], index=list('012'))
s1
s2

0    0
1    1
2    2
dtype: int64

0    3
1    4
2    5
dtype: int64

In [26]:
# Arithmetic is carried out by index label, but string labels and integer
# labels are treated as separate labels, so no pair matches and every
# result is NaN.
s1 + s2

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [27]:
# Reindexing s2 (string labels) with s1's integer labels finds no matching
# label, so every value in the result is NaN.
new = s2.reindex(s1.index)
new

0   NaN
1   NaN
2   NaN
dtype: float64

In [30]:
# First fix: make the index dtypes agree by converting s2's own string
# labels ('0', '1', '2') to integers. This converts s2's labels rather than
# copying s1's index onto s2, so each value keeps the label it had.
s2.index = s2.index.astype(int)

In [31]:
# With matching integer labels on both sides, the values are added label by label.
s1 + s2

0    3
1    5
2    7
dtype: int64

In [32]:
# Second fix: give both Series the same new set of labels.
s1.index = ['a', 'b', 'c']
s2.index = ['a', 'b', 'c']
s1 + s2

a    3
b    5
c    7
dtype: int64

### reindex 관련 예제

In [45]:
!pip install finance_datareader==0.9.1

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting finance_datareader==0.9.1
  Downloading finance_datareader-0.9.1-py3-none-any.whl (17 kB)
Collecting lxml
  Downloading lxml-4.8.0-cp39-cp39-win_amd64.whl (3.6 MB)
Collecting requests-file
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, lxml, finance-datareader
Successfully installed finance-datareader-0.9.1 lxml-4.8.0 requests-file-1.5.1


You should consider upgrading via the 'c:\users\ignis535\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [47]:
!pip install BeautifulSoup4

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting BeautifulSoup4
  Downloading beautifulsoup4-4.10.0-py3-none-any.whl (97 kB)
Collecting soupsieve>1.2
  Downloading soupsieve-2.3.1-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.10.0 soupsieve-2.3.1


You should consider upgrading via the 'c:\users\ignis535\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [48]:
import FinanceDataReader as fdr

In [50]:
# Samsung Electronics (code 005930), requested from 2018-01-02 to 2018-10-30.
df1 = fdr.DataReader("005930", "2018-01-02", "2018-10-30")

# KODEX 200 (ETF, code 069500); note the start date here is 2018-01-03,
# one day later than the one used for df1.
df2 = fdr.DataReader("069500", "2018-01-03", "2018-10-30")

In [51]:
# Show the first two and the last two rows of df1 to check its date range.
df1.head(2)
df1.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,51380,51400,50780,51020,169485,0.001
2018-01-03,52540,52560,51420,51620,200270,0.012


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-29,40850,41950,40550,41400,14460521,0.01
2018-10-30,41400,43000,41000,42350,14205190,0.023


In [52]:
# Show the first two and the last two rows of df2 to check its date range.
df2.head(2)
df2.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-03,30393,30484,30320,30396,7250553,0.004
2018-01-04,30567,30582,30145,30156,8914121,-0.008


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-29,24742,24906,24410,24459,5301352,-0.008
2018-10-30,24424,24847,24336,24628,8010749,0.007


In [65]:
# Samsung Electronics (code 005930).
df1 = fdr.DataReader("005930", "2018-01-02", "2018-10-30")

# KODEX 200 (ETF, code 069500), re-fetched with the same start date as df1
# so that both frames are requested over the same date range.
df2 = fdr.DataReader("069500", "2018-01-02", "2018-10-30")

In [66]:
# Compare the (rows, columns) shape of the two frames.
df1.shape
df2.shape

(202, 6)

(202, 6)

In [None]:
# Remove the 2018-01-03 row from df2 to create a date that exists in df1
# but not in df2. errors="ignore" keeps the cell idempotent: re-running it
# after the row is already gone would otherwise raise KeyError.
df2 = df2.drop(pd.to_datetime("2018-01-03"), errors="ignore")

In [71]:
# Display df2 to confirm that the 2018-01-03 row is no longer present.
df2

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,30245,30334,30150,30267,5016257,0.004
2018-01-04,30567,30582,30145,30156,8914121,-0.008
2018-01-05,30232,30558,30232,30566,8121543,0.014
2018-01-08,30687,30815,30517,30771,8023240,0.007
2018-01-09,30678,30898,30512,30650,7970365,-0.004
...,...,...,...,...,...,...
2018-10-24,25722,25744,25415,25462,12097024,-0.005
2018-10-25,24930,25065,24689,25056,11679604,-0.016
2018-10-26,25079,25084,24433,24658,8230431,-0.016
2018-10-29,24742,24906,24410,24459,5301352,-0.008


In [72]:
# Re-align df2 to df1's dates. The date that df2 lacks (2018-01-03)
# appears as a row filled with NaN.
new_df2 = df2.reindex(df1.index)
new_df2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,30245.0,30334.0,30150.0,30267.0,5016257.0,0.004
2018-01-03,,,,,,
2018-01-04,30567.0,30582.0,30145.0,30156.0,8914121.0,-0.008
2018-01-05,30232.0,30558.0,30232.0,30566.0,8121543.0,0.014
2018-01-08,30687.0,30815.0,30517.0,30771.0,8023240.0,0.007


In [73]:
# Forward-fill: each NaN takes the value from the closest preceding row, so
# the missing 2018-01-03 row repeats the 2018-01-02 values.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated
# in pandas 2.1 and later.
new_df2.ffill()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,30245.000,30334.000,30150.000,30267.000,5016257.000,0.004
2018-01-03,30245.000,30334.000,30150.000,30267.000,5016257.000,0.004
2018-01-04,30567.000,30582.000,30145.000,30156.000,8914121.000,-0.008
2018-01-05,30232.000,30558.000,30232.000,30566.000,8121543.000,0.014
2018-01-08,30687.000,30815.000,30517.000,30771.000,8023240.000,0.007
...,...,...,...,...,...,...
2018-10-24,25722.000,25744.000,25415.000,25462.000,12097024.000,-0.005
2018-10-25,24930.000,25065.000,24689.000,25056.000,11679604.000,-0.016
2018-10-26,25079.000,25084.000,24433.000,24658.000,8230431.000,-0.016
2018-10-29,24742.000,24906.000,24410.000,24459.000,5301352.000,-0.008
