In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import date, timedelta

In [2]:
# Scrape the daily call-rate quote table from Naver Finance, one page per
# request, and collect one {'date', 'callrate'} record per table row.
url = 'https://finance.naver.com/marketindex/interestDailyQuote.nhn?marketindexCd=IRR_CALL'
result_list = []
# NOTE(review): the page range is hard-coded; which dates appear on which page
# presumably shifts as the site publishes new quotes -- confirm before re-running.
for num in range(91, 544):
    # A timeout keeps a stalled connection from hanging the whole scrape.
    resp = requests.get(url, params={'page': num}, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'lxml')
    # The first <tr> of each page is skipped (presumably the header row).
    for tr in soup.find_all('tr')[1:]:
        cells = tr.find_all('td')
        if len(cells) < 2:
            # Not a quote row (no date/rate cells); nothing to record.
            continue
        # The date cell is formatted 'YYYY.MM.DD'.
        parts = cells[0].text.strip().split('.')
        result_list.append({
            'date': date(int(parts[0]), int(parts[1]), int(parts[2])),
            # Store the rate as a number so later comparisons are numeric
            # rather than lexicographic string comparisons.
            'callrate': float(cells[1].text.strip()),
        })

In [3]:
# Turn the scraped records into a table ordered by date (oldest first) and show it.
df = pd.DataFrame(result_list).sort_values('date')
df

Unnamed: 0,date,callrate
3170,2005-03-31,3.26
3169,2005-04-01,3.25
3168,2005-04-04,3.24
3167,2005-04-06,3.23
3166,2005-04-07,3.23
...,...,...
4,2017-12-26,1.52
3,2017-12-27,1.51
2,2017-12-28,1.55
1,2017-12-29,1.58


In [4]:
# Discard quotes dated before the first day of the study window (2005-04-01).
# Filtering by date instead of dropping "whatever row is first" keeps this cell
# idempotent: re-running it no longer removes an additional row each time.
df = df[df['date'] >= date(2005, 4, 1)].copy()

In [5]:
# Discard quotes dated after the last day of the study window (2017-12-29).
# Filtering by date replaces the hard-coded row position 3169, which only
# matched one particular scrape and raised IndexError when the cell was re-run.
df = df[df['date'] <= date(2017, 12, 29)].copy()

In [6]:
# Display df again after the preceding cells removed rows from it.
df

Unnamed: 0,date,callrate
3169,2005-04-01,3.25
3168,2005-04-04,3.24
3167,2005-04-06,3.23
3166,2005-04-07,3.23
3165,2005-04-08,3.26
...,...,...
5,2017-12-22,1.51
4,2017-12-26,1.52
3,2017-12-27,1.51
2,2017-12-28,1.55


In [7]:
# Calendar frame with one row per day from `start` to `end` (both inclusive).
# 'date' holds datetime.date objects so it can be joined against the scraped
# quotes; 'callrate' is an all-NaN placeholder column.
start = date(2005, 4, 1)
end = date(2017, 12, 29)
n_days = (end - start).days + 1
# Build every row in a single constructor call rather than enlarging the frame
# one .loc assignment at a time, which reallocates on every iteration.
date_df = pd.DataFrame({
    'date': [start + timedelta(days=offset) for offset in range(n_days)],
    'callrate': np.nan,
})

In [8]:
# Display the calendar frame: one row per day, 'callrate' still empty.
date_df

Unnamed: 0,date,callrate
0,2005-04-01,
1,2005-04-02,
2,2005-04-03,
3,2005-04-04,
4,2005-04-05,
...,...,...
4651,2017-12-25,
4652,2017-12-26,
4653,2017-12-27,
4654,2017-12-28,


In [9]:
# Left-join the scraped quotes onto the full calendar so that every day keeps a
# row; days without a quote get NaN. Both frames carry a 'callrate' column, so
# the result has 'callrate_x' (calendar side) and 'callrate_y' (scraped side).
new_df = date_df.merge(df, how='left', on='date')
new_df

Unnamed: 0,date,callrate_x,callrate_y
0,2005-04-01,,3.25
1,2005-04-02,,
2,2005-04-03,,
3,2005-04-04,,3.24
4,2005-04-05,,
...,...,...,...
4651,2017-12-25,,
4652,2017-12-26,,1.52
4653,2017-12-27,,1.51
4654,2017-12-28,,1.55


In [10]:
# Remove the empty placeholder column that came from the calendar side of the
# merge. errors='ignore' makes the cell safe to re-run: once the column is
# gone, a second run is a no-op instead of a KeyError.
new_df.drop(columns='callrate_x', errors='ignore', inplace=True)

In [11]:
# NOTE(review): the set_index call below is left disabled; later cells read
# 'date' as an ordinary column (e.g. second_df['date']), so it is not made the index.
# new_df = new_df.set_index('date')
# Display the merged frame after the placeholder column was removed.
new_df

Unnamed: 0,date,callrate_y
0,2005-04-01,3.25
1,2005-04-02,
2,2005-04-03,
3,2005-04-04,3.24
4,2005-04-05,
...,...,...
4651,2017-12-25,
4652,2017-12-26,1.52
4653,2017-12-27,1.51
4654,2017-12-28,1.55


In [12]:
# Forward-fill: each day without a quote takes the most recent earlier quote.
# ffill() is the direct equivalent of the deprecated fillna(method='pad').
new_df.ffill(inplace=True)
new_df

Unnamed: 0,date,callrate_y
0,2005-04-01,3.25
1,2005-04-02,3.25
2,2005-04-03,3.25
3,2005-04-04,3.24
4,2005-04-05,3.24
...,...,...
4651,2017-12-25,1.51
4652,2017-12-26,1.52
4653,2017-12-27,1.51
4654,2017-12-28,1.55


In [13]:
# Work on an independent copy: pd.DataFrame(new_df) does not copy the data by
# default, and a later cell still needs new_df with all of its rows intact.
second_df = new_df.copy()
second_df.head(50)

Unnamed: 0,date,callrate_y
0,2005-04-01,3.25
1,2005-04-02,3.25
2,2005-04-03,3.25
3,2005-04-04,3.24
4,2005-04-05,3.24
5,2005-04-06,3.23
6,2005-04-07,3.23
7,2005-04-08,3.26
8,2005-04-09,3.26
9,2005-04-10,3.26


In [14]:
# Skip the first 30 calendar days: those rows have no value 30 days earlier to
# compare against. Deriving the result from new_df by position (rather than
# dropping in place from second_df) keeps the cell idempotent -- re-running it
# no longer removes a further 30 rows each time.
second_df = new_df.iloc[30:].copy()

In [15]:
# Renumber the rows from 0 and discard the old index labels.
second_df = second_df.reset_index(drop=True)

In [16]:
# Display second_df after the leading rows were dropped and the index reset.
second_df

Unnamed: 0,date,callrate_y
0,2005-05-01,3.29
1,2005-05-02,3.29
2,2005-05-03,3.30
3,2005-05-04,3.21
4,2005-05-05,3.21
...,...,...
4621,2017-12-25,1.51
4622,2017-12-26,1.52
4623,2017-12-27,1.51
4624,2017-12-28,1.55


In [17]:
# '한달전 date' ("date one month earlier") is each row's date shifted back by
# the offset below, i.e. 30 days.
lag_offset = pd.DateOffset(30)
second_df['한달전 date'] = second_df['date'] - lag_offset

In [18]:
# Display second_df with the new '한달전 date' column.
second_df

Unnamed: 0,date,callrate_y,한달전 date
0,2005-05-01,3.29,2005-04-01
1,2005-05-02,3.29,2005-04-02
2,2005-05-03,3.30,2005-04-03
3,2005-05-04,3.21,2005-04-04
4,2005-05-05,3.21,2005-04-05
...,...,...,...
4621,2017-12-25,1.51,2017-11-25
4622,2017-12-26,1.52,2017-11-26
4623,2017-12-27,1.51,2017-11-27
4624,2017-12-28,1.55,2017-11-28


In [19]:
# '한달전 callrate' ("call rate one month earlier"): row i of second_df gets the
# rate from row i of new_df. Because second_df starts 30 days into new_df, that
# is the rate from 30 days before. The slice length is taken from second_df
# instead of a hard-coded 4626, and the values are assigned by position so the
# lag does not depend on how the two indexes happen to be labelled.
second_df['한달전 callrate'] = new_df['callrate_y'].iloc[:len(second_df)].to_numpy()

In [20]:
# Give the rate column its plain name now that the merge suffix is no longer needed.
second_df = second_df.rename({'callrate_y': 'callrate'}, axis='columns')

In [21]:
# Take an independent copy: pd.DataFrame(second_df) does not copy the data by
# default, so adding a column to the result could also alter second_df.
final_df = second_df.copy()
# Placeholder 'labeling' column (all zeros); a later cell overwrites it with
# the actual labels.
final_df['labeling'] = np.zeros(len(final_df))

In [22]:
# Label each day by comparing its call rate with the rate 30 days earlier:
# 'up' if higher, 'down' if lower, and the integer 0 if unchanged.
# The comparison is done on numeric values so that rates stored as text are
# not compared character by character.
current = pd.to_numeric(final_df['callrate'])
previous = pd.to_numeric(final_df['한달전 callrate'])

# Start from 'down' and overwrite the other two cases; an object array keeps
# the mixed str/int labels exactly as they are.
# NOTE(review): the unchanged case is the int 0 while the others are strings --
# confirm that downstream consumers expect this mix.
labels = np.full(len(final_df), 'down', dtype=object)
labels[(current > previous).to_numpy()] = 'up'
labels[(current == previous).to_numpy()] = 0

# Assign with [] rather than attribute access, which only updates a column
# when that column already exists.
final_df['labeling'] = labels

In [27]:
# Display the labelled frame.
final_df

Unnamed: 0,date,callrate,한달전 date,한달전 callrate,labeling
0,2005-05-01,3.29,2005-04-01,3.25,up
1,2005-05-02,3.29,2005-04-02,3.25,up
2,2005-05-03,3.30,2005-04-03,3.25,up
3,2005-05-04,3.21,2005-04-04,3.24,down
4,2005-05-05,3.21,2005-04-05,3.24,down
...,...,...,...,...,...
4621,2017-12-25,1.51,2017-11-25,1.25,up
4622,2017-12-26,1.52,2017-11-26,1.25,up
4623,2017-12-27,1.51,2017-11-27,1.26,up
4624,2017-12-28,1.55,2017-11-28,1.26,up


In [30]:
# Write the labelled data to callrate.csv without the index column. The file
# is encoded as utf-8-sig and the header row uses the names listed here.
csv_header = ['date', 'callrate', '한달전 date', '한달전 callrate', 'labeling']
final_df.to_csv(
    'callrate.csv',
    index=False,
    header=csv_header,
    encoding='utf-8-sig',
)

In [147]:
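# NOTE(review): abandoned experiment -- every line of this cell is commented
# out, so the dates printed below are stale output from an earlier version.
# tmp = 0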

# for i in new_df['date']:
#     if i in df.index:
#         i = str(i)
#         year = i[0:4]
#         month = i[5:7]
#         if month[0] == '0':
#             month = month.replace('0', '')
#         day = i[8:10]
#         if day[0] == '0':
#             day = day.replace('0', '')

2005 4 1
2005 4 4
2005 4 6
2005 4 7
2005 4 8
2005 4 11
2005 4 12
2005 4 13
2005 4 14
2005 4 15
2005 4 18
2005 4 19
2005 4 20
2005 4 21
2005 4 22
2005 4 25
2005 4 26
2005 4 27
2005 4 28
2005 4 29
2005 5 2
2005 5 3
2005 5 4
2005 5 6
2005 5 9
2005 5 10
2005 5 11
2005 5 12
2005 5 13
2005 5 16
2005 5 17
2005 5 18
2005 5 19
2005 5 20
2005 5 23
2005 5 24
2005 5 25
2005 5 26
2005 5 27
2005 5 30
2005 5 31
2005 6 1
2005 6 2
2005 6 3
2005 6 7
2005 6 8
2005 6 9
2005 6 10
2005 6 13
2005 6 14
2005 6 15
2005 6 16
2005 6 17
2005 6 20
2005 6 21
2005 6 22
2005 6 23
2005 6 24
2005 6 27
2005 6 28
2005 6 29
2005 6 30
2005 7 1
2005 7 4
2005 7 5
2005 7 6
2005 7 7
2005 7 8
2005 7 11
2005 7 12
2005 7 13
2005 7 14
2005 7 15
2005 7 18
2005 7 19
2005 7 20
2005 7 21
2005 7 22
2005 7 25
2005 7 26
2005 7 27
2005 7 28
2005 7 29
2005 8 1
2005 8 2
2005 8 3
2005 8 4
2005 8 5
2005 8 8
2005 8 9
2005 8 10
2005 8 11
2005 8 12
2005 8 16
2005 8 17
2005 8 18
2005 8 19
2005 8 22
2005 8 23
2005 8 24
2005 8 25
2005 8 26
2005 8 29

2011 3 24
2011 3 25
2011 3 28
2011 3 29
2011 3 30
2011 3 31
2011 4 1
2011 4 4
2011 4 5
2011 4 6
2011 4 7
2011 4 8
2011 4 11
2011 4 12
2011 4 13
2011 4 14
2011 4 15
2011 4 18
2011 4 19
2011 4 20
2011 4 21
2011 4 22
2011 4 25
2011 4 26
2011 4 27
2011 4 28
2011 4 29
2011 5 2
2011 5 3
2011 5 4
2011 5 6
2011 5 9
2011 5 11
2011 5 12
2011 5 13
2011 5 16
2011 5 17
2011 5 18
2011 5 19
2011 5 20
2011 5 23
2011 5 24
2011 5 25
2011 5 26
2011 5 27
2011 5 30
2011 5 31
2011 6 1
2011 6 2
2011 6 3
2011 6 7
2011 6 8
2011 6 9
2011 6 10
2011 6 13
2011 6 14
2011 6 15
2011 6 16
2011 6 17
2011 6 20
2011 6 21
2011 6 22
2011 6 23
2011 6 24
2011 6 27
2011 6 28
2011 6 29
2011 6 30
2011 7 1
2011 7 4
2011 7 5
2011 7 6
2011 7 7
2011 7 8
2011 7 11
2011 7 12
2011 7 13
2011 7 14
2011 7 15
2011 7 18
2011 7 19
2011 7 20
2011 7 21
2011 7 22
2011 7 25
2011 7 26
2011 7 27
2011 7 28
2011 7 29
2011 8 1
2011 8 2
2011 8 3
2011 8 4
2011 8 5
2011 8 8
2011 8 9
2011 8 10
2011 8 11
2011 8 12
2011 8 16
2011 8 17
2011 8 18
2011 8 19


2017 3 17
2017 3 20
2017 3 21
2017 3 22
2017 3 23
2017 3 24
2017 3 27
2017 3 28
2017 3 29
2017 3 30
2017 3 31
2017 4 3
2017 4 4
2017 4 5
2017 4 6
2017 4 7
2017 4 10
2017 4 11
2017 4 12
2017 4 13
2017 4 14
2017 4 17
2017 4 18
2017 4 19
2017 4 20
2017 4 21
2017 4 24
2017 4 25
2017 4 26
2017 4 27
2017 4 28
2017 5 2
2017 5 4
2017 5 8
2017 5 10
2017 5 11
2017 5 12
2017 5 15
2017 5 16
2017 5 17
2017 5 18
2017 5 19
2017 5 22
2017 5 23
2017 5 24
2017 5 25
2017 5 26
2017 5 29
2017 5 30
2017 5 31
2017 6 1
2017 6 2
2017 6 5
2017 6 7
2017 6 8
2017 6 9
2017 6 12
2017 6 13
2017 6 14
2017 6 15
2017 6 16
2017 6 19
2017 6 20
2017 6 21
2017 6 22
2017 6 23
2017 6 26
2017 6 27
2017 6 28
2017 6 29
2017 6 30
2017 7 3
2017 7 4
2017 7 5
2017 7 6
2017 7 7
2017 7 10
2017 7 11
2017 7 12
2017 7 13
2017 7 14
2017 7 17
2017 7 18
2017 7 19
2017 7 20
2017 7 21
2017 7 24
2017 7 25
2017 7 26
2017 7 27
2017 7 28
2017 7 31
2017 8 1
2017 8 2
2017 8 3
2017 8 4
2017 8 7
2017 8 8
2017 8 9
2017 8 10
2017 8 11
2017 8 14
2017 8