In [1]:
from typing import Any, Dict, List, Optional, Union

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup as bs
from lxml.html import fromstring

import pandas as pd
import numpy as np

import re

import itertools
from copy import deepcopy

## custom libs

from korquanttools.pricevolume.config import PathConfig, ScraperConfig
from korquanttools.pricevolume.processor import Preprocessor, Lv2Converter
from korquanttools.pricevolume.utils import DateUtil

In [2]:
import investpy

In [3]:
years = list(range(2014, 2022))

yearly_from_to_list = [(f'01/01/{y}', f'01/12/{y}') for y in years]
yearly_from_to_list.append(('01/01/2022', '30/05/2022'))

## Economic calendar (macro)

In [4]:
earnings_calendar_df = []

for from_date, to_date in yearly_from_to_list:
    df = investpy.news.economic_calendar(from_date=from_date, to_date=to_date, countries=['south korea'])
    earnings_calendar_df.append(df)

earnings_calendar_df = pd.concat(earnings_calendar_df, ignore_index=True, )

In [5]:
earnings_calendar_df

Unnamed: 0,id,date,time,zone,currency,importance,event,actual,forecast,previous
0,9,01/01/2014,All Day,south korea,,,South Korea - New Year's Day,,,
1,33077,01/01/2014,09:00,south korea,KRW,low,Trade Balance,3.70B,3.76B,4.80B
2,264342,01/01/2014,18:00,south korea,KRW,low,Exports (YoY) (Dec),6.90%,,0.20%
3,233832,01/01/2014,18:00,south korea,KRW,low,Export Price Index (YoY) (Dec),-2.10%,,-2.50%
4,259367,01/01/2014,18:00,south korea,KRW,low,Imports (YoY) (Dec),3.00%,,-0.60%
...,...,...,...,...,...,...,...,...,...,...
2601,9,31/01/2022,All Day,south korea,,,South Korea - New Year's Day,,,
2602,9,01/02/2022,All Day,south korea,,,South Korea - New Year's Day,,,
2603,9,02/02/2022,All Day,south korea,,,South Korea - New Year's Day,,,
2604,173,01/03/2022,All Day,south korea,,,South Korea - Independence Day,,,


## Earnings Calendar (company)

In [6]:
request_url = 'https://www.investing.com/earnings-calendar/Service/getCalendarFilteredData'

request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Accept": "text/html",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

In [7]:
POST_data = {
    "country[]": "11", # south korea
    "dateFrom": "2022-01-01",
    "dateTo": "2022-05-30",
    "currentTab": "custom",
    "limit_from": "0",
    "submitFilters": "1",
    # "last_time_scope": "1437523200",
    # "byHandler": "true",
}

In [8]:
class DataFetcher:
    def __init__(self) -> None:
        ## Init config
        self.scraper_config = ScraperConfig()

        ## Init session
        self.session = requests.session()

        assert_status_hook = lambda response, *args, **kwargs: response.raise_for_status()
        self.session.hooks["response"] = [assert_status_hook]

        retry_strategy = Retry(**self.scraper_config.retry_strategy)
        adapter = HTTPAdapter(max_retries=retry_strategy)

        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        
        ## POST data
        self.request_url = request_url
        self.request_headers = request_headers
        self.POST_data = POST_data

    def get_response(self, ):
        res = self.session.post(self.request_url, data=self.POST_data, headers=self.request_headers)

        return res
        
    def parse_response(self,):
        pass

In [9]:
fetcher = DataFetcher()

In [10]:
years = list(range(2014, 2022))

yearly_from_to_list = [(f'{y}-01-01', f'{y}-12-31') for y in years]
yearly_from_to_list.append(('2022-01-01', '2022-05-30'))

In [11]:
for from_date, to_date in yearly_from_to_list:
    POST_data['dateFrom'] = from_date
    POST_data['dateTo'] = to_date
    
    r = fetcher.get_response()
    

In [12]:
root_bs = bs(r.json()['data'], 'html.parser')
data_list = root_bs.find_all('td', {'class': 'earnCalCompany'})
data_bs = data_list[0]

In [15]:
data_bs

<td _p_pid="1186791" _r_pid="1186791" class="left noWrap earnCalCompany" title="K Auction Inc">
<span class="earnCalCompanyName middle">K Auction</span> (<a class="bold middle" href="/equities/k-auction-earnings" target="_blank">102370</a>)
                    </td>

In [20]:
root_text = root_bs.get_text()

weekdays = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

weekdays_re = re.compile('(' + '|'.join(weekdays) + ')')


1

In [29]:
dd = root_text.split('\n')
dd.pop(0)

steps = 19
dd[0:steps]

['Tuesday, January 4, 2022',
 '',
 '',
 '',
 '',
 '',
 '',
 'K Auction\xa0(102370)',
 '                    ',
 '-128.24',
 '/\xa0\xa0--',
 '10.51B',
 '/\xa0\xa0--',
 '209.69B',
 '',
 '',
 '',
 '',
 '']

In [30]:
dd[steps:steps*2]

['Wednesday, January 5, 2022',
 '',
 '',
 '',
 '',
 '',
 '',
 'EGtronics\xa0(377330)',
 '                    ',
 '-100.07',
 '/\xa0\xa0--',
 '5.44B',
 '/\xa0\xa0--',
 '125.46B',
 '',
 '',
 '',
 '',
 '']

In [33]:
dd[steps*2:steps*3]

['Thursday, January 6, 2022',
 '',
 '',
 '',
 '',
 '',
 '',
 'Narae NanoTech\xa0(137080)',
 '                    ',
 '435.54',
 '/\xa0\xa0--',
 '37.40B',
 '/\xa0\xa0--',
 '123.87B',
 '',
 '',
 '',
 '',
 '']

In [22]:
print(root_text)

 
Tuesday, January 4, 2022






K Auction (102370)
                    
-128.24
/  --
10.51B
/  --
209.69B





Wednesday, January 5, 2022






EGtronics (377330)
                    
-100.07
/  --
5.44B
/  --
125.46B





Thursday, January 6, 2022






Narae NanoTech (137080)
                    
435.54
/  --
37.40B
/  --
123.87B





Monday, January 10, 2022






Skonec Entertainment (276040)
                    
-304.94
/  --
2.71B
/  --
178.49B





Friday, January 14, 2022






Assems (136410)
                    
9.67
/  --
9.95B
/  --
161.43B









Pungkang (093380)
                    
-0.31
/  --
18.43B
/  --
42.38B





Thursday, January 20, 2022






KT&G Corp (033780)
                    
1121.48
/  1325.48
1,247.5B
/  1,226.25B
10.21T









BCNC (146320)
                    
-70.53
/  --
13.75B
/  --
242.96B





Friday, January 21, 2022






Daedong Ind (000490)
                    
-506.69
/  --
286.0B
/  --
314.1B









Hanyang Securities Co (001755)
   

In [16]:
data_bs.attrs['_p_pid']
data_bs.attrs['_r_pid']
data_bs.attrs['title']

'K Auction Inc'

In [None]:
dates = root_bs.find_all('td', {'class': 'theDay'})
a = dates[0]
a

<td class="theDay" colspan="9">Tuesday, January 4, 2022</td>

In [156]:
a.

7716

In [146]:
root_bs

 <tr tablesorterdivider="">
<td class="theDay" colspan="9">Tuesday, January 4, 2022</td>
</tr>
<tr>
<td class="flag">
<span class="ceFlags South_Korea middle" title="South Korea"></span>
</td>
<td _p_pid="1186791" _r_pid="1186791" class="left noWrap earnCalCompany" title="K Auction Inc">
<span class="earnCalCompanyName middle">K Auction</span> (<a class="bold middle" href="/equities/k-auction-earnings" target="_blank">102370</a>)
                    </td>
<td class="bold pid-1186791-2022-01-04-062020-eps_actual">-162</td>
<td class="leftStrong">/  --</td>
<td class="bold pid-1186791-2022-01-04-062020-rev_actual">8.50B</td>
<td class="leftStrong">/  --</td>
<td class="right">217.39B</td>
<td class="right time" data-value="2"><span class="genToolTip oneliner reverseToolTip"></span></td>
<!-- EARNING -->
<td data-pair-id="1186791"></td>
</tr>
<tr tablesorterdivider="">
<td class="theDay" colspan="9">Wednesday, January 5, 2022</td>
</tr>
<tr>
<td class="flag">
<span class="ceFlags South_Ko