In [1]:
import ast

import pandas as pd
from sqlalchemy import create_engine
from tqdm.auto import tqdm

# 주식체결 데이터 저장

In [11]:
import pandas as pd
from sqlalchemy import create_engine

class TodayMinuteChartDataProvider:
  """Holds the SQL statements for the ``today_in_minute`` OHLCV table.

  Only the statements and the engine reference live here; no method in this
  class executes them yet.
  """
  # DDL for the minute table: one row per (st_code, dt) with OHLCV integers.
  table_query = '''
  CREATE TABLE IF NOT EXISTS today_in_minute (
    st_code TEXT not NULL,
    dt TEXT not NULL,
    open INTEGER,
    high INTEGER,
    low INTEGER,
    close INTEGER,
    volume INTEGER
  )
  '''
  # Lookup index on (st_code, dt).
  index_query = '''
  CREATE INDEX IF NOT EXISTS idx_today_in_minute ON today_in_minute (st_code, dt)
  '''

  # Positional (qmark) insert of one full row.
  insert_query = '''
  INSERT INTO today_in_minute (st_code, dt, open, high, low, close, volume)
  VALUES (?, ?, ?, ?, ?, ?, ?)
  '''

  def __init__(self, engine):
    """Remember the database engine the statements should be run against.

    Args:
      engine: database engine object; stored as ``self.engine`` unchanged.
    """
    # Previously the argument was dropped, leaving the instance with no way
    # to reach the database.
    self.engine = engine


class RealTimeTickDataPrivder:
  """Stores real-time trade ticks in the SQLite table ``today_in_ticks``.

  A tick is a dict with a 'code' entry plus string-keyed numeric fields:
  '20' trade time (HHMMSS), '16' open, '17' high, '18' low, '10' current
  price and '15' volume. The numeric fields arrive as signed strings
  (e.g. '+31345', '-31215'); only their magnitude is stored.
  """
  # DDL for the tick table.
  table_query = '''
  CREATE TABLE IF NOT EXISTS today_in_ticks (
    st_code TEXT not NULL,
    dt TEXT not NULL,
    open INTEGER,
    high INTEGER,
    low INTEGER,
    close INTEGER,
    volume INTEGER
  )
  '''
  # Optional lookup index on (st_code, dt).
  index_query = '''
  CREATE INDEX IF NOT EXISTS idx_today_in_ticks ON today_in_ticks (st_code, dt)
  '''

  # Positional (qmark) insert of one tick row.
  insert_query = '''
  INSERT INTO today_in_ticks (st_code, dt, open, high, low, close, volume)
  VALUES (?, ?, ?, ?, ?, ?, ?)
  '''

  def __init__(self, db_path, in_memory_db = False, with_index=False):
    """Open the SQLite engine and make sure the tick table exists.

    Args:
      db_path: path of the SQLite file; ignored when ``in_memory_db`` is true.
      in_memory_db: use an in-memory SQLite database instead of a file.
      with_index: create the (st_code, dt) index together with the table.
    """
    url = "sqlite://" if in_memory_db else f"sqlite:///{db_path}"
    self.engine = create_engine(url)
    self.with_index = with_index
    self.create_table()

  def clear_table(self):
    """Drop the tick table (SQLite removes the table's indexes with it)."""
    with self.engine.begin() as connection:
      connection.exec_driver_sql('DROP TABLE IF EXISTS today_in_ticks')

  def create_table(self):
    """Create the tick table and, when ``with_index`` is set, its index."""
    # engine.begin() commits on exit; exec_driver_sql hands the raw SQL string
    # to the DBAPI (plain-string Connection.execute is gone in SQLAlchemy 2.0).
    with self.engine.begin() as connection:
      connection.exec_driver_sql(self.table_query)
      # The index used to be created unconditionally, ignoring with_index.
      if self.with_index:
        connection.exec_driver_sql(self.index_query)

  def __build_data(self, real_data):
    """Convert one tick dict into the row tuple expected by ``insert_query``."""
    return (
      real_data['code'],
      real_data['20'], # trade time (HHMMSS)
      abs(int(real_data['16'])), # open, signed in the feed
      abs(int(real_data['17'])), # high, signed in the feed
      abs(int(real_data['18'])), # low, signed in the feed
      abs(int(real_data['10'])), # current price, signed in the feed
      abs(int(real_data['15'])), # volume, signed in the feed
    )

  def build_dataframe(self, real_data):
    """Return a one-row DataFrame for the tick, with the table's column names."""
    return pd.DataFrame(
      [self.__build_data(real_data)],
      columns=['st_code', 'dt', 'open', 'high', 'low', 'close', 'volume']
    )

  # Keep the original private (name-mangled) spelling available.
  __build_dataframe = build_dataframe

  def insert(self, real_data):
    """Insert one tick with a parameterized INSERT inside its own transaction."""
    with self.engine.begin() as connection:
      connection.exec_driver_sql(self.insert_query, self.__build_data(real_data))

  def insert_by_dataframe(self, real_data):
    """Insert one tick by appending a one-row DataFrame via ``DataFrame.to_sql``."""
    # Previously called the non-existent self.build_dataframe -> AttributeError.
    self.build_dataframe(real_data).to_sql('today_in_ticks', self.engine, if_exists='append', index=False)

In [70]:
# Sample tick used by the insert benchmarks below. Values are strings and
# may carry a leading sign or space.
real_data = {
  'code': '069500',
  'type': '주식체결',
  '20': '100645',  # trade time (HHMMSS)
  '16': '+31345',  # open
  '17': '+31370',  # high
  '18': '-31215',  # low
  '10': ' 31275',  # current price
  '15': '+50',     # volume
  '11': ' 0',
  '12': '0.00',
  '13': '1613182',
}

In [9]:
# In-memory tick store for the benchmarks; the file name is not used when
# in_memory_db is true.
provider = RealTimeTickDataPrivder(
  "kiwoom_db.sqlite3",
  in_memory_db=True,
  with_index=True,
)

connection.execute()로 수행시
- 인덱스 있을 때
  - 10,000건 추가에 70초
- 인덱스 없을 때
  - 10,000건 추가에 65초
- 인메모리
  - 0.7초

dataframe.to_sql()로 수행시
- 인덱스 있을 때
  - 10,000건 추가에 140초
- 인덱스 없을 때
  - 10,000건 추가에 141초
- 인메모리
  - 33초

dataframe.to_sql()로 벌크 수행시 (10,000건)
- 0.7초

결론
- 일정 주기 모아서 인서트 하는게 낫다
- 인메모리 디비로 선택하고, 표준 SQL로 작업하는게 낫다

In [89]:
# Benchmark: 10,000 single-row inserts through the parameterized INSERT path.
# Each iteration overwrites the trade-time field so rows differ.
for i in range(10000):
  real_data['20'] = f"{i:06d}"
  provider.insert(real_data)  # was insert1, which the provider does not define

In [82]:
# Benchmark: 10,000 single-row inserts through DataFrame.to_sql().
# Each iteration overwrites the trade-time field so rows differ.
for i in range(10000):
  real_data['20'] = f"{i:06d}"
  provider.insert_by_dataframe(real_data)  # was insert2, which the provider does not define

In [85]:
# Build one single-row DataFrame per synthetic tick (trade time 000000..009999).
# The frames are concatenated and written in a single to_sql call in the next cell.
ll = []
for i in range(10000):
  real_data['20'] = f"{i:06d}"
  ll.append(provider.build_dataframe(real_data))

In [86]:
# Bulk path: concatenate all single-row frames and append them in one to_sql call.
pd.concat(ll, ignore_index=True).to_sql('today_in_ticks', provider.engine, if_exists='append', index=False)

10000

In [87]:
# Drop the tick table so the next benchmark starts from an empty database.
provider.clear_table()

In [88]:
# Recreate the (empty) tick table after it was dropped.
provider.create_table()

In [91]:
# Read every stored tick back as a list of row tuples.
# exec_driver_sql passes the raw SQL string straight to the DBAPI; passing a
# plain string to Connection.execute() no longer works in SQLAlchemy 2.0.
with provider.engine.connect() as connection:
  rr = connection.exec_driver_sql('select * from today_in_ticks').fetchall()

# 체결데이터로부터 분봉 데이터 생성

In [1]:
import re

In [2]:
# Text file that is scanned line by line for tick dict literals.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run elsewhere until this points at a local copy of the sample file.
sample_file_path = r'c:\Sungshin\Lectures\2022\repos\AIFT2022\data\주식체결_sample.txt'

In [3]:
# Group 1 captures a dict literal that begins with {'code and runs to the
# last '}' on the line; anything before it on the line is skipped.
p = re.compile(r".*([{]'code.*[}])")

In [12]:
# Fresh in-memory tick store for the minute-bar experiment; the file name is
# not used when in_memory_db is true.
provider = RealTimeTickDataPrivder(
  "kiwoom_db.sqlite3",
  in_memory_db=True,
  with_index=True,
)

In [13]:
# Load the sample file: every line that contains a {'code ...} dict literal is
# parsed and inserted as one tick.
with open(sample_file_path, 'r', encoding='utf8') as f:
  for line in f:
    m = p.match(line)
    if m:
      # literal_eval only accepts Python literals, so text in the file cannot
      # execute arbitrary code the way eval() would.
      provider.insert(ast.literal_eval(m.group(1)))

In [16]:
# List the tables present in the database.
# Engine.execute() was removed in SQLAlchemy 2.0, so run the statement on an
# explicit connection; 'table' is single-quoted because double quotes denote
# identifiers in SQL and only work as strings through a SQLite legacy quirk.
with provider.engine.connect() as connection:
  table_names = connection.exec_driver_sql(
    "select name from sqlite_master where type='table'"
  ).fetchall()
table_names

[('today_in_ticks',)]

In [67]:
# Aggregate ticks into one-minute bars per stock code:
# (st_code, HHMM00, open, high, low, close), all taken from the tick close price.
#  - minute = first four characters of dt (HHMM); SQLite substr() is 1-based.
#  - rowid (insertion order) breaks ties between ticks that share the same
#    second, so first_value/last_value are deterministic.
#  - the outer ORDER BY fixes the row order of the result.
with provider.engine.connect() as connection:
  rr = connection.exec_driver_sql(
  '''
  select DISTINCT t.st_code, t.minute||'00' as bar_dt,
  first_value(t.close) over (partition by t.st_code, t.minute order by t.dt, t.tick_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as open,
  max(t.close) over (partition by t.st_code, t.minute order by t.dt, t.tick_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as high,
  min(t.close) over (partition by t.st_code, t.minute order by t.dt, t.tick_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as low,
  last_value(t.close) over (partition by t.st_code, t.minute order by t.dt, t.tick_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as close
  from (
    select *, rowid as tick_id, substr(dt, 1, 4) as minute from today_in_ticks
    ) as t
  order by 1, 2
  '''
  ).fetchall()

In [68]:
# Show the minute bars: (st_code, HHMM00, open, high, low, close) per row.
rr

[('069500', '104800', 31315, 31335, 31315, 31330),
 ('069500', '104900', 31335, 31340, 31330, 31335),
 ('069500', '105000', 31330, 31340, 31330, 31340),
 ('069500', '105100', 31340, 31340, 31330, 31330),
 ('069500', '105200', 31325, 31350, 31325, 31345),
 ('069500', '105300', 31345, 31350, 31335, 31345),
 ('069500', '105400', 31350, 31350, 31345, 31350),
 ('069500', '105500', 31350, 31350, 31335, 31340),
 ('069500', '105600', 31340, 31345, 31330, 31335),
 ('069500', '105700', 31335, 31350, 31335, 31350),
 ('069500', '105800', 31345, 31370, 31345, 31360),
 ('069500', '105900', 31365, 31390, 31365, 31375),
 ('069500', '110000', 31375, 31375, 31360, 31370),
 ('069500', '110100', 31360, 31370, 31360, 31365),
 ('069500', '110200', 31360, 31370, 31360, 31360),
 ('069500', '110300', 31365, 31365, 31345, 31345),
 ('069500', '110400', 31340, 31355, 31340, 31350),
 ('069500', '110500', 31350, 31355, 31345, 31355),
 ('069500', '110600', 31350, 31350, 31345, 31345),
 ('069500', '110700', 31345, 31

In [None]:
# Fetch every stored tick for one stock code.
# Engine.execute() was removed in SQLAlchemy 2.0, so use an explicit
# connection; the code is bound as a parameter instead of being written as a
# double-quoted literal (double quotes denote identifiers in SQL).
with provider.engine.connect() as connection:
  ticks_for_code = connection.exec_driver_sql(
    'select * from today_in_ticks where st_code = ?', ('069500',)
  ).fetchall()
ticks_for_code