In [1]:
from sqlalchemy import create_engine
from tqdm import tqdm
import pandas as pd
import numpy as np
import pymysql
import pickle
import csv
import os
import time

In [16]:
def sqlalchemy_connect_ip(ip_address, db_name):
    """Open a SQLAlchemy connection to a MySQL database via the PyMySQL driver.

    The account defaults to the credentials this notebook has always used, but
    both can be overridden without touching the code by setting the
    ``MYSQL_USER`` and ``MYSQL_PASSWORD`` environment variables.

    Args:
        ip_address: Host name or IP address of the MySQL server (port 3306).
        db_name: Name of the database (schema) to connect to.

    Returns:
        The object returned by ``engine.connect()``. The caller is responsible
        for closing it.
    """
    # Local import: escape the credentials so characters such as '@' or '/'
    # in a password cannot corrupt the connection URL.
    from urllib.parse import quote_plus

    # TODO: remove the hard-coded fallbacks once the environment variables
    # are configured everywhere this notebook runs.
    user = quote_plus(os.environ.get('MYSQL_USER', 'admin'))
    password = quote_plus(os.environ.get('MYSQL_PASSWORD', 'big15'))

    url = "mysql+pymysql://{0}:{1}@{2}:3306/{3}?charset=utf8".format(
        user, password, ip_address, db_name)
    engine = create_engine(url, encoding='utf8')

    return engine.connect()

def get_pymysql_connection(ip_address, db_name):
    """Open a raw PyMySQL connection to a MySQL database.

    The account defaults to the credentials this notebook has always used, but
    both can be overridden by setting the ``MYSQL_USER`` and
    ``MYSQL_PASSWORD`` environment variables.

    Args:
        ip_address: Host name or IP address of the MySQL server.
        db_name: Name of the database (schema) to select on connect.

    Returns:
        The connection object returned by ``pymysql.connect``. The caller is
        responsible for closing it.
    """
    # TODO: remove the hard-coded fallbacks once the environment variables
    # are configured everywhere this notebook runs.
    user = os.environ.get('MYSQL_USER', 'admin')
    password = os.environ.get('MYSQL_PASSWORD', 'big15')

    conn = pymysql.connect(host=ip_address, user=user, password=password
                        , db=db_name, charset='utf8')

    return conn

def get_pymysql_stock_list(conn, db_name):
    """Return the names of the tables (one per instrument) in a database.

    Args:
        conn: An open PyMySQL-style connection. It is used as a context
            manager, exactly as before.
            NOTE(review): with PyMySQL 1.x leaving ``with conn`` closes the
            connection, so callers should not reuse it afterwards - confirm
            against the installed version.
        db_name: Schema whose tables are listed.

    Returns:
        A list with the first column (``TABLE_NAME``) of every returned row.
    """
    # The schema name is bound as a query parameter instead of being formatted
    # into the SQL text, so quoting is handled by the driver.
    sql = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = %s"

    with conn:
        # The cursor context manager closes the cursor; no explicit close needed.
        with conn.cursor() as cur:
            cur.execute(sql, (db_name,))
            return [item[0] for item in cur.fetchall()]
               
def get_empty_day_df(sqlalchemy_conn):
    """Build the date-only frame that later merges use as their left side.

    Reads ``investing_data.aedkrw내역`` for the fixed date window, blanks the
    value columns, drops the six ``AEDKRW내역_*`` columns and keeps ``날짜``.
    The result is also written to ``./pickle/empty_day_df.pkl``.

    Args:
        sqlalchemy_conn: Connection-like object whose ``execute(sql)`` returns
            a result exposing ``fetchall()`` and ``keys()``.
            NOTE(review): passing a plain SQL string to ``execute`` looks like
            SQLAlchemy 1.x usage - confirm the pinned version.

    Returns:
        The date-only ``DataFrame``.
    """
    sql = "SELECT * FROM investing_data.aedkrw내역 where 날짜 > 20220501 and 날짜 < 20221121"

    result = sqlalchemy_conn.execute(sql)
    # Take the column names from the result itself instead of relying on
    # pandas inferring them from the row objects.
    empty_day_df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))

    empty_day_df = empty_day_df.set_index('날짜')
    # np.nan instead of np.NaN: the capitalised alias no longer exists in NumPy 2.
    empty_day_df = empty_day_df.notnull().replace(True, np.nan)

    empty_day_df = empty_day_df.reset_index()
    empty_day_df = empty_day_df.drop(columns=['AEDKRW내역_종가','AEDKRW내역_오픈','AEDKRW내역_고가'
                                      ,'AEDKRW내역_저가','AEDKRW내역_거래량','AEDKRW내역_변동'])

    # Make sure the target directory exists so the dump cannot fail on a fresh checkout.
    os.makedirs('./pickle', exist_ok=True)
    empty_day_df.to_pickle('./pickle/empty_day_df.pkl')

    return empty_day_df


def get_sqlalchemy_investing_df(empty_day_df, investing_table_list):
    """Left-join every investing table onto the date frame and fill the gaps.

    Args:
        empty_day_df: Frame holding the ``날짜`` key column. It is copied, so
            the caller's object is not modified.
        investing_table_list: Names of the tables in ``investing_data`` to
            read for the fixed date window.

    Returns:
        A ``DataFrame`` with one row per date. Columns whose name ends in
        ``_거래량`` or ``_변동`` have missing values replaced by 0; every
        other column is back-filled.
    """
    # Copy first: wrapping a frame in pd.DataFrame() may share its data, and
    # the dtype normalisation below would then leak into the caller's frame.
    investing_df = pd.DataFrame(empty_day_df).copy()
    investing_df['날짜'] = investing_df['날짜'].astype(str).astype(int)

    tables = list(investing_table_list)
    if tables:
        # One connection serves all tables and is always released; the
        # previous version opened a new one per table and never closed any.
        conn = sqlalchemy_connect_ip('192.168.50.123', 'investing_data')
        try:
            for table in tables:
                sql = "SELECT * FROM investing_data.`{0}` where 날짜 > 20220501 and 날짜 < 20221121".format(table)
                result = conn.execute(sql)
                # Explicit column names keep the frame usable even when the
                # table has no rows inside the date window.
                table_df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))
                table_df['날짜'] = table_df['날짜'].astype(str).astype(int)
                investing_df = pd.merge(investing_df, table_df, on='날짜', how='left')
        finally:
            conn.close()

    for c in list(investing_df.columns):
        if c.rsplit('_', 1)[-1] in ('거래량', '변동'):
            investing_df[c] = investing_df[c].fillna(0)
        else:
            # .bfill() is the non-deprecated spelling of fillna(method='bfill').
            investing_df[c] = investing_df[c].bfill()

    return investing_df

def get_sqlalchemy_stock_investing_merge_df(conn, stock_table_list, investing_df):
    """Join each stock table with the investing frame and stack the results.

    Args:
        conn: Connection-like object whose ``execute(sql)`` returns a result
            exposing ``fetchall()`` and ``keys()``. It is closed before this
            function returns, including when an error occurs.
        stock_table_list: Names of the tables in ``stock_info`` to read for
            the fixed date window.
        investing_df: Frame with a ``날짜`` key column. It is not modified.

    Returns:
        The row-wise concatenation of every per-table merge (each table keeps
        its own index), or an empty ``DataFrame`` when no table was given.
    """
    # Normalise the key to int on a new frame so the caller's object is untouched.
    investing_df = investing_df.assign(
        **{'날짜': investing_df['날짜'].astype(str).astype(int)})

    merged_frames = []
    try:
        for table in tqdm(stock_table_list):
            sql = "SELECT * FROM stock_info.`{0}` where 날짜 > 20220501 and 날짜 < 20221121".format(table)
            table_data = conn.execute(sql)
            # Turn the table rows into a frame with explicit column names.
            stock_df = pd.DataFrame(table_data.fetchall(), columns=list(table_data.keys()))

            # Same int key type on both sides of the merge.
            stock_df['날짜'] = stock_df['날짜'].astype(str).astype(int)
            merged_frames.append(pd.merge(stock_df, investing_df, on='날짜'))
    finally:
        # Release the connection even if a query or merge fails.
        conn.close()

    if not merged_frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop re-copies all earlier rows.
    return pd.concat(merged_frames, axis=0)


In [10]:
# Build `investing_df`: list the tables of `investing_data`, derive the
# date-only frame, then left-join every investing table onto it.
pymysql_conn = get_pymysql_connection('192.168.50.123', 'investing_data')
sqlalchemy_conn = sqlalchemy_connect_ip('192.168.50.123', 'investing_data')
try:
    investing_table_list = get_pymysql_stock_list(pymysql_conn, 'investing_data')
    empty_day_df = get_empty_day_df(sqlalchemy_conn)
finally:
    # Nothing below uses this connection (the next step opens its own), so
    # close it here instead of leaving it open.
    sqlalchemy_conn.close()
investing_df = get_sqlalchemy_investing_df(empty_day_df, investing_table_list)

In [26]:
# investing_df.to_csv('../cor/investing_df_update.csv', encoding='utf-8-sig')

In [17]:
# Merge the minute-level rows of the ten listed ticker codes with `investing_df`.
pymysql_conn = get_pymysql_connection('192.168.50.123', 'investing_data')
# This cell never uses the PyMySQL connection; close it right away so it does
# not stay open. Re-open one before re-enabling the commented-out table
# listing below.
pymysql_conn.close()
sqlalchemy_conn = sqlalchemy_connect_ip('192.168.50.123', 'investing_data')
stock_table_list = ['005930','373220','207940','000660','051910','247540','091990','066970','293490','028300']
# The merge helper closes `sqlalchemy_conn` itself once it has finished.
complete_df = get_sqlalchemy_stock_investing_merge_df(sqlalchemy_conn, stock_table_list, investing_df)
# for code in stock_table_list:
#     pymysql_conn = get_pymysql_connection('192.168.50.123', 'investing_data')
#     sqlalchemy_conn = sqlalchemy_connect_ip('192.168.50.123', 'investing_data')
#     complete_df = get_sqlalchemy_stock_investing_merge_df(sqlalchemy_conn, code, investing_df)
#     complete_df.to_pickle(f'../colume_pickle/6개월_choice_10개_대형주_update_{code}.pkl')
    
# ['003070','006220','014580','118990','027410','000060','008560','003520','067290','064350']
# get_pymysql_stock_list(pymysql_conn, 'stock_info')

100%|██████████| 10/10 [00:30<00:00,  3.07s/it]


In [20]:
# Display the merged frame; the recorded output elides the middle rows and columns with "...".
complete_df

Unnamed: 0,날짜,시간,시가,고가,저가,종가,거래량,거래대금,누적체결매수수량,누적체결매도수량,...,헝가리BUX내역_고가,헝가리BUX내역_저가,헝가리BUX내역_거래량,헝가리BUX내역_변동,호주SPASX내역_종가,호주SPASX내역_오픈,호주SPASX내역_고가,호주SPASX내역_저가,호주SPASX내역_거래량,호주SPASX내역_변동
0,20220502,901,66600.0,66800.0,66500.0,66700.0,824383.0,5.490616e+10,709457.0,114926.0,...,43468.19,42571.10,0,-1.76,7347.0,7435.0,7435.0,7301.6,565050000.0,-1.18
1,20220502,902,66700.0,66900.0,66700.0,66800.0,203314.0,1.356986e+10,789226.0,238471.0,...,43468.19,42571.10,0,-1.76,7347.0,7435.0,7435.0,7301.6,565050000.0,-1.18
2,20220502,903,66800.0,66900.0,66700.0,66800.0,190201.0,1.270068e+10,832905.0,384993.0,...,43468.19,42571.10,0,-1.76,7347.0,7435.0,7435.0,7301.6,565050000.0,-1.18
3,20220502,904,66700.0,66800.0,66600.0,66700.0,126366.0,8.426040e+09,856269.0,487995.0,...,43468.19,42571.10,0,-1.76,7347.0,7435.0,7435.0,7301.6,565050000.0,-1.18
4,20220502,905,66700.0,66700.0,66600.0,66700.0,55899.0,3.725630e+09,883780.0,516383.0,...,43468.19,42571.10,0,-1.76,7347.0,7435.0,7435.0,7301.6,565050000.0,-1.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52192,20221118,1517,36200.0,36200.0,36150.0,36150.0,5278.0,1.908040e+08,176906.0,214477.0,...,44512.99,43739.37,0,1.04,7151.8,7135.7,7166.2,7135.7,762720000.0,0.23
52193,20221118,1518,36100.0,36150.0,36050.0,36100.0,7696.0,2.777010e+08,177619.0,221460.0,...,44512.99,43739.37,0,1.04,7151.8,7135.7,7166.2,7135.7,762720000.0,0.23
52194,20221118,1519,36050.0,36150.0,36050.0,36100.0,5145.0,1.857180e+08,180241.0,223983.0,...,44512.99,43739.37,0,1.04,7151.8,7135.7,7166.2,7135.7,762720000.0,0.23
52195,20221118,1520,36100.0,36200.0,36100.0,36200.0,2021.0,7.308800e+07,181656.0,224589.0,...,44512.99,43739.37,0,1.04,7151.8,7135.7,7166.2,7135.7,762720000.0,0.23


In [10]:
# complete_df.to_pickle('../colume_pickle/test_complete_df_6개월_choice_10개_대형주_update.pkl')

In [2]:
# Reload a previously pickled frame into `stock_df`; the correlation step reads it.
# pickle.load can execute code embedded in the file, so only load pickles
# produced by this project.
# NOTE(review): the commented-out save cell above writes a file ending in
# '_update.pkl', while this cell reads the name without '_update' - confirm
# this is the intended artifact.
with open('../colume_pickle/test_complete_df_6개월_choice_10개_대형주.pkl', 'rb') as f:
    stock_df = pickle.load(f)

In [4]:
# Pick the columns that correlate strongly with `pct_label`.
# numeric_only=True states explicitly that only numeric columns take part;
# the bare corr() call depended on a deprecated implicit default.
corr_matrix = stock_df.corr(numeric_only=True)
cor = corr_matrix["pct_label"]

# Keep a column when |correlation| > 0.08; the four price columns
# (high, open, close, low) are kept regardless of their correlation.
cor1 = cor[(cor.abs() > 0.08) | cor.index.isin(['고가', '시가', '종가', '저가'])]

# The label's correlation with itself carries no information.
cor1 = cor1.drop('pct_label')
len(cor1)

  corr_matrix = stock_df.corr()


In [None]:
# Save the selected correlations (`cor1`) to a pickle file; the "0.08" in the
# file name matches the threshold used to build `cor1`.
cor1.to_pickle('../colume_pickle/6개월_choice_10개_cor_0.08_lstm_대형주.pkl')