# 주식정보수집 for Human Deeplearning

# 0. 기본설정

In [49]:
import pandas as pd
import numpy as np
import openai
import fitz
import requests
import json
import datetime
import os
from datetime import datetime

### 0.1. S&P500 목록 정리

In [3]:
# Load the first table of the Wikipedia S&P 500 constituents page, keep the
# symbol and security-name columns, and rename them to Ticker / Company.
sp500 = (
    pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
    .loc[:, ['Symbol', 'Security']]
    .rename(columns={'Symbol': 'Ticker', 'Security': 'Company'})
)

# Display the resulting DataFrame
sp500

Unnamed: 0,Ticker,Company
0,MMM,3M
1,AOS,A. O. Smith
2,ABT,Abbott
3,ABBV,AbbVie
4,ACN,Accenture
...,...,...
498,YUM,Yum! Brands
499,ZBRA,Zebra Technologies
500,ZBH,Zimmer Biomet
501,ZION,Zions Bancorporation


In [3]:
def makeBatch(df, batch_size=50):
    """Split *df* into consecutive row-wise chunks of at most *batch_size* rows.

    Args:
        df: DataFrame to split (needs ``len`` and ``.iloc`` slicing).
        batch_size: Maximum number of rows per chunk; must be at least 1.

    Returns:
        List of DataFrame slices in the original row order. Only the last
        chunk may be shorter than *batch_size*; an empty *df* gives ``[]``.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')
    # Stepping by batch_size never yields an empty trailing chunk, even when
    # len(df) is an exact multiple of batch_size.
    return [df.iloc[start:start + batch_size] for start in range(0, len(df), batch_size)]

In [4]:
list_batches = makeBatch(sp500)

In [None]:
list_batches

### 0.2. 날짜정리(for Sentiment) 

In [4]:
timespan = pd.read_csv('./SP500-dataset-withRs/AAL-withRs.csv')

In [7]:
timespan

Unnamed: 0,timestamp,open,high,low,close,volume,return,calculated_price,R
0,2005-10-07,20.900,23.00,20.90,22.15,16350134,0.000000,22.15,"[0.0, 0.005417607223476395, -0.013920071845532..."
1,2005-10-14,22.280,22.40,21.40,22.27,9746113,0.005418,22.27,"[0.005417607223476395, -0.013920071845532012, ..."
2,2005-10-21,22.150,22.42,20.85,21.96,14218231,-0.013920,21.96,"[-0.013920071845532012, 0.08834244080145703, 0..."
3,2005-10-28,22.010,23.90,21.70,23.90,9263807,0.088342,23.90,"[0.08834244080145703, 0.20502092050209209, 0.1..."
4,2005-11-04,24.000,29.00,24.00,28.80,16788473,0.205021,28.80,"[0.20502092050209209, 0.15798611111111116, -0...."
...,...,...,...,...,...,...,...,...,...
912,2023-03-31,13.960,14.76,13.71,14.75,70608911,0.075073,14.75,"[0.07507288629737596, -0.050169491525423715, -..."
913,2023-04-06,14.520,14.62,13.70,14.01,63721777,-0.050169,14.01,"[-0.050169491525423715, -0.08708065667380449, ..."
914,2023-04-14,13.980,14.51,12.73,12.79,208236734,-0.087081,12.79,"[-0.08708065667380449, 0.04769351055512128, -0..."
915,2023-04-21,12.855,13.70,12.80,13.40,123132547,0.047694,13.40,"[0.04769351055512128, -0.04925373134328359]"


In [None]:
# Keep the rows dated 2018-01-01 or later and collect their dates as
# YYYYMMDD strings.
timespan['timestamp'] = pd.to_datetime(timespan['timestamp'])
# pd.Timestamp is used for the cutoff so this cell does not depend on whether
# the name `datetime` currently refers to the module or to the class.
# .copy() makes the filtered frame independent, so the assignment below does
# not write into a slice of the original frame.
timespan = timespan[timespan['timestamp'] >= pd.Timestamp(2018, 1, 1)].copy()
timespan['timestamp'] = timespan['timestamp'].dt.strftime('%Y%m%d')
timespan_list = timespan['timestamp'].values.tolist()

In [9]:
len(timespan_list)

278

### 0.3. download함수 정리

In [10]:
import webbrowser
import time 

## For endpoints that deliver CSV
def downloadCSV(dfTickers, function=None):
    """Open the Alpha Vantage CSV download URL of every ticker in a browser tab.

    Args:
        dfTickers: DataFrame with a 'Ticker' column.
        function: Value for the ``function`` query parameter. It has to be
            supplied; the default of None only keeps the one-argument call
            form importable and is rejected below.

    Raises:
        ValueError: If *function* is not given.

    The API key is read from the module-level ``api_key`` variable, the same
    one downloadJSON uses.
    """
    if function is None:
        raise ValueError("downloadCSV needs the Alpha Vantage 'function' name")

    list_tickers = dfTickers['Ticker'].tolist()
    print(f'corps: {list_tickers}')

    for t in list_tickers:
        u = f'https://www.alphavantage.co/query?function={function}&symbol={t}&datatype=csv&apikey={api_key}'
        # Exactly one request per ticker: the browser tab performs the
        # download, so no separate HTTP request is issued beforehand.
        webbrowser.open_new_tab(u)
        # Original note: even a 20 s pause exceeded the 5 calls/minute limit,
        # so 25 s is used to leave some margin.
        time.sleep(25)

## For endpoints that deliver JSON
def downloadJSON(dfTickers, function):
    """Download one Alpha Vantage JSON response per ticker and save it to disk.

    Each response is written to ``./SP500-dataset-all/<function>/<ticker>.json``
    (the folder is created when missing). The folder name uses the same
    upper-case spelling as the cells that read these files back, so the
    pipeline also works on case-sensitive file systems.

    Args:
        dfTickers: DataFrame with a 'Ticker' column.
        function: Value for the ``function`` query parameter; also the name
            of the output sub-folder.

    The API key comes from the module-level ``api_key``. For
    'NEWS_SENTIMENT' the module-level ``time_from``, ``time_to``, ``sort``
    and ``limit`` are interpolated into the URL as-is.
    NOTE(review): those four therefore need to be plain strings when this
    runs - confirm against the parameter cell.
    """
    list_tickers = dfTickers['Ticker'].tolist()
    print(f'corps: {list_tickers}')
    if not list_tickers:
        # Nothing to fetch; avoid creating an empty output folder.
        return

    out_dir = f'./SP500-dataset-all/{function}'
    os.makedirs(out_dir, exist_ok=True)

    for t in list_tickers:
        if function == 'NEWS_SENTIMENT':
            u = f'https://www.alphavantage.co/query?function={function}&time_from={time_from}&time_to={time_to}&sort={sort}&limit={limit}&tickers={t}&apikey={api_key}'
        else:
            u = f'https://www.alphavantage.co/query?function={function}&symbol={t}&apikey={api_key}'
        r = requests.get(u)
        r_json = r.json()
        with open(f'{out_dir}/{t}.json', "w") as f:
            json.dump(r_json, f)
        # Original note: even a 20 s pause exceeded the 5 calls/minute limit,
        # so 25 s is used to leave some margin.
        time.sleep(25)


### 0.4. API key 설정

In [11]:
api_key = 'API'

# 1. JSON to CSV

### 1.1. 부문별 파라미터 설정

In [12]:
# Query parameters for each Alpha Vantage endpoint used below.

# Income statement
function_income = 'INCOME_STATEMENT'

# Balance sheet
function_balance = 'BALANCE_SHEET'

# Cash flow
function_cashflow = 'CASH_FLOW'

# Earnings
function_earnings = 'EARNINGS'

# Some Metrics
function_metrics = 'OVERVIEW'

# News Sentiment
function_sentiment = 'NEWS_SENTIMENT'
# One string per entry of timespan_list, suffixed with 'T0000' / 'T1600'.
# NOTE(review): time_from stays a list here, while downloadJSON interpolates
# time_from directly into the request URL - confirm the intended usage.
time_from = [s + 'T0000' for s in timespan_list]
time_to = [s + 'T1600' for s in timespan_list]


# Bare string expression (not assigned to anything); example of the format.
'20220410T0130' # YYYYMMDDTHHMM format
# NOTE(review): this replaces the time_to list built above with an empty
# string - confirm which of the two values is intended.
time_to = ''
sort = 'EARLIEST'
limit = '50'

### 1.2. 다운로드

In [None]:
# Assigned to Junyoung: INCOME_STATEMENT downloads.
# Iterating list_batches itself covers every batch, including a final
# partial one, instead of relying on a fixed set of indices.
for batch in list_batches:
    downloadJSON(batch, function_income)

In [None]:
# Assigned to Haesol: BALANCE_SHEET downloads.
# Iterating list_batches itself covers every batch, including a final
# partial one, instead of relying on a fixed set of indices.
for batch in list_batches:
    downloadJSON(batch, function_balance)

In [None]:
# Assigned to Chaehwan: CASH_FLOW downloads.
# Iterating list_batches itself covers every batch, including a final
# partial one, instead of relying on a fixed set of indices.
for batch in list_batches:
    downloadJSON(batch, function_cashflow)

In [None]:
# Assigned to Seonghui: EARNINGS downloads.
# Iterating list_batches itself covers every batch, including a final
# partial one, instead of relying on a fixed set of indices.
for batch in list_batches:
    downloadJSON(batch, function_earnings)

In [105]:
# Missing portion: batch index 10.
# Only the BALANCE_SHEET call is active; the other three are commented out.
# downloadJSON(list_batches[10], function_cashflow)
# downloadJSON(list_batches[10], function_earnings)
# downloadJSON(list_batches[10], function_income)
downloadJSON(list_batches[10], function_balance)


corps: ['ZBH', 'ZION', 'ZTS']


In [141]:
# Manual re-download of a single file that came back with an error.
# Set error_function / error_t to the endpoint and ticker to fetch again.

# List
# CASH_FLOW : ADM, ANET, APD, PAYX, PHM, SBUX, VICI / BRK.B, BF.B
# EARNINGS : / BRK.B, BF.B
# BALANCE_SHEET : / BRK.B, BF.B
# INCOME_STATEMENT : CTAS, CTRA, DGX, DRI, KO, MCO, PEG, RF, SPGI, GEHC, CEG, OGN, FRC, VTRS, CARR, OTIS, FOX, FOXA / BRK.B, BF.B


error_function = 'INCOME_STATEMENT'
error_t = 'FOXA'

# The folder name uses the same upper-case spelling as the cells that read
# these JSON files, so the file is found on case-sensitive file systems too.
error_path = f'./SP500-dataset-all/{error_function}/{error_t}.json'
error_u = f'https://www.alphavantage.co/query?function={error_function}&symbol={error_t}&apikey={api_key}'
error_r = requests.get(error_u)
error_r_json = error_r.json()
with open(error_path, "w") as f:
    json.dump(error_r_json, f)

### 1.3. JSON to CSV

In [142]:
## INCOME_STATEMENT

# Folder holding one downloaded JSON file per ticker.
folder_path = './SP500-dataset-all/INCOME_STATEMENT'
# Only *.json entries are converted, so stray files in the folder are ignored.
file_paths = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.json')
)
# The CSVs go to the sibling "<folder>_csv" directory; create it if needed.
os.makedirs(f'{folder_path}_csv', exist_ok=True)

for i in file_paths:
    try:
        with open(i, 'r') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f'Error: Failed to decode JSON in {i}. {e}')
        continue
    # A payload without the quarterly section cannot be converted; report it
    # and keep going instead of stopping the whole run with a KeyError.
    if 'quarterlyReports' not in data:
        print(f'Error: no "quarterlyReports" in {i}. Keys: {list(data)}')
        continue
    filename_only = os.path.splitext(os.path.basename(i))[0]
    temp = pd.json_normalize(data['quarterlyReports'])
    temp.to_csv(f'{folder_path}_csv/{filename_only}.csv', index=False)

In [106]:
## BALANCE_SHEET

# Folder holding one downloaded JSON file per ticker.
folder_path = './SP500-dataset-all/BALANCE_SHEET'
# Only *.json entries are converted, so stray files in the folder are ignored.
file_paths = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.json')
)
# The CSVs go to the sibling "<folder>_csv" directory; create it if needed.
os.makedirs(f'{folder_path}_csv', exist_ok=True)

for i in file_paths:
    try:
        with open(i, 'r') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f'Error: Failed to decode JSON in {i}. {e}')
        continue
    # A payload without the quarterly section cannot be converted; report it
    # and keep going instead of stopping the whole run with a KeyError.
    if 'quarterlyReports' not in data:
        print(f'Error: no "quarterlyReports" in {i}. Keys: {list(data)}')
        continue
    filename_only = os.path.splitext(os.path.basename(i))[0]
    temp = pd.json_normalize(data['quarterlyReports'])
    temp.to_csv(f'{folder_path}_csv/{filename_only}.csv', index=False)

In [30]:
## CASH_FLOW

# Folder holding one downloaded JSON file per ticker.
folder_path = './SP500-dataset-all/CASH_FLOW'
# Only *.json entries are converted, so stray files in the folder are ignored.
file_paths = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.json')
)
# The CSVs go to the sibling "<folder>_csv" directory; create it if needed.
os.makedirs(f'{folder_path}_csv', exist_ok=True)

for i in file_paths:
    try:
        with open(i, 'r') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f'Error: Failed to decode JSON in {i}. {e}')
        continue
    # A payload without the quarterly section cannot be converted; report it
    # and keep going instead of stopping the whole run with a KeyError.
    if 'quarterlyReports' not in data:
        print(f'Error: no "quarterlyReports" in {i}. Keys: {list(data)}')
        continue
    filename_only = os.path.splitext(os.path.basename(i))[0]
    temp = pd.json_normalize(data['quarterlyReports'])
    temp.to_csv(f'{folder_path}_csv/{filename_only}.csv', index=False)

In [45]:
## EARNINGS

# Folder holding one downloaded JSON file per ticker.
folder_path = './SP500-dataset-all/EARNINGS'
# Only *.json entries are converted, so stray files in the folder are ignored.
file_paths = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.json')
)
# The CSVs go to the sibling "<folder>_csv" directory; create it if needed.
os.makedirs(f'{folder_path}_csv', exist_ok=True)

for i in file_paths:
    try:
        with open(i, 'r') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f'Error: Failed to decode JSON in {i}. {e}')
        continue
    # A payload without the quarterly section cannot be converted; report it
    # and keep going instead of stopping the whole run with a KeyError.
    if 'quarterlyEarnings' not in data:
        print(f'Error: no "quarterlyEarnings" in {i}. Keys: {list(data)}')
        continue
    filename_only = os.path.splitext(os.path.basename(i))[0]
    temp = pd.json_normalize(data['quarterlyEarnings'])
    temp.to_csv(f'{folder_path}_csv/{filename_only}.csv', index=False)

In [None]:
## OVERVIEW

# # 폴더 경로 지정
# folder_path = './SP500-dataset-all/OVERVIEW'
# # 폴더 내 모든 파일의 경로 가져오기
# file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path)]
# # 파일 목록 출력
# file_paths

# for i in file_paths : 
#     with open(i, 'r') as f:
#         data = json.load(f)
#     filename, ext = os.path.splitext(i)
#     filename_only = os.path.basename(filename)
#     temp = pd.json_normalize(data['quarterlyReports'])
#     temp.to_csv(f'{folder_path}_csv/{filename_only}.csv', index=False)

# 2. Merge

In [97]:
def generaterow(i, folder_path, keydate):
    """Expand one per-ticker report CSV to one row per calendar day.

    Reads ``<folder_path>/<stem of i>.csv``, uses the *keydate* column as a
    daily index running from the earliest date in the file up to today (or
    the latest date in the file, if that is later), forward-fills every
    column, and writes the result to ``<folder_path>_process/<stem>.csv``
    with *keydate* as the first column.

    Args:
        i: Path (or file name) of the CSV; only its stem is used.
        folder_path: Folder containing the CSV, e.g. the value of
            folder_path_INCOME_STATEMENT.
        keydate: Name of the date column: 'fiscalDateEnding', or
            'reportedDate' for EARNINGS.
    """
    filename_only = os.path.splitext(os.path.basename(i))[0]
    df = pd.read_csv(f'{folder_path}/{filename_only}.csv')

    df[keydate] = pd.to_datetime(df[keydate])
    # Rows without a date cannot be placed on the calendar, and repeated
    # dates would make the daily reindex fail; the first row listed in the
    # file wins for a repeated date.
    df = (
        df.dropna(subset=[keydate])
        .drop_duplicates(subset=[keydate])
        .set_index(keydate)
        .sort_index()
    )

    if not df.empty:
        # pd.Timestamp is used so this does not depend on whether the name
        # `datetime` refers to the module or to the class at call time.
        today = pd.Timestamp.today().normalize()
        last_day = max(df.index[-1], today)
        daily_index = pd.date_range(df.index[0], last_day, freq='D')
        # reindex adds the missing days as NaN rows; ffill then carries the
        # latest known values forward (including over NaNs in the input).
        df = df.reindex(daily_index).ffill()

    out_dir = f'{folder_path}_process'
    os.makedirs(out_dir, exist_ok=True)
    df.rename_axis(keydate).reset_index().to_csv(f'{out_dir}/{filename_only}.csv', index=False)


### 2.1. Merge를 위한 row 증폭 

In [143]:
# Folders holding the per-ticker CSV files of each statement type
folder_path_INCOME_STATEMENT = './SP500-dataset-all/INCOME_STATEMENT_csv'
folder_path_BALANCE_SHEET = './SP500-dataset-all/BALANCE_SHEET_csv'
folder_path_CASH_FLOW = './SP500-dataset-all/CASH_FLOW_csv'
folder_path_EARNINGS = './SP500-dataset-all/EARNINGS_csv'

# Paths of all entries inside each folder
file_paths_INCOME_STATEMENT = [os.path.join(folder_path_INCOME_STATEMENT, file) for file in os.listdir(folder_path_INCOME_STATEMENT)]
file_paths_BALANCE_SHEET = [os.path.join(folder_path_BALANCE_SHEET, file) for file in os.listdir(folder_path_BALANCE_SHEET)]
file_paths_CASH_FLOW = [os.path.join(folder_path_CASH_FLOW, file) for file in os.listdir(folder_path_CASH_FLOW)]
file_paths_EARNINGS = [os.path.join(folder_path_EARNINGS, file) for file in os.listdir(folder_path_EARNINGS)]

# Pre-processing for the merge: expand each file to daily rows.
# NOTE(review): only the INCOME_STATEMENT loop is active; the other three are
# commented out, presumably because they were run one at a time - the merge
# step needs the "_process" output of all four, so confirm before a full run.
for i in file_paths_INCOME_STATEMENT : 
    generaterow(i, folder_path_INCOME_STATEMENT, 'fiscalDateEnding')

# for i in file_paths_BALANCE_SHEET :
#     generaterow(i, folder_path_BALANCE_SHEET, 'fiscalDateEnding')

# for i in file_paths_CASH_FLOW :
#     generaterow(i, folder_path_CASH_FLOW, 'fiscalDateEnding')

# for i in file_paths_EARNINGS :
#     generaterow(i, folder_path_EARNINGS, 'reportedDate')


In [102]:
# Extra handling for the GPN EARNINGS file: same daily expansion as
# generaterow, but rows with a repeated reportedDate are dropped first.

df_i = pd.read_csv('./SP500-dataset-all/GPN.csv')
df_i.set_index(pd.to_datetime(df_i['reportedDate']), inplace=True)
df_i.drop_duplicates(subset=['reportedDate'], inplace=True)
# .ffill() replaces fillna(method='ffill'), which newer pandas deprecates.
df_i_resampled = df_i.resample('D').asfreq().ffill()
last_date = df_i_resampled.index[-1]
# pd.Timestamp is used so this does not depend on whether the name
# `datetime` refers to the module or to the class.
today = pd.Timestamp.today().normalize()
# [1:] skips last_date itself, which is already the final row.
date_range = pd.date_range(last_date, today, freq='D')[1:]
df_i_resampled = pd.concat([df_i_resampled, pd.DataFrame(index=date_range)], axis=0).ffill()
df_i_resampled.reset_index(inplace=True)
df_i_resampled.rename(columns={'index':'reportedDate'}, inplace=True)

# NOTE(review): the original 'reportedDate' column is still present next to
# the renamed index column, and the row index is written as well; the output
# layout is intentionally left unchanged here.
df_i_resampled.to_csv('./SP500-dataset-all/GPN_process.csv')

### 2.2. Merge

#### 2.2.1. INCOME_STATEMENT, BALANCE_SHEET, CASH_FLOW, EARNINGS 병합

In [147]:
# Collect the file stems (ticker names) found in any of the four folders.
# The EARNINGS entry must be the list of file paths; passing the folder-path
# string would be iterated character by character.
file_names = set()
for file_list in [file_paths_INCOME_STATEMENT, file_paths_BALANCE_SHEET, file_paths_CASH_FLOW, file_paths_EARNINGS]:
    file_names.update(os.path.splitext(os.path.basename(f))[0] for f in file_list)

os.makedirs('./SP500-dataset-all/MERGED', exist_ok=True)

for file_name in file_names:
    income_file = f'./SP500-dataset-all/INCOME_STATEMENT_csv_process/{file_name}.csv'
    balance_file = f'./SP500-dataset-all/BALANCE_SHEET_csv_process/{file_name}.csv'
    cash_flow_file = f'./SP500-dataset-all/CASH_FLOW_csv_process/{file_name}.csv'
    earnings_file = f'./SP500-dataset-all/EARNINGS_csv_process/{file_name}.csv'

    # A ticker is merged only when all four day-expanded files exist.
    if not all(os.path.exists(p) for p in (income_file, balance_file, cash_flow_file, earnings_file)):
        continue

    df1 = pd.read_csv(income_file)
    df2 = pd.read_csv(balance_file)
    df3 = pd.read_csv(cash_flow_file)
    df4 = pd.read_csv(earnings_file)
    # The three statements share fiscalDateEnding as key (inner join).
    merged_df = pd.merge(df1, df2, on='fiscalDateEnding')
    merged_df = pd.merge(merged_df, df3, on='fiscalDateEnding')
    # Earnings are aligned by date through their reportedDate index; the
    # outer join keeps dates present on either side.
    merged_df = pd.concat([merged_df.set_index('fiscalDateEnding'), df4.set_index('reportedDate')], axis=1, join='outer')
    merged_df.to_csv(f'./SP500-dataset-all/MERGED/{file_name}.csv')


#### 2.2.2. primary key name

In [None]:
# Ticker names = stems of the regular files found in the MERGED folder.
directory_path = './SP500-dataset-all/MERGED/'

file_names = {
    os.path.splitext(os.path.basename(f))[0]
    for f in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, f))
}

file_names

In [None]:
# Name the first column (the primary key) 'timestamp' and drop the
# fiscalDateEnding column, writing the result to MERGED_rename.
for file_name in file_names:
    df_rename = pd.read_csv(f'{directory_path}{file_name}.csv')
    first_column_name = df_rename.columns[0]
    df_rename = (
        df_rename
        .rename(columns={first_column_name: 'timestamp'})
        .drop(columns='fiscalDateEnding')
    )
    df_rename.to_csv(f'./SP500-dataset-all/MERGED_rename/{file_name}.csv', index=False)

In [None]:
# Compare the column sets of the AAL and GPN files.
type_allmost = pd.read_csv('./SP500-dataset-all/MERGED_rename/AAL.csv')
type_diff = pd.read_csv('./SP500-dataset-all/MERGED_rename/GPN.csv')

a = type_allmost.columns.tolist()
b = type_diff.columns.tolist()

# Columns that appear in exactly one of the two files
different_elements = set(a) ^ set(b)

# Print the differing elements.
print("Different elements between list1 and list2:")
for element in different_elements:
    print(element)

In [None]:
a

In [None]:
b

In [42]:
# Remove the two extra columns from the GPN file and overwrite it in place.
df = pd.read_csv('./SP500-dataset-all/MERGED_rename/GPN.csv')
df = df.drop(columns=['Unnamed: 0', 'reportedDate.1'])
df.to_csv('./SP500-dataset-all/MERGED_rename/GPN.csv', index=False)

#### 2.2.3. 주식가격 정보와 병합

In [148]:
# Ticker names = stems of the regular files in the price folder with the
# '-withRs' marker removed.
directory_path = './SP500-dataset-withRs/'

file_names = {
    os.path.splitext(os.path.basename(f))[0].replace('-withRs', '')
    for f in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, f))
}

In [None]:
# Join each ticker's price file with its merged fundamentals on 'timestamp'
# and write the result to MERGED_price.
os.makedirs('./SP500-dataset-all/MERGED_price', exist_ok=True)

for file_name in file_names:
    price_file = f'./SP500-dataset-withRs/{file_name}-withRs.csv'
    notprice_file = f'./SP500-dataset-all/MERGED_rename/{file_name}.csv'

    # Both inputs are required; tickers missing either file are skipped.
    if not (os.path.exists(price_file) and os.path.exists(notprice_file)):
        continue

    try:
        # on_bad_lines='skip' is the current spelling of the removed
        # error_bad_lines=False option: malformed lines are dropped.
        df1 = pd.read_csv(price_file, on_bad_lines='skip')
        df2 = pd.read_csv(notprice_file)
    except pd.errors.ParserError as e:
        print(f"ParserError in file: {file_name}")
    except Exception as e:
        print(f"Unexpected error in file: {file_name}, Error: {e}")
    else:
        df1 = df1.drop(columns='R')
        merged_df = pd.merge(df1, df2, on='timestamp', how='inner')
        merged_df.to_csv(f'./SP500-dataset-all/MERGED_price/{file_name}.csv', index=False)

# Author's note: this step excluded AMAT, TEL, EOG, CNC and VTR (none of
# these companies has data before 2018).

#### 2.2.4. 개별기업 파일 전처리(2018년 이후 데이터만 취급)

In [58]:
# Keep only rows from 2018-01-01 onward for every ticker and write them to
# the FINAL folder.
directory_path = './SP500-dataset-all/MERGED_price/'
# pd.Timestamp is used so these bounds do not depend on whether the name
# `datetime` refers to the module or to the class.
start_date = pd.Timestamp('2018-01-01')
end_date = pd.Timestamp('2022-04-30')

os.makedirs('./SP500-dataset-all/FINAL', exist_ok=True)

for f in os.listdir(directory_path):
    file_path = os.path.join(directory_path, f)
    file_name, _ = os.path.splitext(os.path.basename(f))

    if not os.path.isfile(file_path):
        continue

    df = pd.read_csv(file_path)
    if 'timestamp' not in df.columns:
        print(f"'timestamp' column not found in file: {file_name}")
        continue

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    min_date = df['timestamp'].min()
    max_date = df['timestamp'].max()

    # A file is rejected only when its data starts after start_date AND ends
    # before end_date.
    # NOTE(review): "incomplete range" would usually mean either condition
    # (or); left as-is because changing it alters which tickers reach FINAL.
    if min_date > start_date and max_date < end_date:
        print(f"Incomplete date range in file: {f}")
        continue

    df = df[df['timestamp'] >= start_date]
    df.to_csv(f'./SP500-dataset-all/FINAL/{file_name}.csv', index=False)

In [None]:
# Report the row count of every file in FINAL and remember the ones with
# fewer than 278 rows.
directory_path = './SP500-dataset-all/FINAL/'
csv_files = list(os.listdir(directory_path))
error_file = []
error_file_name = []

for csv_file in csv_files:
    file_path = os.path.join(directory_path, csv_file)
    name = os.path.splitext(csv_file)[0]
    df = pd.read_csv(file_path)
    row_count = df.shape[0]
    if not row_count >= 278:
        error_file.append(f'{csv_file}_{row_count}rows')
        error_file_name.append(name)
    print(f"Number of rows in {csv_file}: {row_count}")


# Author's note: ETN and PEG are to be removed; build two versions, one with
# only the tickers that have all 278 rows and one that also includes the
# incomplete ones.
error_file
error_file_name

### 2.3. 통합파일 생성
- 티커열 생성(2.3.1.)
- 278개 행이 모두 있는 버전, 불완전 행 기업도 포함한 버전 두 개의 파일을 생성(2.3.1.)
- 13주 붙임(2.3.2.)
- 값없는 열이 포함된 행 제거(2.3.3.)
- train set 분리(2.3.4.)

#### 2.3.1. 티커열생성 및 통합파일 생성

In [None]:
directory_path = './SP500-dataset-all/FINAL/'
csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

# Collect the per-ticker frames first and concatenate once at the end:
# DataFrame.append no longer exists in pandas 2.x, and appending inside the
# loop copies the growing frame on every iteration.
frames = []

for csv_file in csv_files:
    file_path = os.path.join(directory_path, csv_file)
    df = pd.read_csv(file_path)

    # The file name without extension becomes the leading 'ticker' column
    ticker = os.path.splitext(csv_file)[0]
    df.insert(0, 'ticker', ticker)
    frames.append(df)

# pd.concat rejects an empty list, so fall back to an empty frame
merged_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the combined data (all tickers, incomplete ones included)
merged_data.to_csv('./SP500-dataset-all/merged_data_witherror.csv', index=False)

In [None]:
# Same combination as the "witherror" file, but tickers listed in
# error_file_name are left out.
# Frames are collected and concatenated once: DataFrame.append no longer
# exists in pandas 2.x and is quadratic when used inside a loop.
excluded_tickers = set(error_file_name)
frames = []

for csv_file in csv_files:
    # The file name without extension is the ticker
    ticker = os.path.splitext(csv_file)[0]
    if ticker in excluded_tickers:
        continue

    file_path = os.path.join(directory_path, csv_file)
    df = pd.read_csv(file_path)
    df.insert(0, 'ticker', ticker)
    frames.append(df)

# pd.concat rejects an empty list, so fall back to an empty frame
merged_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the combined data (complete tickers only)
merged_data.to_csv('./SP500-dataset-all/merged_data_withouterror.csv', index=False)

#### 2.3.2. P, P_future, R, R_future 라벨링

In [None]:
# "witherror" file: attach the P / P_future / R / R_future labels
df1 = pd.read_csv('./SP500-dataset-all/merged_data_witherror.csv')
df2 = pd.read_csv('dataset_GPT_model.csv')

# Join the two tables on (ticker, timestamp)
merged_df = df1.merge(
    df2[['ticker', 'timestamp', 'P', 'P_future', 'R', 'R_future']],
    on=['ticker', 'timestamp'],
)
merged_df.to_csv('./SP500-dataset-all/merged_data_witherror_labeling.csv', index=False)

In [None]:
# "withouterror" file: attach the P / P_future / R / R_future labels
df1 = pd.read_csv('./SP500-dataset-all/merged_data_withouterror.csv')
df2 = pd.read_csv('dataset_GPT_model.csv')

# Join the two tables on (ticker, timestamp)
merged_df = df1.merge(
    df2[['ticker', 'timestamp', 'P', 'P_future', 'R', 'R_future']],
    on=['ticker', 'timestamp'],
)
merged_df.to_csv('./SP500-dataset-all/merged_data_withouterror_labeling.csv', index=False)

#### 2.3.3. 빈칸(NA)이 10개를 초과하는(11개 이상인) 행 제거, currency열 제거, 'None'을 0으로 대체

In [None]:
# "witherror" file: keep rows with at least (column count - 10) non-NA
# values, drop the currency columns, then replace NA and 'None' with 0.
df = pd.read_csv('./SP500-dataset-all/merged_data_witherror_labeling.csv')
df = (
    df.dropna(axis=0, thresh=df.shape[1] - 10)
    .drop(columns=['reportedCurrency_x', 'reportedCurrency_y', 'reportedCurrency'])
    .fillna(value=0)
    .replace('None', 0)
)
df.to_csv('./SP500-dataset-all/dataset_human_model_witherror.csv', index=False)

In [None]:
# "withouterror" file: keep rows with at least (column count - 10) non-NA
# values, drop the currency columns, then replace NA and 'None' with 0.
df = pd.read_csv('./SP500-dataset-all/merged_data_withouterror_labeling.csv')
df = (
    df.dropna(axis=0, thresh=df.shape[1] - 10)
    .drop(columns=['reportedCurrency_x', 'reportedCurrency_y', 'reportedCurrency'])
    .fillna(value=0)
    .replace('None', 0)
)
df.to_csv('./SP500-dataset-all/dataset_human_model_withouterror.csv', index=False)

#### 2.3.4. train set 분리 

In [96]:
# Training split of the "witherror" data: rows dated before 2021-08-31.
df = pd.read_csv('./SP500-dataset-all/dataset_human_model_witherror.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
# pd.Timestamp is used so the cutoff does not depend on whether the name
# `datetime` refers to the module or to the class.
train_set = df[df['timestamp'] < pd.Timestamp(2021, 8, 31)]
# NOTE(review): unlike the other exports this one also writes the row index
# (no index=False) - confirm whether downstream readers rely on that column.
train_set.to_csv('./SP500-dataset-all/dataset_train_human_model_witherror.csv')

In [97]:
# Training split of the "withouterror" data: rows dated before 2021-08-31.
df = pd.read_csv('./SP500-dataset-all/dataset_human_model_withouterror.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
# pd.Timestamp is used so the cutoff does not depend on whether the name
# `datetime` refers to the module or to the class.
train_set = df[df['timestamp'] < pd.Timestamp(2021, 8, 31)]
# NOTE(review): unlike the other exports this one also writes the row index
# (no index=False) - confirm whether downstream readers rely on that column.
train_set.to_csv('./SP500-dataset-all/dataset_train_human_model_withouterror.csv')