In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

In [2]:
def bind_data(folder_path, start_year, end_year, skiprows=4):
    """Read one CSV per year and stack them into a single DataFrame.

    Files are expected at ``<folder_path>/s_<year>.csv`` for every year in
    ``start_year..end_year`` (both ends inclusive) and are read as UTF-8.

    Parameters
    ----------
    folder_path : str
        Directory that holds the yearly CSV files.
    start_year, end_year : int
        First and last year to load (inclusive).
    skiprows : int, default 4
        Passed to ``pandas.read_csv``; number of leading lines to skip
        in each file before the header row.

    Returns
    -------
    pandas.DataFrame
        The yearly frames concatenated in year order. Each frame keeps its
        own row labels, so index values repeat across years.
    """
    yearly_frames = [
        pd.read_csv(
            os.path.join(folder_path, 's_' + str(year) + '.csv'),
            encoding='utf-8',
            skiprows=skiprows,
        )
        for year in range(start_year, end_year + 1)
    ]
    return pd.concat(yearly_frames)

In [23]:
# Load the yearly exports (2004-2021) and stack them into one frame.
# NOTE: the folder below is a machine-specific absolute path.
folder_path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/eol/BCP/securities/'
df = bind_data(folder_path, 2004, 2021)

# Keep only the columns needed downstream and give them English names.
# The two lists are positional: new_cols[i] is the new label for the i-th
# selected source column.
df = df[[
    '[証券コード(提出者)]', '[EDINETコード(提出者)]', '[企業名(提出者)]',
    '[業種（東証）(提出者)]', '[上場市場(提出者)]', '[決算日]', '[提出日]',
    '[書類種類]', '[ヒット文書]'
]]
new_cols = [
    'stock_code', 'edinet_code', 'firm_name', 'industry', 'market', 'fiscal_ymd',
    'submit_ymd', 'doc_type', 'hit_doc'
]
# Assign the labels directly: set_axis(..., inplace=True) is rejected by
# pandas >= 2.0, while plain attribute assignment works on every version.
df.columns = new_cols

# Drop rows whose securities code is the '--' placeholder.
df = df[df['stock_code'] != '--']

# Derive fiscal year and month from the fiscal-date text.
# Assumes the value renders as 'YYYY<sep>MM...' (year in chars 0-3, month in
# chars 5-6) -- TODO confirm against the export format.
fiscal_text = df['fiscal_ymd'].astype(str)
df = df.assign(
    year=fiscal_text.str[0:4].astype('int64'),
    # Integer conversion already copes with a leading zero ('04' -> 4), so no
    # special-casing of zero-padded months is needed.
    month=fiscal_text.str[5:7].astype('int64'),
)
# Clean up the document information
def fix_doc_info(text):
    """Return the part of ``text`` that follows the first '【', minus every '】'.

    Everything up to and including the first '【' is discarded. From the
    remainder, each '】' character is removed; all other characters
    (including any further '【') are kept in order. If ``text`` contains no
    '【', the result is an empty string.
    """
    chars = iter(text)

    # Consume characters up to and including the first opening bracket.
    for ch in chars:
        if ch == '【':
            break

    # Whatever is left on the iterator is the tail after that bracket.
    return ''.join(ch for ch in chars if not ch == '】')

# Strip the full-width '（ＨＴＭＬ）' marker from the document type and add a
# cleaned copy of the hit-document text. The .str accessor leaves missing
# values as missing instead of raising on them.
df = df.assign(
    doc_type=df['doc_type'].str.replace('（ＨＴＭＬ）', '', regex=False),
    hit_doc_fixed=df['hit_doc'].apply(fix_doc_info),
)

# Make firms unique within each year: for every (stock_code, year) pair only
# the first row in the current row order is kept. The index is renumbered so
# row labels are unique from here on.
df = (
    df.drop_duplicates(subset=['stock_code', 'year'], keep='first')
    .reset_index(drop=True)
)

# BCP disclosure flag: every remaining row is marked 1.
df['BCP'] = 1

# First year in which each stock_code appears, broadcast back to every row
# of that stock_code.
df['BCP_first_year'] = df.groupby('stock_code')['year'].transform('min')

# Sort by firm, then year.
df = df.sort_values(['stock_code', 'year'])

# Save as a cp932-encoded CSV with a header row and without the index.
# NOTE: the destination below is a machine-specific absolute path.
save_path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/eol/BCP/securities/BCP_data.csv'
df.to_csv(save_path, encoding='cp932', header=True, index=False)

In [25]:
# Display the column labels of df as the cell's output.
df.columns

Index(['stock_code', 'edinet_code', 'firm_name', 'industry', 'market',
       'fiscal_ymd', 'submit_ymd', 'doc_type', 'hit_doc', 'year', 'month',
       'hit_doc_fixed', 'BCP', 'BCP_first_year'],
      dtype='object')