In [2]:
from coredotfinance.data import KrxReader
import pandas as pd
import numpy as np
import requests
import lxml
from bs4 import BeautifulSoup
from requests import Response
import re
import matplotlib.pyplot as plt
from pykrx import stock
from pykrx import bond
import time
import math
import seaborn as sns
import plotly.express as px



## Step 1. 데이터 수집
###### 기업의 업종, 매출액, 시가총액의 데이터가 필요했습니다. 이 3가지의 데이터를 한번에 제공하는 적절한 API가 없어서 네이버 파이낸스에서 직접 크롤링해서 수집하였습니다.
###### 
###### 업로드된 데이터를 이용해서 진행할 경우에는 아래의 코드를 실행할 필요가 없습니다!


In [3]:

# Crawl Naver Finance: for every sector ("upjong") collect each listed company's
# financial fields and store the combined table in upjong.xlsx.

# Browser-like User-Agent sent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

# Seconds to wait for a response, so a stalled connection cannot hang the cell forever.
REQUEST_TIMEOUT = 10

# --- 1) Sector index page: collect the id ("no") and name of every sector ---
finance_url = "https://finance.naver.com/sise/sise_group.naver?type=upjong"
res = requests.get(finance_url, headers=headers, timeout=REQUEST_TIMEOUT)
res.raise_for_status()  # fail loudly instead of parsing an error page
page_soup = BeautifulSoup(res.text, "lxml")
summary_html = page_soup.select('td a')

# The regex runs over the stringified anchor list and captures (no, name) pairs.
upjong_list = pd.DataFrame(re.findall('no=(.+?)">(.+?)</a>',str(summary_html)))
upjong_list = upjong_list.rename(columns={0:"no",1:"upjong"}).sort_values(by=['upjong']).reset_index(drop=True)
upjong_list.no = upjong_list.no.str.zfill(3)

# --- 2) Per-sector detail pages ---
# The requested fields are split over two URLs (four fieldIds each), so each
# sector is fetched twice and the two tables are joined side by side.
request_upjong_url = "https://finance.naver.com/sise/field_submit.naver?menu=upjong&returnUrl=http%3A%2F%2Ffinance.naver.com%2Fsise%2Fsise_group_detail.naver%3Ftype%3Dupjong%26no%3D"
request_url1 = "&fieldIds=market_sum&fieldIds=property_total&fieldIds=debt_total&fieldIds=sales"
request_url2 = "&fieldIds=operating_profit&fieldIds=net_income&fieldIds=eps&fieldIds=dividend"

# Collect one frame per sector and concatenate once at the end instead of
# growing a DataFrame inside the loop.
sector_frames = []
for i in (upjong_list['no'].to_list()):
    field_frames = []
    for j in ([request_url1,request_url2]):
        res = requests.get(request_upjong_url+i+j, headers=headers, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        page_soup = BeautifulSoup(res.text, "lxml")
        upjong_data_html = page_soup.select_one('#contentarea > div:nth-child(5)')
        if upjong_data_html is None:
            # Page layout changed or the request was blocked; stop with a clear message.
            raise RuntimeError("sector table not found for sector no=%s" % i)
        # Column headers, minus the company-name and discussion-board columns
        # (those carry no td.number cells).
        col = [item.get_text().strip() for item in upjong_data_html.select('thead th')]
        col = [x for x in col if x not in ['종목명','토론실']]
        # Non-empty anchor texts inside tbody are used as the row index.
        row = [item.get_text().strip() for item in upjong_data_html.select('tbody a')]
        row = list(filter(None,row))
        # Flat list of numeric cell texts, reshaped in place to (rows, columns).
        num = np.array([item.get_text().strip() for item in upjong_data_html.select('td.number')])
        num.resize(len(row),len(col))
        df = pd.DataFrame(num, columns=col,index=row).drop(columns=['현재가','전일비','등락률'])
        field_frames.append(df)

    temp = pd.concat(field_frames, axis = 1)
    # Tag the sector name once, after the join. Adding it to both per-request
    # frames produced two identically named 'Sector' columns.
    temp['Sector'] = upjong_list[upjong_list.no==i].upjong.iat[0]
    sector_frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame.
upjong = pd.concat(sector_frames, axis = 0) if sector_frames else pd.DataFrame()

with pd.ExcelWriter('upjong.xlsx', engine='openpyxl') as writer:
    upjong.to_excel(writer)

In [3]:
# Load the prepared market data, drop rows lacking operating profit ('영업이익')
# or sales ('매출액'), and turn those two comma-formatted text columns into ints.
marketData = (
    pd.read_excel('marketData.xlsx', engine='openpyxl')
    .dropna(subset=['영업이익', '매출액'])
    .assign(
        영업이익=lambda frame: frame['영업이익'].str.replace(',','').astype(int),
        매출액=lambda frame: frame['매출액'].str.replace(',','').astype(int),
    )
)

def bar(df, x, y, title=" "):
    """Render a Plotly Express bar chart and display it.

    Args:
        df: Data passed to ``px.bar``; bars are colored by ``'Sector'``.
        x: Value forwarded as the ``x`` argument.
        y: Value forwarded as the ``y`` argument.
        title: Chart title (defaults to a single space).

    Returns:
        None. The figure is shown, not returned.
    """
    layout = dict(height=350, width=1000)
    chart = px.bar(df, x=x, y=y, title=title, color='Sector', **layout)
    chart.show()

def hist(x_values, x_label=" "):
    """Render a Plotly Express histogram and display it.

    Args:
        x_values: Data handed to ``px.histogram`` as its first argument.
        x_label: Value forwarded as the ``x`` argument (defaults to a single space).

    Returns:
        None. The figure is shown, not returned.
    """
    layout = dict(height=350, width=600)
    px.histogram(x_values, x=x_label, **layout).show()
    
def boxplot(df, x, y):
    """Render a Plotly Express box plot with every point drawn and display it.

    Boxes are colored by ``'Sector'`` and the ``'종목명'`` column of ``df`` is
    attached as hover data.

    Args:
        df: Data passed to ``px.box``; it is indexed with ``'종목명'``.
        x: Value forwarded as the ``x`` argument.
        y: Value forwarded as the ``y`` argument.

    Returns:
        None. The figure is shown, not returned.
    """
    hover_columns = [df["종목명"]]
    chart = px.box(
        df,
        x=x,
        y=y,
        color="Sector",
        points="all",
        hover_data=hover_columns,
        width=1000,
        height=400,
    )
    chart.show()

# Set the market you want to analyze here!
market = "KOSPI"

# Rows of the chosen market ('시장구분'), ordered by market cap ('시가총액').
in_market = marketData['시장구분'] == market
companies = marketData.loc[in_market].sort_values(by='시가총액')

# Distribution of market cap across the selected market.
hist(companies['시가총액'], "시가총액")



In [10]:
# Market-cap ('시가총액') sum, mean and variance per sector for the selected market.
companies = marketData[marketData['시장구분'] == market].sort_values(by=['시가총액'])
hist(companies, "시가총액")

# Upper market-cap bound used to drop the largest companies. The bound depends
# on the market (KOSPI 10**12, KOSDAQ 10**11), so it is looked up from `market`
# instead of being edited by hand; any other value falls back to the KOSPI bound.
CAP_LIMIT_BY_MARKET = {"KOSPI": 1.0*10**12, "KOSDAQ": 1.0*10**11}
cap_limit = CAP_LIMIT_BY_MARKET.get(market, 1.0*10**12)
companies = companies.loc[companies['시가총액'] < cap_limit]

# One grouped view reused for all three aggregations; each result also carries
# the number of companies per sector in 'count' and is sorted ascending.
cap_by_sector = companies.groupby('Sector')['시가총액']
groupby_sum_cap = cap_by_sector.agg(시가총액='sum', count='count').reset_index().sort_values("시가총액")
groupby_mean_cap = cap_by_sector.agg(시가총액='mean', count='count').reset_index().sort_values("시가총액")
groupby_var_cap = cap_by_sector.agg(시가총액='var',count='count').reset_index().sort_values("시가총액")

# Keep only sectors with more than this many companies.
MIN_COMPANIES_PER_SECTOR = 5
groupby_mean_cap = groupby_mean_cap.loc[groupby_mean_cap['count'] > MIN_COMPANIES_PER_SECTOR]

# Frames are sorted ascending, so the last 10 rows are the 10 largest.
sum_top10_cap = groupby_sum_cap[-10:]
mean_top10_cap = groupby_mean_cap[-10:]
# Previously this was assigned groupby_mean_cap (copy-paste slip), so the
# "variance" chart would have plotted means. The same minimum-size filter as
# the mean table is applied, which also removes the NaN variance of
# single-company sectors.
var_top10_cap = groupby_var_cap.loc[groupby_var_cap['count'] > MIN_COMPANIES_PER_SECTOR][-10:]

# NOTE(review): the chart titles say '20조이하' while the filter above uses
# cap_limit; confirm the intended wording/units.
#bar(sum_top10_cap, 'Sector', '시가총액', title='20조이하 기업들의 업종별 시가총액 합')
bar(mean_top10_cap, 'Sector', '시가총액', title='20조이하 기업들의 업종별 시가총액 평균')
#bar(var_top10_cap, 'Sector', '시가총액', title='20조이하 기업들의 업종별 시가총액 분산')


# Per-company distribution for the sectors shown in the mean chart.
filtered = companies.loc[companies['Sector'].isin(mean_top10_cap["Sector"])]
boxplot(filtered, "Sector", "시가총액")


In [11]:

# Sales ('매출액') sum and mean per sector for the selected market (KOSPI or KOSDAQ).
companies = marketData[marketData['시장구분'] == market]

hist(companies, "매출액")

# Upper sales bound used to drop the largest companies. The bound depends on
# the market (KOSPI 1*10**4, KOSDAQ 2*10**3), so it is looked up from `market`
# instead of being edited by hand; any other value falls back to the KOSPI bound.
SALES_LIMIT_BY_MARKET = {"KOSPI": 1*10**4, "KOSDAQ": 2*10**3}
sales_limit = SALES_LIMIT_BY_MARKET.get(market, 1*10**4)
companies = companies.loc[companies['매출액'] < sales_limit]

# One grouped view reused for both aggregations; results are sorted ascending.
sales_by_sector = companies.groupby('Sector')['매출액']
groupby_sum_sales = sales_by_sector.agg(매출액='sum').reset_index().sort_values(by=['매출액'])
groupby_mean_sales = sales_by_sector.agg(매출액='mean').reset_index().sort_values(by=['매출액'])

# Frames are sorted ascending, so the last 10 rows are the 10 largest.
sum_top10 = groupby_sum_sales[-10:]
mean_top10 = groupby_mean_sales[-10:]

bar(sum_top10, 'Sector', '매출액', title= '%s 업종별 매출액 합'%market)
bar(mean_top10, 'Sector', '매출액', title= '%s 업종별 매출액 평균'%market)


In [12]:
# Join the per-sector mean market cap (with its 'count') to the per-sector
# sales sum; only sectors present in both tables are kept.
df_KOPSI = groupby_mean_cap.merge(
    groupby_sum_sales,
    how='inner',
    on='Sector',
)

In [455]:
# Sector scatter of market cap vs. sales, with marker size taken from 'count'.
# The cell used to read `df`, which no analysis cell defines with these columns
# (a leftover from an earlier session), so it failed on Restart & Run All.
# It now reads the merged sector table df_KOPSI.
fig = px.scatter(df_KOPSI, x='시가총액', y='매출액', hover_data=[df_KOPSI["Sector"]], color="Sector", size="count", width=1000, height=400)
fig.show()

In [13]:
# Sector scatter: market cap on x, sales on y, one color per sector,
# marker size taken from the 'count' column.
fig = px.scatter(
    df_KOPSI,
    x='시가총액',
    y='매출액',
    color="Sector",
    size="count",
    hover_data=[df_KOPSI["Sector"]],
    width=800,
    height=400,
)
fig.show()