# Correlation Clustering between stocks

In [1]:
from pathlib import Path
import sys
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# import streamlit as st

from collections import defaultdict, Counter

In [2]:
plt.rcParams['font.family'] = 'NanumGothic'

In [3]:
import FinanceDataReader as fdr
import quantstats as qs

In [4]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

In [5]:
## custom libs

from korquanttools.pricevolume.loader import KRXPriceDM
from korquanttools.pricevolume.utils import DateUtil
from korquanttools.pricevolume.config import PathConfig

## 1. Import datasets

### Transaction volume & adjusted close data

- Import price-volume data module from KRX using custom lib `korquanttools`
- Import related datasets

In [6]:
# Global parameters

START = 20140101
END = 20220520

In [7]:
# Init data module
pricevolume = KRXPriceDM(START, END)
pricevolume.get_info()


        * DM name: KRX_pricevolume
        * DM description: Basic price-volume data imported from KRX website & NAVER finance. Has KOSPI, KOSDAQ, KONEX stocks.
        * birthday: 20211203
        * DM period: 19990101 ~ 
        * Available data: ['lv1', 'open', 'high', 'low', 'close', 'volume', 'dollarvolume', 'marketcap']
        


In [8]:
dollarvolume_df = pricevolume.get_data('dollarvolume')
dollarvolume_df = dollarvolume_df.astype(float)

In [9]:
holidays = dollarvolume_df.isnull().all(axis=1)
tradingdays = ~holidays

holidays = holidays.index[holidays]
tradingdays = tradingdays.index[tradingdays]

In [10]:
adjClose_df = pd.read_pickle(PathConfig.cache_path / f"temp_adjClose_{START}_{END}")
return_df = pd.read_pickle(PathConfig.cache_path / f"temp_return_{START}_{END}")

In [25]:
DateUtil.intDate_2_timestamp(END) - 

return_df.loc['2020-01-01':, :]

ISU_SRT_CD,000020,000040,000050,000060,000070,000075,000080,000087,000100,000105,...,37550L,388050,389140,405640,412930,413600,415580,419270,389260,399720
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,0.010830,0.010591,0.032328,-0.039216,0.004478,-0.012615,-0.003448,-0.008596,-0.021123,0.009640,...,,,,,,,,,,
2020-01-03,-0.004762,-0.010480,0.001044,0.000000,-0.008915,0.000000,-0.012111,-0.005780,-0.004316,-0.002387,...,,,,,,,,,,
2020-01-06,-0.021531,-0.022065,-0.005214,-0.029155,-0.037481,0.002323,0.003503,0.002907,-0.017362,-0.019141,...,,,,,,,,,,
2020-01-07,-0.002445,-0.028881,0.002096,-0.015015,0.010903,-0.004635,-0.006981,-0.002899,0.017669,0.007318,...,,,,,,,,,,
2020-01-08,-0.028186,-0.014870,-0.030335,-0.015244,-0.049307,-0.020955,-0.026362,-0.017442,-0.019517,-0.009686,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-16,0.008511,-0.001263,0.013029,0.017711,0.001225,0.006400,0.009859,0.007194,-0.006873,0.005310,...,-0.012285,-0.005764,-0.003953,-0.002283,0.002283,0.000000,-0.002309,-0.011211,,
2022-05-17,0.004219,-0.003793,-0.006431,0.024096,0.001224,0.000000,0.030683,0.009524,0.000000,-0.001761,...,0.000000,0.011594,0.007937,0.000000,0.004556,0.030488,-0.002315,0.004535,0.028674,
2022-05-18,0.008403,-0.006345,-0.016181,0.039216,-0.007335,0.001590,-0.004060,-0.002358,0.013841,0.007055,...,-0.013682,0.011461,0.001969,0.002288,-0.002268,0.001972,0.002320,-0.002257,-0.045296,
2022-05-19,-0.008333,-0.003831,0.003289,-0.018868,-0.016010,-0.011111,-0.019022,-0.018913,-0.006826,-0.008757,...,-0.030265,0.000000,-0.027505,-0.002283,-0.006818,-0.019685,0.000000,0.002262,0.299270,


### security id - to - name 


In [11]:
with open('kospi_ii2codename_combined.pickle', 'rb') as p:
    kospi_sid2name = pickle.load(p)

In [12]:
with open('kosdaq_ii2codename_combined.pickle', 'rb') as p:
    kosdaq_sid2name = pickle.load(p)

In [13]:
sid2name = {**kospi_sid2name, **kosdaq_sid2name}

In [14]:
# with open('sid2name.pkl', 'wb') as p:
#     pickle.dump(sid2name, p)

# 2. Preprocess data to create lab environment 

Only clean data used

In [15]:
return_df = return_df.loc[tradingdays, :]

In [16]:
howmanydays = 252

recent_return_df = return_df.iloc[-howmanydays:, :].copy()
recent_return_df.dropna(axis=1, inplace=True)


In [17]:
recent_return_df.columns[(recent_return_df == 0).all(axis=0)]
# 거래정지 종목들. 아마 FDR에서 return가져오면서 섞여들어온 것들이 아닌가 싶다. 

Index(['003620', '006580', '009730', '010580', '011690', '012600', '015540',
       '016670', '021820', '024830', '032790', '033340', '033790', '038340',
       '043590', '052770', '054220', '056000', '056730', '058220', '058420',
       '058530', '060300', '064520', '065560', '069110', '078590', '083470',
       '099520', '101680', '103230', '106080', '109070', '114190', '121890',
       '126870', '127160', '138360', '141020', '158310', '182690', '900100',
       '180400', '103660', '160600', '178780', '223310', '208860', '176440',
       '215600', '263540', '263920', '950160', '257370', '309900'],
      dtype='object', name='ISU_SRT_CD')

In [18]:
recent_return_df = recent_return_df.loc[:, (recent_return_df != 0).any(axis=0)]
# There are sid with 0.0 all day. Remove them

In [19]:
recent_return_df.shape

(252, 2446)

In [26]:
# recent_return_df.to_pickle('recent252_return_df.pkl')

Only Top N00 used

- metric: dollarvolume
- calculation method: the most recent @@ days average

In [617]:
TOP = 200

In [618]:
recent_dv_df = dollarvolume_df.iloc[-howmanydays:, :].copy()
mean_dv_s = recent_dv_df.mean(axis=0)
mean_dv_s.dropna(inplace=True)
top_sid_list = list(mean_dv_s.sort_values()[-TOP:].index)

In [619]:
recent_return_df = recent_return_df[recent_return_df.columns.intersection(top_sid_list)].copy()

### Calculate correlation matrix

In [20]:
return_corr_df = recent_return_df.corr(method='pearson')
return_corr_df

ISU_SRT_CD,000020,000040,000050,000060,000070,000075,000080,000087,000100,000105,...,378850,950220,059270,294570,333620,351330,367480,373340,361610,377400
ISU_SRT_CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000020,1.000000,0.176975,0.067970,-0.061092,0.065096,0.065429,0.088100,0.059320,0.234996,0.245992,...,0.113706,0.190060,0.387558,0.202113,0.149201,0.259301,0.018944,-0.002990,0.088787,0.053572
000040,0.176975,1.000000,0.305702,0.057206,0.291463,0.289151,0.348878,0.239266,0.282387,0.344444,...,0.251704,0.214052,0.126283,0.146537,0.321289,0.262753,0.028516,0.080077,0.174766,0.024963
000050,0.067970,0.305702,1.000000,0.103863,0.147829,0.187276,0.181420,0.189016,0.127331,0.183388,...,0.225200,0.130399,0.220922,0.186181,0.199186,0.151240,-0.029986,0.018479,0.145210,-0.037371
000060,-0.061092,0.057206,0.103863,1.000000,0.206322,0.182085,0.064822,0.061655,0.091872,0.084379,...,0.085146,-0.064838,0.072264,0.089808,0.048641,0.106697,0.008310,0.013131,0.143957,-0.026710
000070,0.065096,0.291463,0.147829,0.206322,1.000000,0.692895,0.302850,0.291966,0.347349,0.320762,...,0.225644,0.268488,0.087510,0.269600,0.230988,0.337842,-0.067337,0.016798,0.159134,-0.099578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351330,0.259301,0.262753,0.151240,0.106697,0.337842,0.217872,0.219619,0.068630,0.294359,0.357279,...,0.296278,0.268716,0.226408,0.278992,0.346724,1.000000,0.085025,0.060551,0.191163,0.097515
367480,0.018944,0.028516,-0.029986,0.008310,-0.067337,-0.072044,0.153634,0.045628,0.080323,0.048637,...,0.089146,0.070406,-0.002748,-0.007907,-0.032707,0.085025,1.000000,0.420906,0.037742,0.580578
373340,-0.002990,0.080077,0.018479,0.013131,0.016798,0.049500,0.065630,0.089725,0.123946,0.112921,...,0.170741,0.153805,0.045512,0.025768,-0.035332,0.060551,0.420906,1.000000,-0.011285,0.756228
361610,0.088787,0.174766,0.145210,0.143957,0.159134,0.143745,0.121087,0.120054,0.250797,0.240627,...,0.125564,0.265800,0.045281,0.225969,0.151837,0.191163,0.037742,-0.011285,1.000000,0.008454


In [21]:
return_corr_df.to_pickle('return_corr_df.pkl')

In [621]:
corr_sid_list = list(return_corr_df.columns)

In [622]:
return_corr_arr = return_corr_df.values.tolist()
return_corr_arr = return_corr_arr

## Clustering with various algorithms

In [623]:
n_clusters = 21 # daum 금융 업종별 시세 업종 갯수 # https://finance.daum.net/domestic/sectors
random_seed = 123


In [624]:

# kmeans++
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    random_state=random_seed,
    )

# agllomerative (hierarchial) 
agglomerative = AgglomerativeClustering(
    n_clusters=n_clusters,
    )

# DBSCAN
dbscan = DBSCAN()

### Test metric

다음 금융의 업종별 주식들에서 거래량순으로 정렬하여 가져온 종목들

In [625]:
curdir = Path('.').resolve()
curdir

WindowsPath('E:/VSCodeProjects/korquanttools/notebooks')

In [626]:
sector_pickle_files = list(curdir.glob('D*.pkl'))

sector_count = len(sector_pickle_files)

In [627]:
sectorname2sidlist = {}

for sector_pkl in sector_pickle_files:
    sector_name = sector_pkl.stem
    sector_name = sector_name[9:]

    df = pd.read_pickle(sector_pkl)
    try:
        sid_list = list(df['symbolCode'])
        sid_list = [sid[1:] for sid in sid_list]
        sectorname2sidlist[sector_name] = sid_list
    except:
        print(f'skipped {sector_name}')




skipped 은행


In [628]:
sectorname2sidlist.keys()

dict_keys(['음식료품', '섬유의복', '종이목재', '화학', '의약품', '비금속광물', '철강및금속', '기계', '전기전자', '의료정밀', '운수장비', '유통업', '전기가스업', '건설업', '운수창고', '통신업', '금융업', '증권', '보험', '서비스업'])

In [629]:
for sectorname, sector_sid_list in sectorname2sidlist.items():
    print(f'{sectorname}: {len(sector_sid_list)}')

음식료품: 47
섬유의복: 25
종이목재: 19
화학: 100
의약품: 53
비금속광물: 26
철강및금속: 52
기계: 42
전기전자: 75
의료정밀: 8
운수장비: 65
유통업: 66
전기가스업: 10
건설업: 37
운수창고: 28
통신업: 5
금융업: 100
증권: 31
보험: 15
서비스업: 97


### KMeans++

In [630]:
kmeans.fit(return_corr_arr)

In [631]:
kmeans.labels_

array([14, 13, 19, 13, 17, 13, 16, 16,  5, 10,  2,  1,  2,  9,  2, 15, 11,
       17, 19, 10, 10,  1, 11,  3, 13, 17,  4, 13, 13, 17, 11,  5,  4, 10,
        2, 13,  1, 17,  9,  9, 16,  2,  2, 10, 16, 18,  3,  2,  0,  5,  2,
       11, 13,  2,  1, 14,  2, 16,  0, 18, 11,  9,  4,  9,  4,  2,  1,  0,
        0,  3,  0,  5,  3,  2, 10, 12, 12, 12,  3,  5,  6, 13, 16,  8, 12,
        6,  2, 20, 15,  1,  1,  5, 19, 16,  7, 15, 20,  6,  6,  8, 15, 20,
       13, 11,  6, 18, 18,  8, 11,  5, 18,  2, 11,  7, 20,  1, 10,  1, 11,
        8,  5,  6, 18,  2,  6, 14,  3,  7,  5,  3,  6, 19, 11,  8, 12,  5,
       14, 10, 16,  6, 17,  6, 18,  8, 10,  4,  3, 10,  6, 10, 20, 14, 18,
       15,  8, 18,  6,  1, 15,  4,  2,  2, 20, 10,  1, 20, 15, 11,  7, 11,
        9, 11,  3,  8,  1, 12, 20, 15,  1])

In [632]:
kmeans.cluster_centers_

array([[ 0.04690687,  0.16337888,  0.00127276, ...,  0.06487384,
        -0.01913044,  0.13085249],
       [ 0.07046294,  0.185173  , -0.01695385, ...,  0.12528891,
         0.10246934,  0.20530808],
       [ 0.05784338,  0.28232827,  0.01483411, ...,  0.11172948,
         0.02144846,  0.1704861 ],
       ...,
       [ 0.0212189 ,  0.13368946, -0.06487292, ...,  0.15753721,
         0.19506831,  0.15035165],
       [-0.06740809, -0.15059825,  0.73603369, ..., -0.0852758 ,
        -0.01163947, -0.01283959],
       [-0.01150862,  0.12822159, -0.04023092, ...,  0.42611388,
         0.03184884,  0.1450911 ]])

### Agglomerative Clustering

In [633]:
agglomerative.fit(return_corr_arr)

In [634]:
agglomerative.labels_

array([ 9, 20, 17, 13,  7, 13,  1, 15, 16,  3,  1,  2,  8,  4,  8,  6,  0,
        7, 17,  2, 19,  2, 12,  0, 20,  7, 14, 13, 13,  7,  0, 16, 14,  3,
        8, 13,  2,  7,  8,  8,  2,  7,  7,  2,  1,  6,  0,  8,  1, 16,  8,
       12, 20,  3,  2,  9,  8, 15,  1,  6, 12, 19, 14,  2, 14,  1,  2,  1,
        1, 18,  1, 16, 13,  8,  2,  4,  4,  4, 18,  1,  5,  0, 19,  4,  4,
        5,  3, 18,  6,  2,  0, 16, 17, 19, 11,  6,  5,  5,  5, 18,  6, 18,
        8, 12,  5, 10, 10,  5, 12, 16,  6, 20, 12, 11, 18,  2,  3,  0, 12,
        5, 16,  5,  6,  8,  5,  9, 18, 11, 16,  0,  5, 17, 12, 18,  4, 16,
        9,  2, 15,  5,  7,  5,  6,  5,  3, 14,  0,  2,  5,  3,  5,  9, 10,
        6,  5, 10,  5,  0,  6, 14,  8,  8,  5,  3,  0,  5,  6, 12, 11, 12,
        8, 12,  0,  5,  2,  4,  5,  6,  0], dtype=int64)

### DBSCAN

In [635]:
dbscan.fit(return_corr_arr)

In [636]:
dbscan.labels_

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int64)

## Test performance

### KMeans++

In [637]:
sid2group = {}

for sid, group in zip(corr_sid_list, kmeans.labels_):
    sid2group[sid] = group


In [638]:
sector2group = {}
sector2sidlist = {}

for sectorname, sector_sid_list in sectorname2sidlist.items():
    sector2group[sectorname] = [sid2group[sid] for sid in sector_sid_list if sid in sid2group]
    sector2sidlist[sectorname] = [sid for sid in sector_sid_list if sid in sid2group]

In [639]:
rank1_ratios = []
rankn_ratios = []

for i, (sectorname, sector_group_list) in enumerate(sector2group.items()):
    # plt.figure(i)
    # sns.histplot(sector_group_list,).set(title=sectorname)

    counter = Counter(sector_group_list)
    most_common = counter.most_common(n=3)
    total_count = len(sector_group_list)

    if not most_common:
        print(f'{sectorname} most_common: {most_common}')
        continue

    rank1_count = most_common[0][1]
    rankn_count = sum([x[1] for x in most_common])

    rank1_ratio = rank1_count / total_count
    rankn_ratio = rankn_count / total_count

    rank1_ratios.append(rank1_ratio)
    rankn_ratios.append(rankn_ratio)

    print(f'{sectorname}, Rank#1: {rank1_ratio}, Rank#N: {rankn_ratio}, Total: {total_count}')

print(f'Rank#1 avg: {np.mean(rank1_ratios)}, Rank#N avg: {np.mean(rankn_ratios)}')

음식료품, Rank#1: 1.0, Rank#N: 1.0, Total: 1
섬유의복, Rank#1: 0.5, Rank#N: 1.0, Total: 2
종이목재 most_common: []
화학, Rank#1: 0.2857142857142857, Rank#N: 0.6666666666666666, Total: 21
의약품, Rank#1: 0.5, Rank#N: 0.875, Total: 8
비금속광물, Rank#1: 0.5, Rank#N: 1.0, Total: 2
철강및금속, Rank#1: 0.4, Rank#N: 1.0, Total: 5
기계, Rank#1: 0.5, Rank#N: 1.0, Total: 4
전기전자, Rank#1: 0.35294117647058826, Rank#N: 0.7058823529411765, Total: 17
의료정밀, Rank#1: 1.0, Rank#N: 1.0, Total: 1
운수장비, Rank#1: 0.5, Rank#N: 1.0, Total: 8
유통업, Rank#1: 0.3333333333333333, Rank#N: 1.0, Total: 3
전기가스업, Rank#1: 0.3333333333333333, Rank#N: 1.0, Total: 3
건설업, Rank#1: 0.5, Rank#N: 1.0, Total: 4
운수창고, Rank#1: 0.5, Rank#N: 1.0, Total: 4
통신업, Rank#1: 1.0, Rank#N: 1.0, Total: 2
금융업, Rank#1: 0.5714285714285714, Rank#N: 1.0, Total: 7
증권, Rank#1: 1.0, Rank#N: 1.0, Total: 1
보험 most_common: []
서비스업, Rank#1: 0.3333333333333333, Rank#N: 0.6666666666666666, Total: 9
Rank#1 avg: 0.5616713352007471, Rank#N avg: 0.9396786492374728


### Agglomerative

In [640]:
sid2group = {}

for sid, group in zip(corr_sid_list, agglomerative.labels_):
    sid2group[sid] = group


In [641]:
sector2group = {}
sector2sidlist = {}

for sectorname, sector_sid_list in sectorname2sidlist.items():
    sector2group[sectorname] = [sid2group[sid] for sid in sector_sid_list if sid in sid2group]
    sector2sidlist[sectorname] = [sid for sid in sector_sid_list if sid in sid2group]

In [642]:
rank1_ratios = []
rankn_ratios = []

for i, (sectorname, sector_group_list) in enumerate(sector2group.items()):
    # plt.figure(i)
    # sns.histplot(sector_group_list,).set(title=sectorname)

    counter = Counter(sector_group_list)
    most_common = counter.most_common(n=3)
    total_count = len(sector_group_list)

    if not most_common:
        print(f'{sectorname} most_common: {most_common}')
        continue

    rank1_count = most_common[0][1]
    rankn_count = sum([x[1] for x in most_common])

    rank1_ratio = rank1_count / total_count
    rankn_ratio = rankn_count / total_count

    rank1_ratios.append(rank1_ratio)
    rankn_ratios.append(rankn_ratio)

    print(f'{sectorname}, Rank#1: {rank1_ratio}, Rank#N: {rankn_ratio}, Total: {total_count}')

print(f'Rank#1 avg: {np.mean(rank1_ratios)}, Rank#N avg: {np.mean(rankn_ratios)}')

음식료품, Rank#1: 1.0, Rank#N: 1.0, Total: 1
섬유의복, Rank#1: 0.5, Rank#N: 1.0, Total: 2
종이목재 most_common: []
화학, Rank#1: 0.3333333333333333, Rank#N: 0.7142857142857143, Total: 21
의약품, Rank#1: 0.375, Rank#N: 0.875, Total: 8
비금속광물, Rank#1: 0.5, Rank#N: 1.0, Total: 2
철강및금속, Rank#1: 0.4, Rank#N: 0.8, Total: 5
기계, Rank#1: 0.5, Rank#N: 1.0, Total: 4
전기전자, Rank#1: 0.35294117647058826, Rank#N: 0.7647058823529411, Total: 17
의료정밀, Rank#1: 1.0, Rank#N: 1.0, Total: 1
운수장비, Rank#1: 0.375, Rank#N: 1.0, Total: 8
유통업, Rank#1: 0.3333333333333333, Rank#N: 1.0, Total: 3
전기가스업, Rank#1: 0.3333333333333333, Rank#N: 1.0, Total: 3
건설업, Rank#1: 0.5, Rank#N: 1.0, Total: 4
운수창고, Rank#1: 0.5, Rank#N: 1.0, Total: 4
통신업, Rank#1: 1.0, Rank#N: 1.0, Total: 2
금융업, Rank#1: 0.5714285714285714, Rank#N: 1.0, Total: 7
증권, Rank#1: 1.0, Rank#N: 1.0, Total: 1
보험 most_common: []
서비스업, Rank#1: 0.3333333333333333, Rank#N: 0.7777777777777778, Total: 9
Rank#1 avg: 0.5504279489573608, Rank#N avg: 0.9406538541342463


In [659]:
sectorname = '서비스업'
result = []

for sid, group in zip(sector2sidlist[sectorname], sector2group[sectorname]):
    result.append({
        'sectorname': sectorname,
        'sid': sid,
        'name': sid2name[sid],
        'group_id': group,
    })

In [660]:
df = pd.DataFrame(result)
df['count'] = df.groupby('group_id')['group_id'].transform('count')
df = df.sort_values('count', ascending=False)
df

Unnamed: 0,sectorname,sid,name,group_id,count
0,서비스업,35420,NAVER,4,3
1,서비스업,35720,카카오,4,3
2,서비스업,352820,하이브,4,3
3,서비스업,36570,엔씨소프트,5,2
5,서비스업,52690,한전기술,16,2
6,서비스업,251270,넷마블,5,2
8,서비스업,130660,한전산업,16,2
4,서비스업,28050,삼성엔지니어링,1,1
7,서비스업,36420,제이콘텐트리,18,1
