In [1]:
import pandas as pd
import sqlite3
import warnings
import re
from datetime import datetime
warnings.filterwarnings(action='ignore')

In [2]:
# Let pandas render up to 1000 rows and 1000 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [3]:
# Read the columns used by this analysis from the `pension` table of the
# SQLite database at ../db/hr.db into a DataFrame.
con = sqlite3.connect("../db/hr.db")
try:
    df = pd.read_sql_query("SELECT 자료생성년월, 사업장명, 사업자등록번호, 사업장업종코드명, 가입자수, 신규취득자수, 상실가입자수, 당월고지금액  from pension", con)
finally:
    # The connection is only needed for this one query; close it explicitly
    # so the database file handle is released even if the read fails.
    con.close()

In [4]:
# List the distinct 자료생성년월 values to see which date formats occur in the data.
df['자료생성년월'].unique()

array(['2023-01', '2022-12', '2022-11', '2022-10', '2022-09', '2022-08',
       'Jul-22', 'Jun-22', 'May-22', 'Apr-22', 'Mar-22', 'Feb-22',
       'Jan-22', '2023-02', '2023-03', 'Jan-21', 'Feb-21', 'Mar-21',
       'Apr-21', 'May-21', 'Jun-21', 'Jul-21', 'Aug-21', 'Sep-21',
       'Oct-21', 'Nov-21', 'Dec-21', '2023-04'], dtype=object)

In [5]:
import functools # not required, but helps in production
def unpack_df_columns(func):
    """Adapt a multi-argument function for row-wise ``DataFrame.apply``.

    ``df[subset].apply(wrapped, axis=1)`` passes each row to the wrapped
    callable as a single pandas Series. The wrapper spreads that Series'
    values out as positional arguments, so ``func`` must accept as many
    positional arguments as ``df[subset]`` has columns, in column order.

    Only the first positional argument given to the wrapper is used; any
    further positional or keyword arguments are ignored.
    """

    @functools.wraps(func)
    def _unpack_df_columns(*args, **kwargs):
        # The row being processed arrives as the first positional argument.
        row = args[0]

        # One value per selected column, forwarded positionally to func.
        column_values = row.values
        return func(*column_values)

    return _unpack_df_columns

In [6]:
# English month abbreviations mapped to zero-padded month numbers.
_MONTH_NUMBER_BY_ABBR = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}


def date_change(val):
    """Normalize a 'Mon-YY' date label (e.g. 'Jul-22') to 'YYYY-MM'.

    Any month abbreviation and any two-digit year is converted, rather than
    only a fixed list of literal values, so labels such as 'Aug-22' or
    'Jan-23' are handled as well. Two-digit years are interpreted as 20YY.

    Parameters
    ----------
    val : object
        The raw label. Expected to be a string such as 'Jul-22' or
        '2022-08'.

    Returns
    -------
    object
        'YYYY-MM' when ``val`` is a 'Mon-YY' string with a known English
        month abbreviation; otherwise ``val`` unchanged (already-normalized
        strings, unknown formats, and non-string values).
    """
    if not isinstance(val, str):
        return val

    matched = re.fullmatch(r'([A-Z][a-z]{2})-([0-9]{2})', val)
    if matched is None:
        return val

    month_number = _MONTH_NUMBER_BY_ABBR.get(matched.group(1))
    if month_number is None:
        # Three letters that are not a month name, e.g. 'Foo-22'.
        return val

    return f'20{matched.group(2)}-{month_number}'

In [7]:
def 월평균가입자수(가입자수):
    """Return the subscriber count divided by 12, truncated toward zero."""
    monthly_mean = 가입자수 / 12
    return int(monthly_mean)

In [8]:
@unpack_df_columns
def get_평균소득월액(고지액, 인원):
    """Return (billed amount / headcount) / 0.09, truncated to an int.

    Both arguments are converted with ``int()`` first, so numeric strings
    are accepted. Raises ZeroDivisionError when ``인원`` converts to 0.
    """
    # NOTE(review): 0.09 is presumably the pension contribution rate used to
    # back out income from the billed amount — confirm.
    billed_per_person = int(고지액) / int(인원)
    return int(billed_per_person / 0.09)

In [9]:
# df1 = df.copy()
# df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15108651 entries, 0 to 15108650
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   자료생성년월    object
 1   사업장명      object
 2   사업자등록번호   int64 
 3   사업장업종코드명  object
 4   가입자수      int64 
 5   신규취득자수    int64 
 6   상실가입자수    int64 
 7   당월고지금액    int64 
dtypes: int64(5), object(3)
memory usage: 922.2+ MB


## 경쟁사 Data Load

In [13]:
# Load the competitor list from ./company.xlsx (path relative to the working
# directory); the displayed frame has 사업장명, 사업자등록번호 and 약식명 columns.
comp_df = pd.read_excel("./company.xlsx")
comp_df

Unnamed: 0,사업장명,사업자등록번호,약식명
0,현대자동차(주),101810,현대차
1,현대모비스(주),101811,모비스
2,삼성전자(주),124810,삼성전자
3,두산밥캣주식회사,201863,두산밥캣
4,볼보그룹코리아(주),609813,볼보코리아
5,볼보그룹코리아(주)건설기계(서울),120851,볼보코리아
6,두산산업차량주식회사,802880,두산산차
7,(주)대동,514810,대동
8,(주)한양정밀,131810,한양정밀
9,(주)모트롤,263810,모트롤


In [14]:
# comp_df["사업장명"].tolist()

In [15]:
# Keep only rows whose workplace name AND business registration number both
# appear in the competitor list. The mask is built from `df` itself: `df1`
# is only created in a commented-out cell, so referencing it raises
# NameError on a fresh kernel.
df = df.loc[
    df["사업장명"].isin(comp_df["사업장명"])
    & df["사업자등록번호"].isin(comp_df["사업자등록번호"])
]

In [17]:
# Left-join the competitor list onto the pension rows (matching on name and
# registration number) so each row also carries comp_df's other columns.
df = df.merge(comp_df, how='left', on=['사업장명', '사업자등록번호'])

## 날짜 변환

In [10]:
# Normalize every 자료생성년월 label through date_change, element by element.
df['자료생성년월'] = df['자료생성년월'].map(date_change)

In [11]:
# Re-check the distinct 자료생성년월 values after applying date_change.
df['자료생성년월'].unique()

array(['2023-01', '2022-12', '2022-11', '2022-10', '2022-09', '2022-08',
       '2022-07', '2022-06', '2022-05', '2022-04', '2022-03', '2022-02',
       '2022-01', '2023-02', '2023-03', '2021-01', '2021-02', '2021-03',
       '2021-04', '2021-05', '2021-06', '2021-07', '2021-08', '2021-09',
       '2021-10', '2021-11', '2021-12', '2023-04'], dtype=object)

## 그룹핑

In [19]:
# Sum the count and billing columns per company short name and month,
# keeping the grouping keys as ordinary columns.
gdf = (
    df.groupby(['약식명', '자료생성년월'], as_index=False)
    [['가입자수', '신규취득자수', '상실가입자수', '당월고지금액']]
    .sum()
)

In [20]:
# Display the grouped frame.
gdf

Unnamed: 0,약식명,자료생성년월,가입자수,신규취득자수,상실가입자수,당월고지금액
0,HCE,2021-01,1355,7,14,567216740
1,HCE,2021-02,1374,33,12,576385980
2,HCE,2021-03,1364,2,10,572718340
3,HCE,2021-04,1391,37,10,579448720
4,HCE,2021-05,1390,9,15,578930280
5,HCE,2021-06,1412,37,11,586023040
6,HCE,2021-07,1444,43,16,608691280
7,HCE,2021-08,1464,36,12,616270700
8,HCE,2021-09,1467,15,11,617099080
9,HCE,2021-10,1524,68,14,640065040


In [21]:
# Rename by explicit old -> new mapping rather than by position, so a change
# in upstream column order cannot silently mislabel a column. 약식명 and
# 당월고지금액 keep their names.
gdf = gdf.rename(columns={
    "자료생성년월": "기준일자",
    "가입자수": "직원수",
    "신규취득자수": "신규취득",
    "상실가입자수": "자격상실",
})

In [22]:
# Feed (당월고지금액, 직원수) of each row, as strings, to get_평균소득월액 and
# store the result as a new column.
billing_and_headcount = gdf[["당월고지금액", "직원수"]].astype('str')
gdf["평균소득월액"] = billing_and_headcount.apply(get_평균소득월액, axis=1)

In [23]:
# Show gdf's column dtypes and non-null counts.
gdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   약식명     349 non-null    object
 1   기준일자    349 non-null    object
 2   직원수     349 non-null    int64 
 3   신규취득    349 non-null    int64 
 4   자격상실    349 non-null    int64 
 5   당월고지금액  349 non-null    int64 
 6   평균소득월액  349 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 19.2+ KB


In [24]:
# Preview the first rows of gdf.
gdf.head()

Unnamed: 0,약식명,기준일자,직원수,신규취득,자격상실,당월고지금액,평균소득월액
0,HCE,2021-01,1355,7,14,567216740,4651223
1,HCE,2021-02,1374,33,12,576385980,4661054
2,HCE,2021-03,1364,2,10,572718340,4665349
3,HCE,2021-04,1391,37,10,579448720,4628554
4,HCE,2021-05,1390,9,15,578930280,4627740


In [25]:
# Per-company mean of each monthly metric, rounded to whole numbers.
요약통계 = (
    gdf.groupby(['약식명'])[['직원수', '신규취득', '자격상실', '평균소득월액']]
    .mean()
    .round()
    .reset_index()
)
요약통계

Unnamed: 0,약식명,직원수,신규취득,자격상실,평균소득월액
0,HCE,1377.0,20.0,23.0,4854863.0
1,HDI,2708.0,29.0,34.0,5088650.0
2,HDX,417.0,40.0,5.0,5108059.0
3,대동,1188.0,47.0,38.0,4395464.0
4,두산밥캣,130.0,4.0,3.0,5099539.0
5,두산산차,599.0,38.0,9.0,4763478.0
6,모비스,10816.0,112.0,61.0,5067476.0
7,모트롤,498.0,5.0,5.0,5078225.0
8,볼보코리아,1434.0,12.0,10.0,4970988.0
9,삼성전자,113069.0,1049.0,514.0,5166166.0


In [26]:
# List the distinct 기준일자 values present in gdf.
gdf['기준일자'].unique()

array(['2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06',
       '2021-07', '2021-08', '2021-09', '2021-10', '2021-11', '2021-12',
       '2022-01', '2022-02', '2022-03', '2022-04', '2022-05', '2022-06',
       '2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12',
       '2023-01', '2023-02', '2023-03', '2023-04'], dtype=object)

In [27]:
import pickle
# Serialize gdf to pickle_df2.pickle in the working directory.
with open("pickle_df2.pickle", 'wb') as pickle_file:
    pickle.dump(gdf, pickle_file)

In [4]:
import plotly.express as px
# NOTE(review): this rebinds `df` to plotly's gapminder sample filtered to
# Oceania, replacing the frame the earlier cells work on; it looks like a
# leftover scratch cell — confirm whether it belongs in this notebook.
df = px.data.gapminder().query("continent == 'Oceania'")
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
60,Australia,Oceania,1952,69.12,8691212,10039.59564,AUS,36
61,Australia,Oceania,1957,70.33,9712569,10949.64959,AUS,36
62,Australia,Oceania,1962,70.93,10794968,12217.22686,AUS,36
63,Australia,Oceania,1967,71.1,11872264,14526.12465,AUS,36
64,Australia,Oceania,1972,71.93,13177000,16788.62948,AUS,36
65,Australia,Oceania,1977,73.49,14074100,18334.19751,AUS,36
66,Australia,Oceania,1982,74.74,15184200,19477.00928,AUS,36
67,Australia,Oceania,1987,76.32,16257249,21888.88903,AUS,36
68,Australia,Oceania,1992,77.56,17481977,23424.76683,AUS,36
69,Australia,Oceania,1997,78.83,18565243,26997.93657,AUS,36
