2021-05-06 [ADD] 네 번째 분석 (이해강)

In [81]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

# Work around broken Korean text rendering in matplotlib by choosing a font
# that depends on the host operating system.
import platform
from matplotlib import font_manager, rc

# Turn off the Unicode minus glyph for axis tick labels.
plt.rcParams['axes.unicode_minus'] = False

os_name = platform.system()
if os_name == 'Windows':
    # Look up the family name from the font file, then register it.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif os_name == 'Darwin':
    rc('font', family='appleGothic')
else:
    # No font is configured on any other platform; only a notice is printed.
    print('Unknown system.... sorry.....')

# 데이터 불러오기

In [82]:
# Load company_nps_data.csv into a DataFrame.
# 'ansi' is registered as a codec alias only on Windows, so the previous call
# raised LookupError on macOS/Linux. 'cp949' is the platform-independent name
# of the Korean Windows code page.
# NOTE(review): assumes the file was written on a Korean-locale Windows
# machine (where ANSI == cp949) — confirm against the actual file.
df = pd.read_csv('company_nps_data.csv', encoding='cp949')
df

Unnamed: 0,회사ID,연매출액,년도,월,월별_연금보험료,월별_직원수
0,233757,41688077,2018,7,8912380,36
1,123464,1760695,2018,3,942380,5
2,342124,3221341,2018,7,781180,4
3,386420,4815584,2015,11,3795900,14
4,129243,9799736,2018,10,40724680,151
...,...,...,...,...,...,...
99796,171439,4630622,2017,2,2402740,11
99797,239388,1045021,2018,10,1428800,9
99798,272304,181898,2018,3,392560,4
99799,362194,2100218,2018,12,874780,3


## 피벗 테이블로 '연매출액','회사ID','년도' 만 포함된 데이터프레임 생성

In [83]:
# Reshape to one row per company ID ('회사ID') and one column per year ('년도');
# each cell holds the annual revenue ('연매출액'), aggregated with pivot_table's
# default aggfunc.
ndf = df.pivot_table(values='연매출액', index='회사ID', columns='년도')

# Convert the index labels and the column labels to strings.
ndf.index = ndf.index.map(str)
ndf.columns = ndf.columns.map(str)
ndf

년도,2015,2016,2017,2018,2019
회사ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
63,,,,1.132878e+10,
95,,,,6.569474e+08,
102,,,,7.426589e+08,
295,,,,8.377670e+07,
414,,,,1.973277e+07,
...,...,...,...,...,...
2400284,,,,1.263406e+06,
2437574,,,,2.140160e+05,
2693684,,,,,238076.0
3312354,,,,,1451563.0


## 연평균 성장률 변수 중 기간(n)을 구하기

In [84]:
# Period (n) used later as the exponent base: the number of years that have a
# revenue value, minus one.
#
# The previous version looped over the rows and wrote through
# `ndf['기간'][i] = ...`. That is chained assignment using a positional integer
# on a string index: it relies on deprecated fallback indexing and silently
# does nothing when pandas Copy-on-Write is enabled. It also counted the
# helper column itself and compensated by subtracting 2. The vectorised form
# below counts only the year columns and subtracts 1, giving the same values.
year_columns = [col for col in ndf.columns if col != '기간']
ndf['기간'] = ndf[year_columns].notna().sum(axis=1) - 1
ndf

년도,2015,2016,2017,2018,2019,기간
회사ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
63,,,,1.132878e+10,,0
95,,,,6.569474e+08,,0
102,,,,7.426589e+08,,0
295,,,,8.377670e+07,,0
414,,,,1.973277e+07,,0
...,...,...,...,...,...,...
2400284,,,,1.263406e+06,,0
2437574,,,,2.140160e+05,,0
2693684,,,,,238076.0,0
3312354,,,,,1451563.0,0


## 2015년과 2019년을 기준으로 결측치들을 앞과 뒤의 값들로 채워줌 (기간을 미리 구해놨기 때문에 가능)

In [85]:
# Work on the year columns only, so the '기간' column is not affected by the
# filling below.
year_columns = ['2015', '2016', '2017', '2018', '2019']

# Fill gaps along each row (across years): forward-fill first, then back-fill
# whatever is still missing at the start, so every company has a value in both
# 2015 and 2019.
#
# The previous version called `ndf1.T.fillna(method=..., inplace=True)`, which
# mutates a temporary transposed frame. That only reached `ndf1` when `.T`
# happened to return a view, and `fillna(method=...)` is deprecated. Filling
# with axis=1 and assigning the result is explicit and version-independent.
ndf1 = ndf[year_columns].ffill(axis=1).bfill(axis=1)

# Re-attach the period column computed before the gaps were filled.
ndf1['기간'] = ndf['기간']
ndf1

년도,2015,2016,2017,2018,2019,기간
회사ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
63,1.132878e+10,1.132878e+10,1.132878e+10,1.132878e+10,1.132878e+10,0
95,6.569474e+08,6.569474e+08,6.569474e+08,6.569474e+08,6.569474e+08,0
102,7.426589e+08,7.426589e+08,7.426589e+08,7.426589e+08,7.426589e+08,0
295,8.377670e+07,8.377670e+07,8.377670e+07,8.377670e+07,8.377670e+07,0
414,1.973277e+07,1.973277e+07,1.973277e+07,1.973277e+07,1.973277e+07,0
...,...,...,...,...,...,...
2400284,1.263406e+06,1.263406e+06,1.263406e+06,1.263406e+06,1.263406e+06,0
2437574,2.140160e+05,2.140160e+05,2.140160e+05,2.140160e+05,2.140160e+05,0
2693684,2.380760e+05,2.380760e+05,2.380760e+05,2.380760e+05,2.380760e+05,0
3312354,1.451563e+06,1.451563e+06,1.451563e+06,1.451563e+06,1.451563e+06,0


## 기간이 0인 데이터들 소거

In [86]:
# Keep only the rows whose period ('기간') is non-zero.
has_period = ndf1['기간'].ne(0)
ndf1 = ndf1[has_period]
ndf1

년도,2015,2016,2017,2018,2019,기간
회사ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
14271,1199495.0,4257715.0,5005248.0,8388401.0,9039340.0,4
44851,939238.0,939238.0,939238.0,1189946.0,1189946.0,1
59533,194704.0,207762.0,124299.0,144614.0,144614.0,3
122968,37415377.0,26858719.0,32997924.0,31514468.0,31514468.0,3
122983,24179108.0,20292461.0,9470987.0,9470987.0,9470987.0,2
...,...,...,...,...,...,...
515562,254818.0,384628.0,384628.0,384628.0,384628.0,1
515837,770107.0,490090.0,490090.0,490090.0,490090.0,1
516607,2586020.0,2586020.0,2586020.0,1959259.0,1959259.0,1
516653,1745520.0,1745520.0,1956460.0,1500046.0,1500046.0,2


## 연평균 성장률 계산 후, 유니콘 데이터 기준 min()값인 -28.54 미만의 데이터 소거

In [87]:
# Compound annual growth rate in percent, rounded to two decimals:
#   ((revenue_2019 / revenue_2015) ** (1 / period) - 1) * 100
revenue_ratio = ndf1['2019'] / ndf1['2015']
inverse_period = 1 / ndf1['기간']
ndf1['연평균 성장률(%)'] = ((revenue_ratio ** inverse_period - 1) * 100).round(2)

# Keep rows whose rate is at least -28.54 (the section heading describes this
# as the minimum observed in the unicorn reference data).
meets_cagr_floor = ndf1['연평균 성장률(%)'].ge(-28.54)
ndf1 = ndf1[meets_cagr_floor]
ndf1

년도,2015,2016,2017,2018,2019,기간,연평균 성장률(%)
회사ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14271,1199495.0,4257715.0,5005248.0,8388401.0,9039340.0,4,65.69
44851,939238.0,939238.0,939238.0,1189946.0,1189946.0,1,26.69
59533,194704.0,207762.0,124299.0,144614.0,144614.0,3,-9.44
122968,37415377.0,26858719.0,32997924.0,31514468.0,31514468.0,3,-5.56
123082,6765218.0,6765218.0,6638972.0,6001508.0,6001508.0,2,-5.81
...,...,...,...,...,...,...,...
514934,384313.0,384313.0,422440.0,556216.0,556216.0,2,20.30
515264,1489321.0,1489321.0,2074820.0,2422405.0,2422405.0,2,27.53
515562,254818.0,384628.0,384628.0,384628.0,384628.0,1,50.94
516607,2586020.0,2586020.0,2586020.0,1959259.0,1959259.0,1,-24.24


## 유니콘 데이터 기준 연매출액 min() 값인 99100 미만 데이터 소거

In [88]:
# Keep only companies whose revenue is at least 99100 in every year from 2015
# to 2019.
yearly_revenue = ndf1[['2015', '2016', '2017', '2018', '2019']]
ndf1 = ndf1.loc[yearly_revenue.ge(99100).all(axis=1)]

## 1년씩 성장률 계산

In [89]:
# Year-over-year growth in percent, rounded to one decimal, for each pair of
# consecutive years: (next - previous) / previous * 100.
growth_spec = [
    ('15-16년 성장률', '2015', '2016'),
    ('16-17년 성장률', '2016', '2017'),
    ('17-18년 성장률', '2017', '2018'),
    ('18-19년 성장률', '2018', '2019'),
]
for growth_label, prev_year, next_year in growth_spec:
    change = ndf1[next_year] - ndf1[prev_year]
    ndf1[growth_label] = ((change / ndf1[prev_year]) * 100).round(1)
ndf1

년도,2015,2016,2017,2018,2019,기간,연평균 성장률(%),15-16년 성장률,16-17년 성장률,17-18년 성장률,18-19년 성장률
회사ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
14271,1199495.0,4257715.0,5005248.0,8388401.0,9039340.0,4,65.69,255.0,17.6,67.6,7.8
44851,939238.0,939238.0,939238.0,1189946.0,1189946.0,1,26.69,0.0,0.0,26.7,0.0
59533,194704.0,207762.0,124299.0,144614.0,144614.0,3,-9.44,6.7,-40.2,16.3,0.0
122968,37415377.0,26858719.0,32997924.0,31514468.0,31514468.0,3,-5.56,-28.2,22.9,-4.5,0.0
123082,6765218.0,6765218.0,6638972.0,6001508.0,6001508.0,2,-5.81,0.0,-1.9,-9.6,0.0
...,...,...,...,...,...,...,...,...,...,...,...
514934,384313.0,384313.0,422440.0,556216.0,556216.0,2,20.30,0.0,9.9,31.7,0.0
515264,1489321.0,1489321.0,2074820.0,2422405.0,2422405.0,2,27.53,0.0,39.3,16.8,0.0
515562,254818.0,384628.0,384628.0,384628.0,384628.0,1,50.94,50.9,0.0,0.0,0.0
516607,2586020.0,2586020.0,2586020.0,1959259.0,1959259.0,1,-24.24,0.0,0.0,-24.2,0.0


## 계산된 성장률의 max()값으로 해당기간 성장률 최고치 계산

In [90]:
# Peak growth per company: the largest of the four year-over-year growth rates.
#
# The previous version filled the column row by row through
# `ndf1['성장률 최고치'][i] = max(...)`. That is chained assignment using a
# positional integer on a string index: it relies on deprecated fallback
# indexing and has no effect when pandas Copy-on-Write is enabled. A row-wise
# max over the growth columns yields the same values without the Python loop.
growth_columns = ['15-16년 성장률', '16-17년 성장률', '17-18년 성장률', '18-19년 성장률']
ndf1['성장률 최고치'] = ndf1[growth_columns].max(axis=1)

## 유니콘 데이터 기준 성장률 최고치 min() 값인 39.2% 미만 데이터 소거

In [92]:
# Keep only companies whose peak growth ('성장률 최고치') is at least 39.2.
meets_peak_floor = ndf1['성장률 최고치'].ge(39.2)
ndf1 = ndf1[meets_peak_floor]
ndf1

년도,2015,2016,2017,2018,2019,기간,연평균 성장률(%),15-16년 성장률,16-17년 성장률,17-18년 성장률,18-19년 성장률,성장률 최고치
회사ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
14271,1199495.0,4257715.0,5005248.0,8388401.0,9039340.0,4,65.69,255.0,17.6,67.6,7.8,255.0
123548,1373237.0,1373237.0,1975096.0,1975096.0,1975096.0,1,43.83,0.0,43.8,0.0,0.0,43.8
123882,289034.0,289034.0,1044575.0,5293947.0,5293947.0,2,327.97,0.0,261.4,406.8,0.0,406.8
124223,2400795.0,3381835.0,3473757.0,3473757.0,3473757.0,2,20.29,40.9,2.7,0.0,0.0,40.9
124386,621771.0,621771.0,1295805.0,1010237.0,1010237.0,2,27.47,0.0,108.4,-22.0,0.0,108.4
...,...,...,...,...,...,...,...,...,...,...,...,...
507742,4078442.0,4215278.0,7932297.0,11163636.0,11163636.0,3,39.88,3.4,88.2,40.7,0.0,88.2
508524,480548.0,481276.0,481276.0,691497.0,691497.0,2,19.96,0.2,0.0,43.7,0.0,43.7
513441,496125.0,496125.0,496125.0,822899.0,822899.0,1,65.87,0.0,0.0,65.9,0.0,65.9
515264,1489321.0,1489321.0,2074820.0,2422405.0,2422405.0,2,27.53,0.0,39.3,16.8,0.0,39.3


### 최종 데이터가 5000개에서 768개로 줄어든 것을 알 수 있다.

# 분석결과 1.
- 연평균 성장률(%) TOP 50 데이터

In [93]:
# Show the 50 largest compound annual growth rates, highest first.
cagr_desc = ndf1['연평균 성장률(%)'].sort_values(ascending=False)
cagr_desc.iloc[:50]

회사ID
177997    1451.48
237193     979.16
413403     834.29
432907     727.52
206055     700.00
243409     566.91
446886     443.21
359487     435.92
254150     432.73
310421     408.40
435512     391.43
267318     390.24
475113     386.21
237075     380.42
246630     377.87
281400     359.42
156945     351.47
123882     327.97
172962     326.08
251353     320.27
186856     302.97
448247     292.53
270460     291.33
258975     274.07
252545     265.96
408351     255.35
288613     255.05
250882     254.05
478219     245.80
233340     239.84
437305     238.84
336600     238.72
228689     238.49
357606     237.33
242470     235.06
394749     231.41
406773     231.08
332914     230.06
231400     221.81
412652     221.74
237311     216.01
151715     214.19
412523     213.80
257336     210.31
414905     203.12
306768     202.66
451016     200.39
146761     199.95
244614     199.58
216845     195.38
Name: 연평균 성장률(%), dtype: float64

# 분석결과 2.
- 성장률 최고치(%) TOP 50 데이터

In [94]:
# Show the 50 largest peak growth values, highest first.
peak_desc = ndf1['성장률 최고치'].sort_values(ascending=False)
peak_desc.iloc[:50]

회사ID
249266    3059.8
233340    1470.9
177997    1451.5
294620    1428.0
237075    1123.3
432907    1083.9
335237    1065.0
237193     979.2
435512     886.5
413403     834.3
441107     782.0
250016     715.6
250522     714.6
206055     700.0
311024     671.1
244964     648.4
243945     612.3
258975     591.8
243409     566.9
482272     554.5
247210     524.7
490685     512.1
442467     497.9
448247     495.8
431317     490.4
445499     455.1
446886     443.2
359487     435.9
394749     434.2
254150     432.7
429429     418.8
470994     411.7
206215     408.8
310421     408.4
123882     406.8
238192     405.8
131485     405.3
267318     390.2
475113     386.2
246630     377.9
430906     376.7
441735     376.3
281400     359.4
446390     358.2
156945     351.5
280586     348.4
280967     346.6
439585     341.3
239452     330.6
172962     326.1
Name: 성장률 최고치, dtype: float64