In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [43]:
# Load the dataset into the working frame; the file is decoded with the cp949 codec.
df = pd.read_csv('data.csv', encoding='cp949')

In [44]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,기준년월,시군구명,읍면동명,성별,연령대,업종명,결제건수,결제금액,데이터기준일자
0,2020-11,권선구,고색동,남,10대,레져업소,20,106900,2020-12-11
1,2020-11,권선구,고색동,남,10대,보건위생,3,29000,2020-12-11
2,2020-11,권선구,고색동,남,10대,서적문구,1,5600,2020-12-11
3,2020-11,권선구,고색동,남,10대,약국,1,13600,2020-12-11
4,2020-11,권선구,고색동,남,10대,유통업영리,70,2160910,2020-12-11


## 데이터 전처리 및 요약

- 불필요한 열 제거('데이터기준일자')

In [45]:
# Discard the reference-date column, which the notes flag as unnecessary,
# then preview the result.
df = df.drop(columns=['데이터기준일자'])
df.head()

Unnamed: 0,기준년월,시군구명,읍면동명,성별,연령대,업종명,결제건수,결제금액
0,2020-11,권선구,고색동,남,10대,레져업소,20,106900
1,2020-11,권선구,고색동,남,10대,보건위생,3,29000
2,2020-11,권선구,고색동,남,10대,서적문구,1,5600
3,2020-11,권선구,고색동,남,10대,약국,1,13600
4,2020-11,권선구,고색동,남,10대,유통업영리,70,2160910


- 파생변수 생성

In [50]:
# Derived column: payment amount divided by payment count, rounded to
# two decimal places.
df['건당 평균결제금액'] = df['결제금액'].div(df['결제건수']).round(2)
df.head()

Unnamed: 0,기준년월,시군구명,읍면동명,성별,연령대,업종명,결제건수,결제금액,건당 평균결제금액
0,2020-11,권선구,고색동,남,10대,레져업소,20,106900,5345.0
1,2020-11,권선구,고색동,남,10대,보건위생,3,29000,9666.67
2,2020-11,권선구,고색동,남,10대,서적문구,1,5600,5600.0
3,2020-11,권선구,고색동,남,10대,약국,1,13600,13600.0
4,2020-11,권선구,고색동,남,10대,유통업영리,70,2160910,30870.14


In [101]:
# Print each column's dtype and non-null count to check types and missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128650 entries, 0 to 128649
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   기준년월       128650 non-null  object 
 1   시군구명       128650 non-null  object 
 2   읍면동명       128650 non-null  object 
 3   성별         128650 non-null  object 
 4   연령대        128650 non-null  object 
 5   업종명        128650 non-null  object 
 6   결제건수       128650 non-null  int64  
 7   결제금액       128650 non-null  int64  
 8   건당 평균결제금액  128650 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 8.8+ MB


- 각 열의 데이터 타입(data type)은 올바르게 지정되어 있음

In [51]:
# (number of rows, number of columns) of the working frame.
df.shape

(128650, 9)

In [52]:
# include='all' reports unique/top/freq for the text columns alongside the
# usual numeric summary statistics.
df.describe(include='all')

Unnamed: 0,기준년월,시군구명,읍면동명,성별,연령대,업종명,결제건수,결제금액,건당 평균결제금액
count,128650,128650,128650,128650,128650,128650,128650.0,128650.0,128650.0
unique,20,4,56,2,7,34,,,
top,2020-05,팔달구,영통동,여,20대,일반·휴게음식,,,
freq,10752,41216,4458,64568,28124,11951,,,
mean,,,,,,,91.834046,1796752.0,47293.06
std,,,,,,,373.815329,6812299.0,82970.34
min,,,,,,,1.0,10.0,10.0
25%,,,,,,,2.0,47000.0,10200.45
50%,,,,,,,7.0,216935.0,19360.38
75%,,,,,,,32.0,939827.5,44712.28


- 수원시는 4개의 구와 56개의 읍면동으로 구성되어있음을 알 수 있음
- 성별은 남,여 구분
- 연령대는 10세 미만, 10대, 20대, …, 50대, 60대 이상의 7개 구간으로 범주화
- 업종은 총 34개로 구분
- 기준년월은 2019-04부터 2020-11까지 총 20개월

## 정보 획득

In [133]:
# Distinct district names, each kept at the row label of its first occurrence.
df['시군구명'].drop_duplicates()

0       권선구
2600    영통구
4265    장안구
6307    팔달구
Name: 시군구명, dtype: object

- 수원시는 권선구, 영통구, 장안구, 팔달구 4개의 구가 존재

In [129]:
# Total number of payments per age group.
# The count column is selected *before* aggregating. Summing the whole frame
# relied on older pandas silently discarding the text columns; on
# pandas >= 2.0 (numeric_only defaults to False) those columns are aggregated
# as well and would remain in the result. It also avoided summing the per-row
# average column, whose total was computed only to be dropped.
df1 = df.groupby('연령대')[['결제건수']].sum()
df1

Unnamed: 0_level_0,결제건수
연령대,Unnamed: 1_level_1
10대,171604
10세미만,1095
20대,3446492
30대,2461197
40대,3196420
50대,1871319
60대이상,666323


- 연령대별 결제건수

In [57]:
# Number of rows for every (month, business category) pair.
# count() tallies non-null values column by column; df.info() reports no
# nulls, which is why every column in the result shows the same figure.
df.groupby(['기준년월','업종명']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,시군구명,읍면동명,성별,연령대,결제건수,결제금액,건당 평균결제금액
기준년월,업종명,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04,건강식품,5,5,5,5,5,5,5
2019-04,건축자재,3,3,3,3,3,3,3
2019-04,광학제품,13,13,13,13,13,13,13
2019-04,기타,2,2,2,2,2,2,2
2019-04,기타의료기관,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...
2020-11,전기제품,109,109,109,109,109,109,109
2020-11,주방용구,92,92,92,92,92,92,92
2020-11,직물,170,170,170,170,170,170,170
2020-11,학원,388,388,388,388,388,388,388
