> ## koweps 데이터분석
> ###   목   차
> -------------------
> - 01_preprocessing
> - 02_incomebysex
> - 03_incomebyage 
> - 04_incomebyageg  
> - 05_incomebysexbyageg
> - 06_preprocessing_01 
> - 06_occupation_preprocessing 
> - 07_jobfrequencebyjobbysex
> - 08_divorcereligionbyrate 
> - 08_divorcereligionbyratebyageg
> - 09_ratiobyagegbyregion


In [None]:
library(tidyverse) 
library(sqldf)
library(foreign)

In [None]:
# Read the Koweps Stata file and replace the raw variable codes used in
# the later cells with readable column names.
welfare <- rename(
  read.dta("../input/welfare/Koweps_hpc10_2015_beta3.dta"),
  sex = h10_g3,
  birth = h10_g4,
  marriage = h10_g10,
  income = p1002_8aq1,
  religion = h10_g11,
  code_job = h10_eco9,
  code_region = h10_reg7
)


In [None]:
# Recode the raw survey columns into analysis-ready variables and keep
# only the columns used by the later sections.
#
# The result is assigned back with `<-`: the compound pipe `%<>%` belongs
# to magrittr, which library(tidyverse) does not attach. Columns are
# referenced by bare name inside mutate() instead of through `welfare$`,
# so every step works on the data flowing through the pipe.
welfare <- welfare %>%
  mutate(
    # Code 1 becomes "male"; every other non-missing code becomes "female".
    sex = ifelse(sex == 1, "male", "female"),
    # An income of 0 is treated as missing.
    income = ifelse(income == 0, NA, income),
    # Age relative to 2015, counting the birth year as age 1.
    age = 2015 - birth + 1,
    # Under 30 -> "young", 30 to 59 -> "middle", 60 and over -> "old".
    ageg = ifelse(age < 30, "young",
                  ifelse(age < 60, "middle", "old")),
    # Job codes are stored as text; 3-character codes get a leading "0".
    code_job = as.character(code_job),
    code_job = ifelse(str_length(code_job) == 3,
                      str_c("0", code_job),
                      code_job),
    # Code 1 becomes "yes"; every other non-missing code becomes "no".
    religion = ifelse(religion == 1, "yes", "no"),
    # Only marriage codes 1 and 3 are labelled; all others become NA.
    group_marriage = ifelse(marriage == 1, "marriage",
                            ifelse(marriage == 3, "divorce", NA))
  ) %>%
  select(
    sex,
    birth,
    age,
    ageg,
    marriage,
    religion,
    group_marriage,
    income,
    code_job,
    code_region
  )


In [None]:
# Lookup table pairing each of the seven region codes with its display
# name; joined to the survey data on `code_region`.
region7 <- data.frame(
  code_region = c(1, 2, 3, 4, 5, 6, 7),
  region = c(
    "서울",
    "수도권(인천/경기)",
    "부산/경남/울산",
    "대구/경북",
    "대전/충남",
    "강원/충북 ",
    "광주/전남/전북/제주도"
  ),
  stringsAsFactors = FALSE
)



In [None]:
# Store both tables in a single R data file so a later cell can reload them.
save(list = c("welfare", "region7"), file = "welfare.rda")

In [None]:
# Export the preprocessed survey table as CSV.
# The file name previously read "welfrae.csv", a misspelling of "welfare".
write_csv(welfare, "welfare.csv")

In [None]:
# Export the region lookup table as CSV.
region7 %>%
  write_csv("region7.csv")

In [None]:
# Restore `welfare` and `region7` from the saved R data file.
load(file = "welfare.rda")

### 02. Income*by*sex

In [None]:


# Income by sex.
# Step 1: summary table of the mean income for each sex.
# na.rm = TRUE is required because `income` still contains NA values here.
incomebysex <- welfare %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income, na.rm = TRUE))

# Step 2: one bar per sex showing that mean.
ggplot(incomebysex, aes(x = sex, y = mean_income, fill = sex)) +
  geom_col()


### 03. income*by*age

#### 1) incomebyage
> -   나이별 평균 월급요약표


In [None]:
# Mean income at each age, computed only over rows with a recorded income.
incomebyage <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean = mean(income))

 
> - 라인그래프 (나이를 시계열자료로)

In [None]:
# Line chart of mean income against age.
incomebyage %>%
  ggplot(aes(x = age, y = mean)) +
  geom_line()

#### 04.income*by*ageg

> - incombyageg  
  young : ~ 29   
  middle : 30 ~ 59    
  old : 60 ~    

In [None]:
# Mean income per age group, using only rows with a recorded income.
incombyageg <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mean_income = mean(income))

# Bar chart of the group means; the x-axis limits set the display order.
ggplot(incombyageg, aes(x = ageg, y = mean_income, fill = ageg)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))

#### 05. income*by*sex*by*ageg

> - 연령대 및 성별 월급 평균표 

In [None]:
# Mean income for every age-group / sex combination, using only rows
# with a recorded income.
incomebysexbyageg <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))

> - 연령대 및 성별 월급 그래프

In [None]:
# Side-by-side bars per sex within each age group, ordered young to old.
ggplot(incomebysexbyageg, aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))

#### 06. occupation preprocessing 

#### 전처리
>- 코드 테이블(엑셀파일)과 조인해서....

In [None]:
library(readxl)
# Read the second sheet of the codebook workbook; later cells join it to
# the survey data on `code_job`.
occupation1 <- read_excel(
  "../input/codebook/Koweps_Codebook.xlsx",
  sheet = 2
)

>- code_job에 padding으로 "0" 추가

In [None]:
library(tidyverse)

In [None]:
# Left-pad the job codes with "0" to a width of 4 so they match the
# padded `code_job` values in `welfare`.
occupation1$code_job <- str_pad(
  occupation1$code_job,
  width = 4,
  side = "left",
  pad = "0"
)

In [None]:
# Save the padded job-code table for reuse.
save(list = "occupation1", file = "occupation.rda")

#### 06. income*by*job

> - 직업별 평균 월급 상위/하위 10 요약테이블

In [None]:
# Reload the job-code lookup table.
load(file = "occupation.rda")

In [None]:
# Mean income per job code, using only rows with a recorded income.
incomebyjob <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(code_job) %>%
  summarise(mean_income = mean(income))

# The ten job codes with the highest mean income, with codebook columns
# attached and tagged "top10".
top10 <- incomebyjob %>%
  left_join(occupation1, by = "code_job") %>%
  arrange(desc(mean_income)) %>%
  head(10) %>%
  mutate(rank = "top10")

# The last ten rows of the same descending ordering, tagged "bottom10".
bottom10 <- incomebyjob %>%
  left_join(occupation1, by = "code_job") %>%
  arrange(desc(mean_income)) %>%
  tail(10) %>%
  mutate(rank = "bottom10")

# Stack both groups into one table for plotting.
top_bottom10 <- bind_rows(top10, bottom10)


In [None]:
# Top 10: horizontal bars ordered by mean income.
top10_rows <- filter(top_bottom10, rank == "top10")
ggplot(top10_rows, aes(x = reorder(code_job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()

In [None]:
# Bottom 10: horizontal bars ordered by mean income.
bottom10_rows <- filter(top_bottom10, rank == "bottom10")
ggplot(bottom10_rows, aes(x = reorder(code_job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()

In [None]:

# Both groups in one figure, one facet row per rank with free scales.
# Alternative layout: facet_grid(cols = vars(rank), scales = "free")
ggplot(top_bottom10, aes(x = reorder(code_job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip() +
  facet_grid(rows = vars(rank), scales = "free")

#### 07. jobfrequence*by*job*by*sex

In [None]:
# Ten most common job codes among male respondents.
male_top10 <- welfare %>%
  filter(!is.na(code_job), sex == "male") %>%
  count(code_job) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  mutate(sex = "male")

# Ten most common job codes among female respondents.
female_top10 <- welfare %>%
  filter(!is.na(code_job), sex == "female") %>%
  count(code_job) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  mutate(sex = "female")

In [None]:


# Stack the male and female top-10 tables and attach the codebook columns.
jobfrequencebysex <- bind_rows(male_top10, female_top10) %>%
  left_join(occupation1, by = "code_job")


In [None]:
# Step 2: charts.
# Male top 10: horizontal bars ordered by head count.
male_rows <- filter(jobfrequencebysex, sex == "male")
ggplot(male_rows, aes(x = reorder(code_job, n), y = n, fill = sex)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_grid(rows = vars(sex), scales = "free") +
  labs(y = "종사인원", x = "직종")

In [None]:
# Female top 10: horizontal bars ordered by head count.
female_rows <- filter(jobfrequencebysex, sex == "female")
ggplot(female_rows, aes(x = reorder(code_job, n), y = n, fill = sex)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_grid(rows = vars(sex), scales = "free") +
  labs(y = "종사인원", x = "직종")


In [None]:
# Male and female top 10 together, one facet row per sex.
both_rows <- filter(jobfrequencebysex, sex %in% c("male", "female"))
ggplot(both_rows, aes(x = reorder(code_job, n), y = n, fill = sex)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_grid(rows = vars(sex), scales = "free") +
  labs(y = "종사인원", x = "직종")

#### 08. divorce*by*religion*by*rate 

In [None]:
# Divorce rate by religion (built with group_by).
divorceratiobyreligion1 <- welfare %>%
  filter(!is.na(group_marriage), !is.na(religion)) %>%
  group_by(religion, group_marriage) %>%
  # summarise() drops the innermost grouping (group_marriage), so the
  # result stays grouped by religion only.
  summarise(n = n()) %>%
  # sum(n) is therefore the per-religion total, repeated on each row.
  mutate(
    total = sum(n),
    pct = round(n / total * 100, 1)
  ) %>%
  filter(group_marriage == "divorce")


In [None]:

# One bar per religion value showing the divorce percentage.
ggplot(divorceratiobyreligion1, aes(x = religion, y = pct)) +
  geom_col()

In [None]:
    # Divorce rate by age group and religion. After summarise() the data
    # is still grouped by ageg and religion, so sum(n) is the total for
    # each age-group / religion pair.
    divorcebyreligionbyageg1 <- welfare %>%
        filter(!is.na(ageg), !is.na(religion), !is.na(group_marriage)) %>%
        group_by(ageg, religion, group_marriage) %>%
        summarise(n = n()) %>%
        mutate(
            total = sum(n),
            ratio = n / total * 100
        ) %>%
        filter(group_marriage == "divorce")

In [None]:
# Side-by-side bars per religion value within each age group, ordered
# young to old. The title previously read "Divoce", a misspelling of
# "Divorce".
divorcebyreligionbyageg1 %>%
  ggplot(aes(x = ageg, y = ratio, fill = religion)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  labs(x = "ageg", y = "ratio", title = "Divorce ratio by ageg by religion")
    

#### 09. ratio*by*ageg*by*region

In [None]:
# Percentage of each age group within every region, with the region name
# attached from the lookup table.
ratiobyagegbyregion1 <- welfare %>%
  filter(!is.na(code_region), !is.na(ageg)) %>%
  count(code_region, ageg) %>%
  group_by(code_region) %>%
  mutate(
    total = sum(n),
    ratio = n / total * 100
  ) %>%
  ungroup() %>%
  left_join(region7, by = "code_region") %>%
  select(region, ageg, ratio)



In [None]:
# Region names in descending code order, used as the axis order.
sort1 <- arrange(region7, desc(code_region))

# Stacked horizontal bars: age-group shares within each region.
ggplot(ratiobyagegbyregion1, aes(x = region, y = ratio, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = sort1$region)

In [None]:
# Order the regions by their share of "old" respondents.
#
# The earlier call factor(ageg, levels = 'old', 'middle', 'young') passed
# 'middle' as `labels` and 'young' as `exclude`, so it did not build the
# intended three-level factor. Only the "old" rows are kept anyway, so
# filtering first and sorting by ratio gives the same table directly.
sort2 <- ratiobyagegbyregion1 %>%
  filter(ageg == "old") %>%
  arrange(ratio)

# Stacked horizontal bars with the regions ordered by that share.
ratiobyagegbyregion1 %>%
  ggplot(aes(x = region, y = ratio, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = sort2$region)


In [None]:
# List the entries of the current working directory.
list.files(path = ".")

In [None]:
# Show the current working directory.
getwd()

In [None]:
# List the entries of the parent directory.
list.files(path = "../")

In [None]:
# List the directories found under the parent directory.
list.dirs(path = "../")

In [None]:
# dir() is an alias of list.files(): list the parent directory again.
dir(path = "../")

In [None]:
library(fs)

In [None]:
# fs equivalent of `ls` for the current, parent and input directories.
dir_ls(path = ".")
dir_ls(path = "../")
dir_ls(path = "../input")

In [None]:
# Detailed listing (like `ls -l`) of the current directory.
dir_info(path = ".")

In [None]:
# Detailed listing (like `ls -l`) of the parent directory.
dir_info(path = "../")

In [None]:
# Detailed listing (like `ls -l`) of the parent directory and everything
# below it. `recurse` is the current fs argument name; `recursive` is
# deprecated.
dir_info("../", recurse = TRUE)

In [None]:
# Detailed listing (like `ls -l`) of the filesystem root.
dir_info(path = "/")

In [None]:
# Detailed listing (like `ls -l`) of the /kaggle directory.
dir_info(path = "/kaggle")