## dplyr
* 데이터 전처리 작업은 동사로 정의된 각 함수를 통해서 이루어 짐
    + select    : 열 선택
    + filter    : 조건 검색
    + group_by  : 데이터 그룹화
    + mutate    : 행/열 추가
    + summrise  : 데이터 집계
    + arrange   : 데이터 정렬

* 이러한 함수들은 magrittr 패키지에 의해 구현된 파이프(>) 패러다임을 이용해서 체인형식으로 연산을 수행 할 수 있음
* 즉, 먼저 수행한 함수의 결과를 다음 실행할 함수의 입력으로 보낼 수 있음
    + sum(titanic, na.rm=T) # 기존방식
    + titanic %>% sum(na.rm=T) # 파이프 방식
* 파이프 기호(%>%)를 입력할 때는 단축키 ctrl + shift + M 를 사용함

In [2]:
library(dplyr)

In [3]:
load('sungjuk.rdata')

## arrange (데이터 정렬)

In [4]:
# 이름순으로 오름차순 정렬

sungjuk %>% arrange(name)

name,eng,mat,sci
<fct>,<dbl>,<dbl>,<dbl>
Jane,78,95,60
John,90,85,90
Mark,69,49,70
Tom,95,96,80


In [6]:
# 이름순으로 내림차순 정렬

sungjuk %>% arrange(desc(name))

name,eng,mat,sci
<fct>,<dbl>,<dbl>,<dbl>
Tom,95,96,80
Mark,69,49,70
John,90,85,90
Jane,78,95,60


In [7]:
# 영어/수학/과학 점수를 내림차순 정렬

sungjuk %>% arrange(eng, mat, sci)

name,eng,mat,sci
<fct>,<dbl>,<dbl>,<dbl>
Mark,69,49,70
Jane,78,95,60
John,90,85,90
Tom,95,96,80


## summarise (데이터 집계)

In [11]:
# 평균 영어 점수 출력

sungjuk %>% summarise(평균영어점수=mean(eng))

평균영어점수
<dbl>
83


In [13]:
# 각 과목별로 평균/최고/최소 점수 출력

sungjuk %>% summarise(평균영어점수=mean(eng), 평균수학점수=mean(mat), 평균과학점수=mean(sci),
            최고영어점수=max(eng), 최고수학점수=max(mat), 최고과학점수=max(sci),
            최소영어점수=min(eng), 최소수학점수=min(mat), 최소과학점수=min(sci))

평균영어점수,평균수학점수,평균과학점수,최고영어점수,최고수학점수,최고과학점수,최소영어점수,최소수학점수,최소과학점수
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
83,81.25,75,95,96,90,69,49,60


## group_by (데이터 그룹화)

In [15]:
# 새로운 컬럼 추가
sungjuk$class <- c(2,1,2,3)

In [19]:
# 클래스별 평균 영어점수 출력

sungjuk %>% group_by(class) %>% summarise(평균영어=mean(eng))

class,평균영어
<dbl>,<dbl>
1,95.0
2,79.5
3,78.0


In [20]:
# 클래스별 최고/최소 영어점수 출력
sungjuk %>% group_by(class) %>% summarise(max(eng), min(eng))

class,max(eng),min(eng)
<dbl>,<dbl>,<dbl>
1,95,95
2,90,69
3,78,78


In [21]:
# diamonds 데이터셋을 이용해서 다음 문제를 풀어보시오

library(ggplot2)

diamonds

carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.29,Premium,I,VS2,62.4,58,334,4.20,4.23,2.63
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.24,Very Good,J,VVS2,62.8,57,336,3.94,3.96,2.48
0.24,Very Good,I,VVS1,62.3,57,336,3.95,3.98,2.47
0.26,Very Good,H,SI1,61.9,55,337,4.07,4.11,2.53
0.22,Fair,E,VS2,65.1,61,337,3.87,3.78,2.49
0.23,Very Good,H,VS1,59.4,61,338,4.00,4.05,2.39


In [92]:
# carat과 price 변수 출력

head(diamonds[, c('carat','price')])

carat,price
<dbl>,<int>
0.23,326
0.21,326
0.23,327
0.29,334
0.31,335
0.24,336


In [93]:
head(diamonds[, c(1,7)])

carat,price
<dbl>,<int>
0.23,326
0.21,326
0.23,327
0.29,334
0.31,335
0.24,336


In [49]:
diamonds %>% select(carat, price) %>% head()

carat,price
<dbl>,<int>
0.23,326
0.21,326
0.23,327
0.29,334
0.31,335
0.24,336


In [53]:
diamonds %>% select(1, 7) %>% head() 

carat,price
<dbl>,<int>
0.23,326
0.21,326
0.23,327
0.29,334
0.31,335
0.24,336


In [94]:
head(select(diamonds, 1, 7) )

carat,price
<dbl>,<int>
0.23,326
0.21,326
0.23,327
0.29,334
0.31,335
0.24,336


In [95]:
# 변수들 중 carat, price 제외하고 모두 출력

# diamonds[, -c('carat','price')] # 오류발생!
head(diamonds[, -c(1,7)])

cut,color,clarity,depth,table,x,y,z
<ord>,<ord>,<ord>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Ideal,E,SI2,61.5,55,3.95,3.98,2.43
Premium,E,SI1,59.8,61,3.89,3.84,2.31
Good,E,VS1,56.9,65,4.05,4.07,2.31
Premium,I,VS2,62.4,58,4.2,4.23,2.63
Good,J,SI2,63.3,58,4.34,4.35,2.75
Very Good,J,VVS2,62.8,57,3.94,3.96,2.48


In [59]:
diamonds %>% select(-carat, -price) %>% head()
diamonds %>% select(-c(carat, price)) %>% head()

cut,color,clarity,depth,table,x,y,z
<ord>,<ord>,<ord>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Ideal,E,SI2,61.5,55,3.95,3.98,2.43
Premium,E,SI1,59.8,61,3.89,3.84,2.31
Good,E,VS1,56.9,65,4.05,4.07,2.31
Premium,I,VS2,62.4,58,4.2,4.23,2.63
Good,J,SI2,63.3,58,4.34,4.35,2.75
Very Good,J,VVS2,62.8,57,3.94,3.96,2.48


cut,color,clarity,depth,table,x,y,z
<ord>,<ord>,<ord>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Ideal,E,SI2,61.5,55,3.95,3.98,2.43
Premium,E,SI1,59.8,61,3.89,3.84,2.31
Good,E,VS1,56.9,65,4.05,4.07,2.31
Premium,I,VS2,62.4,58,4.2,4.23,2.63
Good,J,SI2,63.3,58,4.34,4.35,2.75
Very Good,J,VVS2,62.8,57,3.94,3.96,2.48


In [60]:
diamonds %>% select(-1, -7) %>% head()
diamonds %>% select(-c(1,7)) %>% head() 

cut,color,clarity,depth,table,x,y,z
<ord>,<ord>,<ord>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Ideal,E,SI2,61.5,55,3.95,3.98,2.43
Premium,E,SI1,59.8,61,3.89,3.84,2.31
Good,E,VS1,56.9,65,4.05,4.07,2.31
Premium,I,VS2,62.4,58,4.2,4.23,2.63
Good,J,SI2,63.3,58,4.34,4.35,2.75
Very Good,J,VVS2,62.8,57,3.94,3.96,2.48


cut,color,clarity,depth,table,x,y,z
<ord>,<ord>,<ord>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Ideal,E,SI2,61.5,55,3.95,3.98,2.43
Premium,E,SI1,59.8,61,3.89,3.84,2.31
Good,E,VS1,56.9,65,4.05,4.07,2.31
Premium,I,VS2,62.4,58,4.2,4.23,2.63
Good,J,SI2,63.3,58,4.34,4.35,2.75
Very Good,J,VVS2,62.8,57,3.94,3.96,2.48


In [96]:
# select(diamonds, carat, price) 
# select(diamonds, 1, 7) 


head(select(diamonds, -c(carat, price)))
head(select(diamonds, -c(1, 7))) 

cut,color,clarity,depth,table,x,y,z
<ord>,<ord>,<ord>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Ideal,E,SI2,61.5,55,3.95,3.98,2.43
Premium,E,SI1,59.8,61,3.89,3.84,2.31
Good,E,VS1,56.9,65,4.05,4.07,2.31
Premium,I,VS2,62.4,58,4.2,4.23,2.63
Good,J,SI2,63.3,58,4.34,4.35,2.75
Very Good,J,VVS2,62.8,57,3.94,3.96,2.48


cut,color,clarity,depth,table,x,y,z
<ord>,<ord>,<ord>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Ideal,E,SI2,61.5,55,3.95,3.98,2.43
Premium,E,SI1,59.8,61,3.89,3.84,2.31
Good,E,VS1,56.9,65,4.05,4.07,2.31
Premium,I,VS2,62.4,58,4.2,4.23,2.63
Good,J,SI2,63.3,58,4.34,4.35,2.75
Very Good,J,VVS2,62.8,57,3.94,3.96,2.48


In [70]:
# 변수명중 c로 시작하는 것만 출력 : starts_with
# diamonds[, starts_with('c')]  # 실행오류!

diamonds %>% select(starts_with('c')) %>% head()

head(select(diamonds, starts_with('c')))

carat,cut,color,clarity
<dbl>,<ord>,<ord>,<ord>
0.23,Ideal,E,SI2
0.21,Premium,E,SI1
0.23,Good,E,VS1
0.29,Premium,I,VS2
0.31,Good,J,SI2
0.24,Very Good,J,VVS2


carat,cut,color,clarity
<dbl>,<ord>,<ord>,<ord>
0.23,Ideal,E,SI2
0.21,Premium,E,SI1
0.23,Good,E,VS1
0.29,Premium,I,VS2
0.31,Good,J,SI2
0.24,Very Good,J,VVS2


In [75]:
# 변수명중 e로 끝나는 것만 출력 : ends_with

diamonds %>% select(ends_with('e')) %>% head()

head(select(diamonds, ends_with('e')))

table,price
<dbl>,<int>
55,326
61,326
65,327
58,334
58,335
57,336


table,price
<dbl>,<int>
55,326
61,326
65,327
58,334
58,335
57,336


In [76]:
# 변수명중 l을 포함하는 것만 출력: contains/matches

diamonds %>% select(contains('l')) %>% head()

head(select(diamonds, contains('l')))

color,clarity,table
<ord>,<ord>,<dbl>
E,SI2,55
E,SI1,61
E,VS1,65
I,VS2,58
J,SI2,58
J,VVS2,57


color,clarity,table
<ord>,<ord>,<dbl>
E,SI2,55
E,SI1,61
E,VS1,65
I,VS2,58
J,SI2,58
J,VVS2,57


In [87]:
# 변수명중 r과 t를 포함한 것만 출력

diamonds %>% select(contains('r') & contains('t')) %>% head()


diamonds %>% select(matches('r.t')) %>% head()


diamonds %>% select(matches('r.+t')) %>% head()

carat,clarity
<dbl>,<ord>
0.23,SI2
0.21,SI1
0.23,VS1
0.29,VS2
0.31,SI2
0.24,VVS2


carat,clarity
<dbl>,<ord>
0.23,SI2
0.21,SI1
0.23,VS1
0.29,VS2
0.31,SI2
0.24,VVS2


carat,clarity
<dbl>,<ord>
0.23,SI2
0.21,SI1
0.23,VS1
0.29,VS2
0.31,SI2
0.24,VVS2


In [91]:
# cut 변수 값 중 'Ideal' 인 데이터 조회

head(diamonds[diamonds$cut == 'Ideal',])

diamonds %>% filter(cut == 'Ideal') %>% head()

head(filter(diamonds, cut == 'Ideal'))

carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.23,Ideal,J,VS1,62.8,56,340,3.93,3.9,2.46
0.31,Ideal,J,SI2,62.2,54,344,4.35,4.37,2.71
0.3,Ideal,I,SI2,62.0,54,348,4.31,4.34,2.68
0.33,Ideal,I,SI2,61.8,55,403,4.49,4.51,2.78
0.33,Ideal,I,SI2,61.2,56,403,4.49,4.5,2.75


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.23,Ideal,J,VS1,62.8,56,340,3.93,3.9,2.46
0.31,Ideal,J,SI2,62.2,54,344,4.35,4.37,2.71
0.3,Ideal,I,SI2,62.0,54,348,4.31,4.34,2.68
0.33,Ideal,I,SI2,61.8,55,403,4.49,4.51,2.78
0.33,Ideal,I,SI2,61.2,56,403,4.49,4.5,2.75


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.23,Ideal,J,VS1,62.8,56,340,3.93,3.9,2.46
0.31,Ideal,J,SI2,62.2,54,344,4.35,4.37,2.71
0.3,Ideal,I,SI2,62.0,54,348,4.31,4.34,2.68
0.33,Ideal,I,SI2,61.8,55,403,4.49,4.51,2.78
0.33,Ideal,I,SI2,61.2,56,403,4.49,4.5,2.75


In [100]:
# price 가 1000 이상인 데이터 조회

head(diamonds[diamonds$price >= 1000,])

diamonds %>% filter(price>= 1000) %>% head()

head(filter(diamonds, price >= 1000))

carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.7,Ideal,E,SI1,62.5,57,2757,5.7,5.72,3.57
0.86,Fair,E,SI2,55.1,69,2757,6.45,6.33,3.52
0.7,Ideal,G,VS2,61.6,56,2757,5.7,5.67,3.5
0.71,Very Good,E,VS2,62.4,57,2759,5.68,5.73,3.56
0.78,Very Good,G,SI2,63.8,56,2759,5.81,5.85,3.72
0.7,Good,E,VS2,57.5,58,2759,5.85,5.9,3.38


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.7,Ideal,E,SI1,62.5,57,2757,5.7,5.72,3.57
0.86,Fair,E,SI2,55.1,69,2757,6.45,6.33,3.52
0.7,Ideal,G,VS2,61.6,56,2757,5.7,5.67,3.5
0.71,Very Good,E,VS2,62.4,57,2759,5.68,5.73,3.56
0.78,Very Good,G,SI2,63.8,56,2759,5.81,5.85,3.72
0.7,Good,E,VS2,57.5,58,2759,5.85,5.9,3.38


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.7,Ideal,E,SI1,62.5,57,2757,5.7,5.72,3.57
0.86,Fair,E,SI2,55.1,69,2757,6.45,6.33,3.52
0.7,Ideal,G,VS2,61.6,56,2757,5.7,5.67,3.5
0.71,Very Good,E,VS2,62.4,57,2759,5.68,5.73,3.56
0.78,Very Good,G,SI2,63.8,56,2759,5.81,5.85,3.72
0.7,Good,E,VS2,57.5,58,2759,5.85,5.9,3.38


In [105]:
# carat이 2이상, price 가 14000 이하인 데이터 조회

head(diamonds[diamonds$carat >= 2 & diamonds$price <= 14000,])


diamonds %>% filter(carat >= 2 , price <= 14000) %>% head()


head(filter(diamonds, carat >=2 & price <= 14000))

carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
2.0,Premium,J,I1,61.5,59,5051,8.11,8.06,4.97
2.06,Premium,J,I1,61.2,58,5203,8.1,8.07,4.95
2.14,Fair,J,I1,69.4,57,5405,7.74,7.7,5.36
2.15,Fair,J,I1,65.5,57,5430,8.01,7.95,5.23
2.22,Fair,J,I1,66.7,56,5607,8.04,8.02,5.36
2.0,Fair,I,I1,66.0,60,5667,7.78,7.74,5.1


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
2.0,Premium,J,I1,61.5,59,5051,8.11,8.06,4.97
2.06,Premium,J,I1,61.2,58,5203,8.1,8.07,4.95
2.14,Fair,J,I1,69.4,57,5405,7.74,7.7,5.36
2.15,Fair,J,I1,65.5,57,5430,8.01,7.95,5.23
2.22,Fair,J,I1,66.7,56,5607,8.04,8.02,5.36
2.0,Fair,I,I1,66.0,60,5667,7.78,7.74,5.1


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
2.0,Premium,J,I1,61.5,59,5051,8.11,8.06,4.97
2.06,Premium,J,I1,61.2,58,5203,8.1,8.07,4.95
2.14,Fair,J,I1,69.4,57,5405,7.74,7.7,5.36
2.15,Fair,J,I1,65.5,57,5430,8.01,7.95,5.23
2.22,Fair,J,I1,66.7,56,5607,8.04,8.02,5.36
2.0,Fair,I,I1,66.0,60,5667,7.78,7.74,5.1


In [134]:
# carat이 1초과, 5 미만인 데이터 조회


head(diamonds[diamonds$carat >= 1 | diamonds$carat > 5,])


diamonds %>% filter(carat > 1 | carat > 5) %>% head()


head(filter(diamonds, carat > 1 | carat > 5))

carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1.17,Very Good,J,I1,60.2,61,2774,6.83,6.9,4.13
1.01,Premium,F,I1,61.8,60,2781,6.39,6.36,3.94
1.01,Fair,E,I1,64.5,58,2788,6.29,6.21,4.03
1.01,Premium,H,SI2,62.7,59,2788,6.31,6.22,3.93
1.05,Very Good,J,SI2,63.2,56,2789,6.49,6.45,4.09
1.05,Fair,J,SI2,65.8,59,2789,6.41,6.27,4.18


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1.17,Very Good,J,I1,60.2,61,2774,6.83,6.9,4.13
1.01,Premium,F,I1,61.8,60,2781,6.39,6.36,3.94
1.01,Fair,E,I1,64.5,58,2788,6.29,6.21,4.03
1.01,Premium,H,SI2,62.7,59,2788,6.31,6.22,3.93
1.05,Very Good,J,SI2,63.2,56,2789,6.49,6.45,4.09
1.05,Fair,J,SI2,65.8,59,2789,6.41,6.27,4.18


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1.17,Very Good,J,I1,60.2,61,2774,6.83,6.9,4.13
1.01,Premium,F,I1,61.8,60,2781,6.39,6.36,3.94
1.01,Fair,E,I1,64.5,58,2788,6.29,6.21,4.03
1.01,Premium,H,SI2,62.7,59,2788,6.31,6.22,3.93
1.05,Very Good,J,SI2,63.2,56,2789,6.49,6.45,4.09
1.05,Fair,J,SI2,65.8,59,2789,6.41,6.27,4.18


In [135]:
# carat이 1미만이거나, 5초과인 데이터 조회

head(diamonds[diamonds$carat < 1 | diamonds$carat > 5,])


diamonds %>% filter(carat < 1 | carat > 5) %>% head()


head(filter(diamonds, carat < 1 | carat > 5))

carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.24,Very Good,J,VVS2,62.8,57,336,3.94,3.96,2.48


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.24,Very Good,J,VVS2,62.8,57,336,3.94,3.96,2.48


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.24,Very Good,J,VVS2,62.8,57,336,3.94,3.96,2.48


In [128]:
# cut이 'Ideal'이거나 'Good'인 데이터 조회


head(diamonds[diamonds$cut =='Ideal' | diamonds$cut == 'Good',])

head(diamonds[diamonds$cut %in% c('Ideal','Good'),])


diamonds %>% filter(cut == 'Ideal' | cut == 'Good') %>% head()


diamonds %>% filter(cut %in% c('Ideal','Good')) %>% head()

carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.3,Good,J,SI1,64.0,55,339,4.25,4.28,2.73
0.23,Ideal,J,VS1,62.8,56,340,3.93,3.9,2.46
0.31,Ideal,J,SI2,62.2,54,344,4.35,4.37,2.71


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.3,Good,J,SI1,64.0,55,339,4.25,4.28,2.73
0.23,Ideal,J,VS1,62.8,56,340,3.93,3.9,2.46
0.31,Ideal,J,SI2,62.2,54,344,4.35,4.37,2.71


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.3,Good,J,SI1,64.0,55,339,4.25,4.28,2.73
0.23,Ideal,J,VS1,62.8,56,340,3.93,3.9,2.46
0.31,Ideal,J,SI2,62.2,54,344,4.35,4.37,2.71


carat,cut,color,clarity,depth,table,price,x,y,z
<dbl>,<ord>,<ord>,<ord>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
0.3,Good,J,SI1,64.0,55,339,4.25,4.28,2.73
0.23,Ideal,J,VS1,62.8,56,340,3.93,3.9,2.46
0.31,Ideal,J,SI2,62.2,54,344,4.35,4.37,2.71


In [138]:
# price를 carat으로 나눈 것을 Ratio라는 컬럼으로
# Ratio 컬럼을 제곱한것을 Double이라는 컬럼으로 생성

dia <- diamonds
dia$Ratio <- dia$price / dia$carat
dia$Double <- dia$Ratio **2

head(dia[,c('carat','price','Ratio','Double')])

carat,price,Ratio,Double
<dbl>,<int>,<dbl>,<dbl>
0.23,326,1417.391,2008998
0.21,326,1552.381,2409887
0.23,327,1421.739,2021342
0.29,334,1151.724,1326468
0.31,335,1080.645,1167794
0.24,336,1400.0,1960000


In [139]:
dia <- diamonds

dia %>% mutate(Ratio = price / carat, Double = Ratio **2) %>% select('carat','price','Ratio','Double') %>% head()

carat,price,Ratio,Double
<dbl>,<int>,<dbl>,<dbl>
0.23,326,1417.391,2008998
0.21,326,1552.381,2409887
0.23,327,1421.739,2021342
0.29,334,1151.724,1326468
0.31,335,1080.645,1167794
0.24,336,1400.0,1960000


In [160]:
# 평균 가격price와 price의 중앙값, 평균 carat을 집계


c1 <- mean(diamonds$price)
c2 <- median(diamonds$price)
c3 <- mean(diamonds$carat)


# 여러컬럼을 하나로 묶음 : cbind, data.frame

df <- data.frame(c1,c2,c3)

colnames(df) = c('avgPrice', 'medPrice', 'avgCarat')

df

avgPrice,medPrice,avgCarat
<dbl>,<dbl>,<dbl>
3932.8,2401,0.7979397


In [146]:
diamonds %>% summarise(avgPrice=mean(price), medPrice=median(price), avgCarat=mean(carat))

avgPrice,medPrice,avgCarat
<dbl>,<dbl>,<dbl>
3932.8,2401,0.7979397


## 집계처리 : aggregate
+ aggregate(집계대상컬럼, by=그룹대상컬럼, 집계함수)
+ aggregate(인자식, 데이터, 집계함수)
+ 집계시 사용하는 인자식 예 : a ~ b + c
  - a : 집계하고자 하는 대상
  - b : 그룹으로 묶을 기준1
  - c : 그룹으로 묶을 기준2

In [162]:
# cut 기준으로 평균 price를 집계

aggregate(diamonds$price, by=list(diamonds$cut), FUN=mean)

Group.1,x
<ord>,<dbl>
Fair,4358.758
Good,3928.864
Very Good,3981.76
Premium,4584.258
Ideal,3457.542


In [166]:
aggregate(price ~ cut, diamonds, FUN=mean)

cut,price
<ord>,<dbl>
Fair,4358.758
Good,3928.864
Very Good,3981.76
Premium,4584.258
Ideal,3457.542


In [169]:
diamonds %>% group_by(cut) %>% summarise(avgPrice=mean(price))

cut,avgPrice
<ord>,<dbl>
Fair,4358.758
Good,3928.864
Very Good,3981.76
Premium,4584.258
Ideal,3457.542


In [177]:
# cut, color 기준으로 평균 가격을 avgPrice로, 총 carat을 sumCarat이라는 이름으로 집계

head(aggregate(price ~ cut + color, diamonds, FUN=mean))
head(aggregate(carat ~ cut + color, diamonds, FUN=sum))

Unnamed: 0_level_0,cut,color,price
Unnamed: 0_level_1,<ord>,<ord>,<dbl>
1,Fair,D,4291.061
2,Good,D,3405.382
3,Very Good,D,3470.467
4,Premium,D,3631.293
5,Ideal,D,2629.095
6,Fair,E,3682.312


Unnamed: 0_level_0,cut,color,carat
Unnamed: 0_level_1,<ord>,<ord>,<dbl>
1,Fair,D,149.98
2,Good,D,492.87
3,Very Good,D,1053.69
4,Premium,D,1156.64
5,Ideal,D,1603.38
6,Fair,E,191.88


In [183]:
# aggregate함수에 집계함수를 2개 이상 사용하려면
# plyr 패키지의 each 함수를 사용해야함
# 단, 하나의 집계대상 컬럼에만 적용가능
#install.packages('plyr')

library(plyr)

aggregate(carat ~ cut + color, diamonds, each(mean, sum))

cut,color,carat
<ord>,<ord>,"<dbl[,2]>"
Fair,D,"0.9201227, 149.98"
Good,D,"0.7445166, 492.87"
Very Good,D,"0.6964243, 1053.69"
Premium,D,"0.7215471, 1156.64"
Ideal,D,"0.5657657, 1603.38"
Fair,E,"0.8566071, 191.88"
Good,E,"0.7451340, 695.21"
Very Good,E,"0.6763167, 1623.16"
Premium,E,"0.7177450, 1677.37"
Ideal,E,"0.5784012, 2257.50"


In [190]:
diamonds %>% group_by(cut, color) %>% summarise(avgPrice=mean(price), sumCarat=sum(carat))

avgPrice,sumCarat
<dbl>,<dbl>
3932.8,43040.87


In [210]:
# plyr 패키지를 메모리에서 제거
#detach('package:plyr', unload=T)

In [208]:
# color 기준으로 평균 가격을 avgPrice로, 총 carat을 sumCarat이라는 이름으로 집계하고 평균가격으로 내림차순 정렬
dia <- diamonds
avgPrice <- aggregate(price ~ color,dia,FUN=mean)
avgPrice 

sumCarat <- aggregate(carat ~ color,dia,FUN=sum)
sumCarat 

df <- data.frame(avgPrice, carat=(sumCarat$carat))
colnames(df) <- c('color','avgPrice','sumCarat')
df

color,price
<ord>,<dbl>
D,3169.954
E,3076.752
F,3724.886
G,3999.136
H,4486.669
I,5091.875
J,5323.818


color,carat
<ord>,<dbl>
D,4456.56
E,6445.12
F,7028.05
G,8708.28
H,7571.58
I,5568.0
J,3263.28


color,avgPrice,sumCarat
<ord>,<dbl>,<dbl>
D,3169.954,4456.56
E,3076.752,6445.12
F,3724.886,7028.05
G,3999.136,8708.28
H,4486.669,7571.58
I,5091.875,5568.0
J,5323.818,3263.28


In [213]:
df[order(-df$avgPrice), ]  # 정렬 출력

Unnamed: 0_level_0,color,avgPrice,sumCarat
Unnamed: 0_level_1,<ord>,<dbl>,<dbl>
7,J,5323.818,3263.28
6,I,5091.875,5568.0
5,H,4486.669,7571.58
4,G,3999.136,8708.28
3,F,3724.886,7028.05
1,D,3169.954,4456.56
2,E,3076.752,6445.12


In [215]:
diamonds %>% group_by(color) %>% summarise(avgPrice=mean(price), sumCarat=sum(carat)) %>% arrange(desc(avgPrice))

color,avgPrice,sumCarat
<ord>,<dbl>,<dbl>
J,5323.818,3263.28
I,5091.875,5568.0
H,4486.669,7571.58
G,3999.136,8708.28
F,3724.886,7028.05
D,3169.954,4456.56
E,3076.752,6445.12


In [219]:
# color 기준으로 평균 가격을 avgPrice로, 총 carat을 sumCarat이라는 이름으로 집계하고,
# 평균가격은 내림차순 정렬, 총 carat은 오름차순으로 정렬


df[ order(-df$avgPrice, df$sumCarat),  ]


diamonds %>% group_by(color) %>% summarise(avgPrice=mean(price), sumCarat=sum(carat)) %>% arrange(desc(avgPrice), sumCarat)


Unnamed: 0_level_0,color,avgPrice,sumCarat
Unnamed: 0_level_1,<ord>,<dbl>,<dbl>
7,J,5323.818,3263.28
6,I,5091.875,5568.0
5,H,4486.669,7571.58
4,G,3999.136,8708.28
3,F,3724.886,7028.05
1,D,3169.954,4456.56
2,E,3076.752,6445.12


color,avgPrice,sumCarat
<ord>,<dbl>,<dbl>
J,5323.818,3263.28
I,5091.875,5568.0
H,4486.669,7571.58
G,3999.136,8708.28
F,3724.886,7028.05
D,3169.954,4456.56
E,3076.752,6445.12
