# Dplyr: Data Cleaning

## 1. Common techniques

### 1.1. Selection of variables

In [1]:
library(dplyr)

"package 'dplyr' was built under R version 3.6.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [2]:
# load the car dataset from CSV and preview the first three rows
mtcars <- read.csv(file='data/mtcars.csv')
head(mtcars, n=3)

Unnamed: 0_level_0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
1,Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
2,Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
3,Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1


In [3]:
# drop the 2nd and 3rd columns (selected by position), then preview
mtcars %>%
    select(-c(2, 3)) %>%
    head(n=3)

Unnamed: 0_level_0,model,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
1,Mazda RX4,160,110,3.9,2.62,16.46,0,1,4,4
2,Mazda RX4 Wag,160,110,3.9,2.875,17.02,0,1,4,4
3,Datsun 710,108,93,3.85,2.32,18.61,1,1,4,1


In [4]:
# drop every column whose name contains 'c', then preview
mtcars %>%
    select(-contains('c')) %>%
    head(n=3)

Unnamed: 0_level_0,model,mpg,disp,hp,drat,wt,vs,am,gear
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<int>
1,Mazda RX4,21.0,160,110,3.9,2.62,0,1,4
2,Mazda RX4 Wag,21.0,160,110,3.9,2.875,0,1,4
3,Datsun 710,22.8,108,93,3.85,2.32,1,1,4


### 1.2. Renaming columns

In [5]:
library(dplyr)

In [6]:
# reload the car dataset so this section starts from the raw columns
mtcars <- read.csv(file='data/mtcars.csv')
head(mtcars, n=3)

Unnamed: 0_level_0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
1,Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
2,Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
3,Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1


In [7]:
# rename() takes new_name=old_name pairs; column positions are unchanged
mtcars %>%
    rename(cylinders=cyl, weight=wt) %>%
    head(n=3)

Unnamed: 0_level_0,model,mpg,cylinders,disp,hp,drat,weight,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
1,Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
2,Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
3,Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1


### 1.3. Standardizing data types

In [8]:
library(dplyr)

In [9]:
# raw columns: dates are stored as yyyymmdd numbers, handedness as 0/1 flags
year <- rep(c(2019, 2020), times=c(2, 4))
date <- c(20191103, 20190812, 20200125, 20200129, 20200412, 20200220)
medal <- c('Gold', 'Bronze', 'Silver', 'Bronze', 'Silver', 'Silver')
name <- c('Wayne', 'Robert', 'Ashley', 'Jamie', 'Jessie', 'Sergio')
left_handed <- c(1, 0, 0, 0, 1, 0)

athletes <- data.frame(
    year=year,
    date=date,
    medal=medal,
    name=name,
    left_handed=left_handed
)
athletes

year,date,medal,name,left_handed
<dbl>,<dbl>,<fct>,<fct>,<dbl>
2019,20191103,Gold,Wayne,1
2019,20190812,Bronze,Robert,0
2020,20200125,Silver,Ashley,0
2020,20200129,Bronze,Jamie,0
2020,20200412,Silver,Jessie,1
2020,20200220,Silver,Sergio,0


In [10]:
# yyyymmdd number -> string -> Date
as.Date(as.character(athletes$date), format='%Y%m%d')

In [11]:
# 0/1 flag -> FALSE/TRUE
as.logical(athletes$left_handed)

In [12]:
# explicit level order so that Bronze sorts before Silver before Gold
factor(athletes$medal, levels=c('Bronze', 'Silver', 'Gold'))

In [13]:
# modify the dataframe: apply the three conversions in a single mutate()
athletes <- athletes %>%
    mutate(
        date=as.Date(as.character(date), format='%Y%m%d'),
        left_handed=as.logical(left_handed),
        medal=factor(medal, levels=c('Bronze', 'Silver', 'Gold'))
    )

In [14]:
# rows are ordered by the factor levels of medal, not alphabetically
arrange(athletes, medal)

year,date,medal,name,left_handed
<dbl>,<date>,<fct>,<fct>,<lgl>
2019,2019-08-12,Bronze,Robert,False
2020,2020-01-29,Bronze,Jamie,False
2020,2020-01-25,Silver,Ashley,False
2020,2020-04-12,Silver,Jessie,True
2020,2020-02-20,Silver,Sergio,False
2019,2019-11-03,Gold,Wayne,True


### 1.4. Map

In [15]:
library(purrr)

"package 'purrr' was built under R version 3.6.3"


In [16]:
# ids are kept as strings so the leading zeros are not lost
student_id <- c('010001', '030001', '070001', '080001', '110001', '120001')
grade <- c(1, 3, 7, 8, 11, 12)
gender <- c('Male', 'Female', 'Female', 'Female', 'Male', 'Female')

students <- data.frame(
    student_id=student_id,
    grade=grade,
    gender=gender
)
students

student_id,grade,gender
<fct>,<dbl>,<fct>
10001,1,Male
30001,3,Female
70001,7,Female
80001,8,Female
110001,11,Male
120001,12,Female


In [17]:
# Map a single school grade to its education stage.
#
# grade: one number (not a vector); apply over a column with map()/map_chr().
# Returns 'Primary' for [1, 6), 'Secondary' for [6, 10), 'High' for [10, 13)
# and NA_character_ for a missing or out-of-range grade, so the result is
# always a length-1 character vector.
stage <- function(grade){
    stopifnot(length(grade) == 1)
    if (is.na(grade))
        return(NA_character_)
    # `&&` is the scalar operator intended for `if` conditions
    if (grade >= 1 && grade < 6)
        return('Primary')
    if (grade >= 6 && grade < 10)
        return('Secondary')
    if (grade >= 10 && grade < 13)
        return('High')
    NA_character_
}

In [18]:
# stage() yields one string per grade, so map_chr() returns a plain character
# vector (map() would wrap every result in a list element)
students$grade %>% map_chr(stage)

## 2. Handling abnormal data

### 2.1. Missing data

In [19]:
library(tidyr)

"package 'tidyr' was built under R version 3.6.3"


In [20]:
# NA in `recovered` marks countries with no reported figure
country <- c(
    'US', 'Russia', 'Brazil', 'UK', 'Spain',
    'Italy', 'France', 'Germany', 'Turkey', 'Iran'
)
comfirmed <- c(1576, 317, 310, 252, 233, 228, 181, 179, 153, 129)
deaths <- c(94, 3, 20, 36, 27, 32, 28, 8, 4, 7)
recovered <- c(298, NA, 125, NA, 150, 134, NA, NA, NA, NA)

covid <- data.frame(
    country=country,
    comfirmed=comfirmed,
    deaths=deaths,
    recovered=recovered
)
covid

country,comfirmed,deaths,recovered
<fct>,<dbl>,<dbl>,<dbl>
US,1576,94,298.0
Russia,317,3,
Brazil,310,20,125.0
UK,252,36,
Spain,233,27,150.0
Italy,228,32,134.0
France,181,28,
Germany,179,8,
Turkey,153,4,
Iran,129,7,


#### Number of missing data

In [21]:
# number of missing values in each column
colSums(is.na(covid))

In [22]:
# fraction of missing values in each column
x <- colMeans(is.na(covid))
x

#### Removing missing values

In [23]:
# drop columns having more than 50% of missing data
# (the columns to keep are computed from the data instead of being hard-coded)
na_rate <- colMeans(is.na(covid))
covid %>% select(names(na_rate)[na_rate <= 0.5])

country,comfirmed,deaths
<fct>,<dbl>,<dbl>
US,1576,94
Russia,317,3
Brazil,310,20
UK,252,36
Spain,233,27
Italy,228,32
France,181,28
Germany,179,8
Turkey,153,4
Iran,129,7


In [24]:
# drop missing observation based on a specific column
# (na.omit() ignores extra arguments and would drop rows with an NA in ANY
# column; tidyr::drop_na() restricts the check to the columns listed)
covid %>% drop_na(recovered)

Unnamed: 0_level_0,country,comfirmed,deaths,recovered
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>
1,US,1576,94,298
3,Brazil,310,20,125
5,Spain,233,27,150
6,Italy,228,32,134


#### Filling missing data

In [25]:
# forward fill: each NA takes the last non-missing value above it
fill(covid, recovered, .direction='down')

country,comfirmed,deaths,recovered
<fct>,<dbl>,<dbl>,<dbl>
US,1576,94,298
Russia,317,3,298
Brazil,310,20,125
UK,252,36,125
Spain,233,27,150
Italy,228,32,134
France,181,28,134
Germany,179,8,134
Turkey,153,4,134
Iran,129,7,134


In [26]:
# back fill: each NA takes the next non-missing value below it
fill(covid, recovered, .direction='up')

country,comfirmed,deaths,recovered
<fct>,<dbl>,<dbl>,<dbl>
US,1576,94,298.0
Russia,317,3,125.0
Brazil,310,20,125.0
UK,252,36,150.0
Spain,233,27,150.0
Italy,228,32,134.0
France,181,28,
Germany,179,8,
Turkey,153,4,
Iran,129,7,


In [27]:
# fill with specific values: one replacement per column, given as a named list
replace_na(covid, replace=list(recovered=0))

country,comfirmed,deaths,recovered
<fct>,<dbl>,<dbl>,<dbl>
US,1576,94,298
Russia,317,3,0
Brazil,310,20,125
UK,252,36,0
Spain,233,27,150
Italy,228,32,134
France,181,28,0
Germany,179,8,0
Turkey,153,4,0
Iran,129,7,0


### 2.2. Duplicated values

In [28]:
library(dplyr)

In [29]:
# load the fishery trade records and preview the first rows
fish <- read.csv(file='data/us_fishery_foreign_trade.csv')
head(fish)

Unnamed: 0_level_0,year,month,product,country,value,feature,unit
Unnamed: 0_level_1,<int>,<int>,<fct>,<fct>,<int>,<fct>,<fct>
1,2010,1,SABLEFISH FRESH,UNITED ARAB EMIRATES,2297,EXP Quantity,kg
2,2010,1,SABLEFISH FRESH,JAPAN,16025,EXP Quantity,kg
3,2010,1,SABLEFISH FRESH,JAPAN,63437,EXP Quantity,kg
4,2010,1,MONKFISH FRESH,CANADA,579,EXP Quantity,kg
5,2010,1,MONKFISH FRESH,CANADA,7975,EXP Quantity,kg
6,2010,1,MONKFISH FRESH,NETHERLANDS,389,EXP Quantity,kg


The `duplicated()` function flags duplicated values: it returns a logical vector that marks every repeat of an observation already seen, so negating it keeps exactly one observation per group and removes the others. The `fromLast` parameter indicates which occurrence is kept: `fromLast=FALSE` (default) keeps the first one, `fromLast=TRUE` keeps the last one.

In [30]:
# keep the highest value of each group:
# sort so the largest value is the last row of its group, then drop every row
# that has a later duplicate of the same group key
fish <- arrange(fish, year, month, feature, product, country, value)
has_later_duplicate <- duplicated(
    select(fish, year, month, feature, product, country),
    fromLast=TRUE
)
fish <- fish[!has_later_duplicate,]

head(fish)

Unnamed: 0_level_0,year,month,product,country,value,feature,unit
Unnamed: 0_level_1,<int>,<int>,<fct>,<fct>,<int>,<fct>,<fct>
2,2010,1,MONKFISH FRESH,CANADA,7975,EXP Quantity,kg
5,2010,1,MONKFISH FRESH,FRANCE,4170,EXP Quantity,kg
8,2010,1,MONKFISH FRESH,ITALY,6026,EXP Quantity,kg
9,2010,1,MONKFISH FRESH,JAPAN,1125,EXP Quantity,kg
10,2010,1,MONKFISH FRESH,NETHERLANDS,389,EXP Quantity,kg
11,2010,1,MONKFISH FRESH,PORTUGAL,1295,EXP Quantity,kg


### 2.3. Outliers

In [31]:
library(dplyr)
library(readxl)

"package 'readxl' was built under R version 3.6.3"


#### Using z-score

In [32]:
# Replace z-score outliers in a numeric vector with NA.
#
# array: numeric vector (anything as.vector() can flatten to one).
# z:     number of standard deviations from the mean beyond which a value
#        is treated as an outlier (default 3).
# Returns a vector of the same length with outliers set to NA. Values that
# are already NA stay NA and are ignored when computing the mean and sd.
outliers_zscore <- function(array, z=3){
    values <- as.vector(array)
    # na.rm=TRUE: a single missing value would otherwise turn both statistics
    # into NA and no outlier would ever be flagged
    center <- mean(values, na.rm=TRUE)
    spread <- sd(values, na.rm=TRUE)
    lower <- center - z*spread
    upper <- center + z*spread
    # which() skips comparisons that evaluate to NA (missing inputs, or
    # undefined bounds when fewer than two non-missing values exist)
    values[which(values < lower | values > upper)] <- NA
    values
}

In [33]:
# load the wine quality sheet and preview the first rows
wine <- readxl::read_excel(path='data/wine_quality.xlsx')
head(wine)

fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6


In [34]:
# replace the z-score outliers (beyond 3 standard deviations) of every column
# with NA; `[[` extracts the column as a plain vector, so no purrr helper is
# needed
for (column in names(wine)){
    wine[[column]] <- outliers_zscore(wine[[column]], z=3)
}

In [35]:
# per column: how many values were replaced by NA, and what percentage that is
removed_count <- colSums(is.na(wine))
removed_rate <- colMeans(is.na(wine)) * 100

data.frame(removed_count=removed_count, removed_rate=removed_rate)

Unnamed: 0_level_0,removed_count,removed_rate
Unnamed: 0_level_1,<dbl>,<dbl>
fixed_acidity,46,0.93915884
volatile_acidity,81,1.65373622
citric_acid,85,1.7354022
residual_sugar,9,0.18374847
chlorides,102,2.08248265
free_sulfur_dioxide,32,0.65332789
total_sulfur_dioxide,12,0.24499796
density,3,0.06124949
ph,32,0.65332789
sulphates,48,0.97999183


#### Using interquartile range

In [36]:
# Replace interquartile-range outliers in a numeric vector with NA.
#
# array: numeric vector (anything as.vector() can flatten to one).
# k:     multiplier of the IQR that sets the fences (default 1.5).
# Returns a vector of the same length in which every value below
# Q1 - k*IQR or above Q3 + k*IQR is set to NA. Values that are already NA
# stay NA and are ignored when computing the quartiles.
outliers_iqr <- function(array, k=1.5){
    values <- as.vector(array)
    # na.rm=TRUE: quantile() stops with an error on missing values otherwise;
    # names=FALSE keeps the fences as plain numbers
    quartiles <- quantile(values, probs=c(0.25, 0.75), na.rm=TRUE, names=FALSE)
    spread <- quartiles[2] - quartiles[1]
    lower <- quartiles[1] - k*spread
    upper <- quartiles[2] + k*spread
    # which() skips comparisons that evaluate to NA
    values[which(values < lower | values > upper)] <- NA
    values
}

In [37]:
# reload the raw sheet so the IQR method starts from unmodified data
wine <- read_excel(path='data/wine_quality.xlsx')
head(wine)

fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6


In [38]:
# replace the IQR outliers of every column with NA; `[[` extracts the column
# as a plain vector, so no purrr helper is needed
for (column in names(wine)){
    wine[[column]] <- outliers_iqr(wine[[column]])
}

In [39]:
# per column: how many values were replaced by NA, and what percentage that is
removed_count <- colSums(is.na(wine))
removed_rate <- colMeans(is.na(wine)) * 100

data.frame(removed_count=removed_count, removed_rate=removed_rate)

Unnamed: 0_level_0,removed_count,removed_rate
Unnamed: 0_level_1,<dbl>,<dbl>
fixed_acidity,119,2.4295631
volatile_acidity,186,3.7974684
citric_acid,270,5.5124541
residual_sugar,7,0.1429155
chlorides,208,4.2466313
free_sulfur_dioxide,50,1.0208248
total_sulfur_dioxide,19,0.3879134
density,5,0.1020825
ph,75,1.5312372
sulphates,124,2.5316456


## 3. Text manipulation

### 3.1. Space and punctuation

In [40]:
library(stringr)

In [41]:
# the same country spelled with stray underscores, repeated spaces and a
# line break
year <- c(2017, 2018, 2019, 2020)
country <- c(
    'United Kingdom_',
    '___  United\nKingdom',
    '_United   Kingdom',
    'United Kingdom ____'
)
export <- c(5466, 8558, 8435, 8435)
import <- c(1546, 3546, 2007, 3574)

trade <- data.frame(
    year=year,
    country=country,
    export=export,
    import=import
)
trade

year,country,export,import
<dbl>,<fct>,<dbl>,<dbl>
2017,United Kingdom_,5466,1546
2018,___ United Kingdom,8558,3546
2019,_United Kingdom,8435,2007
2020,United Kingdom ____,8435,3574


In [42]:
# distinct spellings currently present in the column
unique(trade$country)

#### Removing punctuation

In [43]:
# delete every punctuation character (here: the underscores)
trade$country <- str_replace_all(trade$country, "[[:punct:]]", "")

#### Removing space, tab, new line

In [44]:
# str_squish() turns every run of whitespace (spaces, tabs, new lines) into a
# single space and trims both ends in one step
trade$country <- trade$country %>% str_squish()

# the column keeps one value per row; unique() is only used to inspect it
# (assigning the unique() result back would recycle or fail as soon as more
# than one distinct value exists)
trade$country %>% unique

### 3.2. Standardization

In [45]:
library(stringr)

In [46]:
# the same commodity and unit written in three inconsistent ways
date <- c('2020-01-01', '2020-01-02', '2020-01-03')
commodity <- c(
    'Shrimp, frozen, chem free',
    'Shrimp, frz, chemical-free',
    'Prawn, frz, chemical-free'
)
price <- c(10, 13, 14)
unit <- c('usd/kg', 'USD/KG', 'USD/kg')

shrimp <- data.frame(
    date=date,
    commodity=commodity,
    price=price,
    unit=unit
)
shrimp

date,commodity,price,unit
<fct>,<fct>,<dbl>,<fct>
2020-01-01,"Shrimp, frozen, chem free",10,usd/kg
2020-01-02,"Shrimp, frz, chemical-free",13,USD/KG
2020-01-03,"Prawn, frz, chemical-free",14,USD/kg


In [47]:
# working on the "commodity" column: one replacement per spelling variant
shrimp$commodity <- shrimp$commodity %>%
    str_replace('Prawn', 'Shrimp') %>%
    str_replace('frz', 'frozen') %>%
    str_replace('chem free', 'chemical-free')

# working on the "unit" column: currency in upper case, weight in lower case
shrimp$unit <- shrimp$unit %>%
    str_replace('usd', 'USD') %>%
    str_replace('KG', 'kg')

shrimp

date,commodity,price,unit
<fct>,<chr>,<dbl>,<chr>
2020-01-01,"Shrimp, frozen, chemical-free",10,USD/kg
2020-01-02,"Shrimp, frozen, chemical-free",13,USD/kg
2020-01-03,"Shrimp, frozen, chemical-free",14,USD/kg


### 3.3. Padding

In [48]:
library(stringr)

In [49]:
# customer ids are numeric and therefore have different display widths
customer_id <- c(3, 423, 5464)
phone <- c(363334444, 913334444, 123334444)
name <- c('Jack', 'James', 'Gabriel')
information <- c('England Male', 'Colombia Male', 'France Female')

info <- data.frame(
    customer_id=customer_id,
    phone=phone,
    name=name,
    information=information
)
info

customer_id,phone,name,information
<dbl>,<dbl>,<fct>,<fct>
3,363334444,Jack,England Male
423,913334444,James,Colombia Male
5464,123334444,Gabriel,France Female


In [50]:
# convert the ids to text and left-pad them with zeros to 4 characters
info$customer_id <- str_pad(
    as.character(info$customer_id),
    width=4,
    side='left',
    pad='0'
)

info

customer_id,phone,name,information
<chr>,<dbl>,<fct>,<fct>
3,363334444,Jack,England Male
423,913334444,James,Colombia Male
5464,123334444,Gabriel,France Female


### 3.4. Splitting a column

In [51]:
library(tidyr)

In [52]:
# `information` holds two facts (country and gender) in a single string
customer_id <- c(3, 423, 5464)
phone <- c(363334444, 913334444, 123334444)
name <- c('Jack', 'James', 'Gabriel')
information <- c('England Male', 'Colombia Male', 'France Female')

info <- data.frame(
    customer_id=customer_id,
    phone=phone,
    name=name,
    information=information
)
info

customer_id,phone,name,information
<dbl>,<dbl>,<fct>,<fct>
3,363334444,Jack,England Male
423,913334444,James,Colombia Male
5464,123334444,Gabriel,France Female


In [53]:
# split `information` at the space into two new columns
separate(info, col=information, into=c('country', 'gender'), sep=' ')

customer_id,phone,name,country,gender
<dbl>,<dbl>,<fct>,<chr>,<chr>
3,363334444,Jack,England,Male
423,913334444,James,Colombia,Male
5464,123334444,Gabriel,France,Female


### 3.5. Concatenating columns

In [54]:
library(tidyr)

In [55]:
# first and last names are stored in separate columns
first_name <- c('Wayne', 'Cristiano', 'Lionel')
last_name <- c('Rooney', 'Ronaldo', 'Messi')
position <- c('Second Striker', 'Left Winger', 'Right Winger')

football <- data.frame(
    first_name=first_name,
    last_name=last_name,
    position=position
)
football

first_name,last_name,position
<fct>,<fct>,<fct>
Wayne,Rooney,Second Striker
Cristiano,Ronaldo,Left Winger
Lionel,Messi,Right Winger


In [56]:
# paste first and last name into a single `name` column
unite(football, col=name, first_name, last_name, sep=' ')

name,position
<chr>,<fct>
Wayne Rooney,Second Striker
Cristiano Ronaldo,Left Winger
Lionel Messi,Right Winger


## 4. Date and time 

In [57]:
library(lubridate)

"package 'lubridate' was built under R version 3.6.3"

Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union




In [58]:
# current date
today()

In [59]:
# current date-time (the output below also shows the UTC offset)
now()

[1] "2020-10-08 11:11:18 +07"

### 4.1. Converting to datetime

In [60]:
library(lubridate)

In [61]:
# strings in year-month-day order
date <- c('2020.01.01', '2020.01.02', '2020.01.03')
ymd(date)

In [62]:
# strings in day-month-year order, written with mixed separators and a
# month name
date <- c('1/1/2005', '13-02/2009', '01-Jun-2007')
dmy(date)

### 4.2. Extracting date part

In [63]:
library(lubridate)

In [64]:
# first day of every month of 2020
date <- seq(
    from=as.Date('2020-01-01'),
    to=as.Date('2020-12-01'),
    by='month'
)
date

In [65]:
# calendar year
year(date)

In [66]:
# quarter of year
quarter(date)

In [67]:
# month of year
month(date)

In [68]:
# week in year
week(date)

In [69]:
# day of month
day(date)

In [70]:
# day of year
yday(date)

In [71]:
# day of quarter
qday(date)

In [72]:
# weekday as an abbreviated label, with the week starting on Monday
# (TRUE is spelled out: `T` is an ordinary variable that can be reassigned)
date %>% wday(label=TRUE, abbr=TRUE, week_start=1)

### 4.3. Timedelta

In [73]:
library(lubridate)

In [74]:
# two date sequences of equal length: one every 3 days, one every week
date1 <- seq(from=as.Date('2020-01-01'), to=as.Date('2020-01-31'), by=3)
date2 <- seq(from=as.Date('2020-01-02'), to=as.Date('2020-03-15'), by='week')

delta <- data.frame(date1=date1, date2=date2)
delta

date1,date2
<date>,<date>
2020-01-01,2020-01-02
2020-01-04,2020-01-09
2020-01-07,2020-01-16
2020-01-10,2020-01-23
2020-01-13,2020-01-30
2020-01-16,2020-02-06
2020-01-19,2020-02-13
2020-01-22,2020-02-20
2020-01-25,2020-02-27
2020-01-28,2020-03-05


In [75]:
# elapsed time between the two dates, expressed in days
delta$timedelta <- difftime(delta$date2, delta$date1, units='days')

In [76]:
# shift every date1 forward by a 15-day period
delta$date3 <- delta$date1 + days(15)
delta

date1,date2,timedelta,date3
<date>,<date>,<drtn>,<date>
2020-01-01,2020-01-02,1 days,2020-01-16
2020-01-04,2020-01-09,5 days,2020-01-19
2020-01-07,2020-01-16,9 days,2020-01-22
2020-01-10,2020-01-23,13 days,2020-01-25
2020-01-13,2020-01-30,17 days,2020-01-28
2020-01-16,2020-02-06,21 days,2020-01-31
2020-01-19,2020-02-13,25 days,2020-02-03
2020-01-22,2020-02-20,29 days,2020-02-06
2020-01-25,2020-02-27,33 days,2020-02-09
2020-01-28,2020-03-05,37 days,2020-02-12
