学习R语言{janitor}包。

In [4]:
library(tidyverse)
library(janitor)

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.4     [32mv[39m [34mreadr    [39m 2.1.5
[32mv[39m [34mforcats  [39m 1.0.0     [32mv[39m [34mstringr  [39m 1.5.1
[32mv[39m [34mggplot2  [39m 3.5.1     [32mv[39m [34mtibble   [39m 3.2.1
[32mv[39m [34mlubridate[39m 1.9.4     [32mv[39m [34mtidyr    [39m 1.3.1
[32mv[39m [34mpurrr    [39m 1.0.2     
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mi[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: 'janitor'


The following objects are masked from 'package:stats':

    chisq.test, fisher.test




# 数据清洗

## clean_names()

In [2]:
# Create a data.frame with dirty names
test_df <- as.data.frame(matrix(ncol = 6))
names(test_df) <- c("firstName", "ábc@!*", "% successful (2009)",
                    "REPEAT VALUE", "REPEAT VALUE", "")
test_df

firstName,<U+00E1>bc@!*,% successful (2009),REPEAT VALUE,REPEAT VALUE,Unnamed: 5_level_0
<lgl>,<lgl>,<lgl>,<lgl>,<lgl>.1,<lgl>
,,,,,


In [5]:
test_df %>% clean_names()

"unable to translate '<U+00C4>' to native encoding"
"unable to translate '<U+00D6>' to native encoding"
"unable to translate '<U+00DC>' to native encoding"
"unable to translate '<U+00E4>' to native encoding"
"unable to translate '<U+00F6>' to native encoding"
"unable to translate '<U+00FC>' to native encoding"
"unable to translate '<U+00DF>' to native encoding"
"unable to translate '<U+00C6>' to native encoding"
"unable to translate '<U+00E6>' to native encoding"
"unable to translate '<U+00D8>' to native encoding"
"unable to translate '<U+00F8>' to native encoding"
"unable to translate '<U+00C5>' to native encoding"
"unable to translate '<U+00E5>' to native encoding"


first_name,u_00e1_bc,percent_successful_2009,repeat_value,repeat_value_2,x
<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
,,,,,


In [6]:
make.names(names(test_df))
#> [1] "firstName"            "ábc..."               "X..successful..2009."
#> [4] "REPEAT.VALUE"         "REPEAT.VALUE"         "X"

## compare_df_cols

In [7]:
df1 <- data.frame(a = 1:2, b = c("big", "small"))
df2 <- data.frame(a = 10:12, b = c("medium", "small", "big"), c = 0, stringsAsFactors = TRUE) # here, column b is a factor
df3 <- df1 %>%
  dplyr::mutate(b = as.character(b))

In [8]:
compare_df_cols(df1, df2, df3)

column_name,df1,df2,df3
<chr>,<chr>,<chr>,<chr>
a,integer,integer,integer
b,character,factor,character
c,,numeric,


# 数据探索

In [9]:
mtcars %>%
  tabyl(gear, cyl) %>%
  adorn_totals("col") %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns() %>%
  adorn_title()
#>              cyl                                    
#>  gear          4          6           8        Total
#>     3  6.67% (1) 13.33% (2) 80.00% (12) 100.00% (15)
#>     4 66.67% (8) 33.33% (4)  0.00%  (0) 100.00% (12)
#>     5 40.00% (2) 20.00% (1) 40.00%  (2) 100.00%  (5)

Unnamed: 0_level_0,cyl,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
<chr>,<chr>,<chr>,<chr>,<chr>
gear,4,6,8,Total
3,6.67% (1),13.33% (2),80.00% (12),100.00% (15)
4,66.67% (8),33.33% (4),0.00% (0),100.00% (12)
5,40.00% (2),20.00% (1),40.00% (2),100.00% (5)


## get_dupes()

In [10]:
get_dupes(mtcars, wt, cyl) # or mtcars %>% get_dupes(wt, cyl) if you prefer to pipe
#>     wt cyl dupe_count  mpg  disp  hp drat  qsec vs am gear carb
#> 1 3.44   6          2 19.2 167.6 123 3.92 18.30  1  0    4    4
#> 2 3.44   6          2 17.8 167.6 123 3.92 18.90  1  0    4    4
#> 3 3.57   8          2 14.3 360.0 245 3.21 15.84  0  0    3    4
#> 4 3.57   8          2 15.0 301.0 335 3.54 14.60  0  1    5    8

wt,cyl,dupe_count,mpg,disp,hp,drat,qsec,vs,am,gear,carb
<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
3.44,6,2,19.2,167.6,123,3.92,18.3,1,0,4,4
3.44,6,2,17.8,167.6,123,3.92,18.9,1,0,4,4
3.57,8,2,14.3,360.0,245,3.21,15.84,0,0,3,4
3.57,8,2,15.0,301.0,335,3.54,14.6,0,1,5,8


## get_one_to_one()

In [11]:
library(dplyr)
starwars[1:4,] %>%
  get_one_to_one()
#> [[1]]
#> [1] "name"       "height"     "mass"       "skin_color" "birth_year"
#> [6] "films"     
#> 
#> [[2]]
#> [1] "hair_color" "starships" 
#> 
#> [[3]]
#> [1] "sex"     "species"

In [17]:
make_clean_names(c("First Name", "Age (Years)", "Income%", "Country#", "naïve"))