dacol provides utilities to add or modify columns in a data frame.
The utilities include:
- Statistical measures: mode, confident_interval, ceiling
- Normalize a vector column: cosine, logistic, zscore
- Compute distance between 2 vector columns: euclidean, pearson, cosine, canberra
- Manage outliers: trim_outlier, normalize_ptile
- Calculate percentile: decile_band, decile_ptile, rank_ptile
More info: https://ldanai.github.io/dacol/
You can install dacol from GitHub with:
# install.packages("remotes")
remotes::install_github("ldanai/dacol")
This shows how to use dacol:
library(dacol)
library(dplyr)
max = 30
df = tibble(x1 = seq(-1.2*max, 1.2*max, length.out = 200),
x2 = seq(0, max, length.out = 200),
x3 = sample(200))
df
#> # A tibble: 200 x 3
#> x1 x2 x3
#> <dbl> <dbl> <int>
#> 1 -36 0 111
#> 2 -35.6 0.151 31
#> 3 -35.3 0.302 92
#> 4 -34.9 0.452 6
#> 5 -34.6 0.603 20
#> 6 -34.2 0.754 55
#> 7 -33.8 0.905 190
#> 8 -33.5 1.06 135
#> 9 -33.1 1.21 10
#> 10 -32.7 1.36 173
#> # ... with 190 more rows
df =
df %>%
mutate(
# Transformation
y_cosine = dc_cosine(x1, max),
y_logistic = dc_logistic(x2, max),
y_zcore = dc_zscore(x2),
# Distance between 2 vector columns
y_dist_canb = dc_dist_canberra(x2, x3),
y_dist_cos = dc_dist_cosine(x2, y_zcore),
y_dist_euc = dc_dist_euclidean(x2, y_zcore),
y_dist_pear = dc_dist_pearson(x2, y_zcore),
# Manage outliers
y_trim = dc_trim_outlier(x3, 0.01),
y_norm = dc_normalize_ptile(x3, 0.01),
# Stats measures
y_mode = dc_mode(x3),
y_ceil = dc_ceiling(x1, -1),
# Band segmentation
y_dec_band1 = dc_decile_band(x3),
y_dec_band2 = dc_decile_band(x3, c(seq(0, 0.9, 0.1))),
y_dec_ptile1 = dc_decile_ptile(x3),
y_dec_ptile2 = dc_decile_ptile(x3, c(seq(0, 0.9, 0.1))),
# Rank percentile
y_ranked1 = dc_rank_ptile(x3),
y_ranked2 = dc_rank_ptile(x3, c(seq(1, 100, 1)))
)
#> Warning in if (is.na(n)) n = max(dplyr::n_distinct(x), 10000): the condition has
#> length > 1 and only the first element will be used
df
#> # A tibble: 200 x 20
#> x1 x2 x3 y_cosine y_logistic y_zcore y_dist_canb y_dist_cos
#> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -36 0 111 0 0 -1.72 0 0.498
#> 2 -35.6 0.151 31 0 0.0251 -1.70 0.00484 0.498
#> 3 -35.3 0.302 92 0 0.0502 -1.68 0.00327 0.498
#> 4 -34.9 0.452 6 0 0.0752 -1.67 0.0701 0.498
#> 5 -34.6 0.603 20 0 0.100 -1.65 0.0293 0.498
#> 6 -34.2 0.754 55 0 0.125 -1.63 0.0135 0.498
#> 7 -33.8 0.905 190 0 0.150 -1.62 0.00474 0.498
#> 8 -33.5 1.06 135 0 0.174 -1.60 0.00776 0.498
#> 9 -33.1 1.21 10 0 0.198 -1.58 0.108 0.498
#> 10 -32.7 1.36 173 0 0.222 -1.56 0.00778 0.498
#> # ... with 190 more rows, and 12 more variables: y_dist_euc <dbl>,
#> # y_dist_pear <dbl>, y_trim <dbl>, y_norm <dbl>, y_mode <int>, y_ceil <dbl>,
#> # y_dec_band1 <int>, y_dec_band2 <int>, y_dec_ptile1 <dbl>,
#> # y_dec_ptile2 <dbl>, y_ranked1 <dbl>, y_ranked2 <dbl>
Please note that the ‘dacol’ project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide by its terms.