Skip to content

Commit

Permalink
Merge pull request #1569 from jqnatividad/apply_ops_gender_guesser
Browse files Browse the repository at this point in the history
`apply`: add `gender_guess` operation
  • Loading branch information
jqnatividad committed Jan 27, 2024
2 parents e32c710 + a3aa413 commit 6436e66
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 1 deletion.
6 changes: 6 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ flexi_logger = { version = "0.27", features = [
], default-features = false }
futures = "0.3"
futures-util = "0.3"
gender_guesser = { version = "0.2", optional = true }
geosuggest-core = { version = "0.6", optional = true }
geosuggest-utils = { version = "0.6", optional = true }
governor = { version = "0.6", optional = true }
Expand Down Expand Up @@ -235,6 +236,7 @@ serial_test = { version = "3.0", features = ["file_locks"] }
dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" }
grex = { git = "https://github.com/pemistahl/grex", rev = "8f6b35cee5f911311c2e0ef6e56f333e4c896112" }
halfbrown = { git = "https://github.com/licenser/halfbrown", rev = "7cecc29422ae2775abe35a2e430f1678b4f1aa76" }
gender_guesser = { git = "https://github.com/jqnatividad/gender_guesser", branch = "bundle_namdict_txt"}

[features]
default = ["mimalloc"]
Expand All @@ -256,6 +258,7 @@ apply = [
"cpc",
"data-encoding",
"eudex",
"gender_guesser",
"hashbrown",
"qsv_currency",
"strsim",
Expand Down
24 changes: 23 additions & 1 deletion src/cmd/apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Apply a series of transformation functions to given CSV column/s. This can be us
perform typical data-wrangling tasks and/or to harmonize some values, etc.
It has five subcommands:
1. operations* - 36 string, format, currency, regex & NLP operators.
1. operations* - 37 string, format, currency, regex & NLP operators.
2. emptyreplace* - replace empty cells with <--replacement> string.
3. datefmt* - Formats recognized date/s (19 formats recognized) to
a specified date format using <--formatstr>.
Expand Down Expand Up @@ -73,6 +73,7 @@ It has 36 supported operations:
with --comparand. Automatically rounds values to two decimal places. Specify
"euro" formatting (e.g. 1.000,00 instead of 1,000.00 ) by setting --formatstr to "euro".
Specify conversion rate by setting --replacement to a number.
* gender_guess: Guess the gender of a name.
* copy: Mark a column for copying
* simdl: Damerau-Levenshtein similarity to --comparand
* simdln: Normalized Damerau-Levenshtein similarity to --comparand (between 0.0 & 1.0)
Expand Down Expand Up @@ -358,6 +359,7 @@ use cpc::{eval, units::Unit};
use data_encoding::BASE64;
use dynfmt::Format;
use eudex::Hash;
use gender_guesser::Gender;
use indicatif::{ProgressBar, ProgressDrawTarget};
use log::debug;
use qsv_currency::Currency;
Expand Down Expand Up @@ -402,6 +404,7 @@ enum Operations {
Encode,
Escape,
Eudex,
Gender_Guess,
Len,
Lower,
Ltrim,
Expand Down Expand Up @@ -999,6 +1002,13 @@ fn validate_operations(
}
whatlang_invokes = whatlang_invokes.saturating_add(1);
},
// `gender_guess` is rejected with an incorrect-usage error unless a new column was
// requested (`flag_new_column` is set).
Operations::Gender_Guess => {
if flag_new_column.is_none() {
// NOTE(review): the message spells the option `--new_column`; verify this matches
// the spelling the CLI actually accepts before relying on it in user docs.
return fail_incorrectusage_clierror!(
"--new_column (-c) is required for Gender_Guess"
);
}
},
_ => {},
}
ops_vec.push(operation);
Expand Down Expand Up @@ -1083,6 +1093,18 @@ fn apply_operations(
Err(e) => format!("decoding error: {e:?}"),
};
},
// Overwrite the cell with a text label for the `Gender` the detector reports for the
// cell's current contents.
Operations::Gender_Guess => {
// NOTE(review): a new `Detector` is constructed every time this arm executes. If
// construction is expensive, consider building it once and reusing it — confirm
// the cost and whether `Detector` can be shared before changing this.
let gender_detector = gender_guesser::Detector::new();
// Each label is spelled exactly like the `Gender` variant it stands for.
*cell = match gender_detector.get_gender(cell) {
Gender::Male => "Male".to_string(),
Gender::Female => "Female".to_string(),
Gender::MayBeMale => "MayBeMale".to_string(),
Gender::MayBeFemale => "MayBeFemale".to_string(),
Gender::BothMaleFemale => "BothMaleFemale".to_string(),
Gender::NotSure => "NotSure".to_string(),
Gender::NotFound => "NotFound".to_string(),
};
},
Operations::Escape => {
*cell = cell.escape_default().to_string();
},
Expand Down
55 changes: 55 additions & 0 deletions tests/test_apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,61 @@ fn apply_ops_upper() {
assert_eq!(got, expected);
}

/// Runs `apply operations gender_guess` over a single `name` column and checks that the
/// guessed label for every name is written to the new `Gender` column.
#[test]
fn apply_ops_gender_guess() {
    // One entry per data row: (input name, label expected in the `Gender` column).
    const CASES: [(&str, &str); 16] = [
        ("Peter", "Male"),
        ("Michael", "Male"),
        ("Joel", "Male"),
        ("Hussein", "Male"),
        ("Ian", "Male"),
        ("Enrique", "Male"),
        ("Ana", "Female"),
        ("Olivia", "Female"),
        ("Mackenzie", "NotSure"),
        ("Adair", "MayBeMale"),
        ("Aaf", "MayBeFemale"),
        ("Voldemort", "NotFound"),
        ("Sami", "Male"),
        ("Minhaj", "NotFound"),
        ("Abdurrahman", "Male"),
        ("Abbe", "NotSure"),
    ];

    let wrk = Workdir::new("apply");

    // Input file: header row followed by one row per name.
    let mut input_rows = vec![svec!["name"]];
    input_rows.extend(CASES.iter().map(|&(name, _)| svec![name]));
    wrk.create("data.csv", input_rows);

    let mut cmd = wrk.command("apply");
    cmd.args(["operations", "gender_guess", "name"])
        .args(["--new-column", "Gender"])
        .arg("data.csv");

    // Expected output: the input rows with the guessed label appended as a second column.
    let mut expected = vec![svec!["name", "Gender"]];
    expected.extend(CASES.iter().map(|&(name, gender)| svec![name, gender]));

    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
    assert_eq!(got, expected);
}

#[test]
fn apply_ops_escape() {
let wrk = Workdir::new("apply");
Expand Down

0 comments on commit 6436e66

Please sign in to comment.