Skip to content

Commit

Permalink
imputation module redacted needs review and should use actual data to…
Browse files Browse the repository at this point in the history
… impute instead of simply setting them as missing + just continue working on the simulation module which is a hobby project and completely separate from my current work
  • Loading branch information
jeffersonfparil committed Nov 9, 2023
1 parent 4005297 commit 213d4f4
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 51 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,12 @@ Convert the most widely used genotype data format, [variant call format (`*.vcf`

Convert [synchronised pileup format](#Sync) into a matrix ($n$ pools x $p$ alleles across loci) and write into a comma-delimited (csv) file.

### impute
<!-- ### impute (redacted for now 2023-11-10)
Impute allele frequencies set to missing according to another minimum depth parameter, i.e. `--min-depth-set-to-missing`. Two imputation algorithms are currently available (a third one is in the works):
1. computationally efficient mean value imputation, and
2. adaptive linkage disequilibrium-based k-nearest neighbour imputation (an extension of [LD-kNNi](https://doi.org/10.1534/g3.115.021667)).
2. adaptive linkage disequilibrium-based k-nearest neighbour imputation (an extension of [LD-kNNi](https://doi.org/10.1534/g3.115.021667)). -->

### fisher_exact_test

Expand Down
96 changes: 48 additions & 48 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,24 +108,24 @@ struct Args {
/// Genomewide unbiased determination of modes of convergent evolution (gudmc), i.e. recombination rate in centiMorgan per megabase (default from cM/Mb estimate in maize from https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-9-r103#Sec7)
#[clap(long, default_value_t = 0.73)]
recombination_rate_cm_per_mb: f64,
/// Imputation parameter, i.e. minimum depth to set to missing data for imputation
#[clap(long, default_value_t = 5.00)]
min_depth_set_to_missing: f64,
/// Imputation parameter, i.e. fraction of the pools with missing data to be omitted after sorting by rate of missingness
#[clap(long, default_value_t = 0.10)]
frac_top_missing_pools: f64,
/// Imputation parameter, i.e. fraction of the loci with missing data to be omitted after sorting by rate of missingness
#[clap(long, default_value_t = 0.10)]
frac_top_missing_loci: f64,
/// Imputation parameter, i.e. imputation method, select from "mean" for simple imputation using mean allele frequencies across non-missing pools, or "aLD-kNNi" for adaptive linkage disequilibrium (estimated using correlations within a window) k-nearest neighbour weighted allele frequencies imputation
#[clap(long, default_value = "aLD-kNNi")]
imputation_method: String,
/// Imputation parameter, i.e. maximum number of top correlated loci within the window which are considered in linkage disequilibrium (LD) with the locus requiring imputation. The resulting loci will be used to calculate pairwise distances (adaptive if we have too much missing data in the window at which point we use all the loci within the window).
#[clap(long, default_value_t = 10)]
n_loci_to_estimate_distance: u64,
/// Imputation parameter, i.e. number of nearest neighbours from which the imputed weighted (weights based on distance from the pool requiring imputation) mean allele frequencies will be calculated.
#[clap(long, default_value_t = 5)]
k_neighbours: u64,
// /// Imputation parameter, i.e. minimum depth to set to missing data for imputation
// #[clap(long, default_value_t = 5.00)]
// min_depth_set_to_missing: f64,
// /// Imputation parameter, i.e. fraction of the pools with missing data to be omitted after sorting by rate of missingness
// #[clap(long, default_value_t = 0.10)]
// frac_top_missing_pools: f64,
// /// Imputation parameter, i.e. fraction of the loci with missing data to be omitted after sorting by rate of missingness
// #[clap(long, default_value_t = 0.10)]
// frac_top_missing_loci: f64,
// /// Imputation parameter, i.e. imputation method, select from "mean" for simple imputation using mean allele frequencies across non-missing pools, or "aLD-kNNi" for adaptive linkage disequilibrium (estimated using correlations within a window) k-nearest neighbour weighted allele frequencies imputation
// #[clap(long, default_value = "aLD-kNNi")]
// imputation_method: String,
// /// Imputation parameter, i.e. maximum number of top correlated loci within the window which are considered in linkage disequilibrium (LD) with the locus requiring imputation. The resulting loci will be used to calculate pairwise distances (adaptive if we have too much missing data in the window at which point we use all the loci within the window).
// #[clap(long, default_value_t = 10)]
// n_loci_to_estimate_distance: u64,
// /// Imputation parameter, i.e. number of nearest neighbours from which the imputed weighted (weights based on distance from the pool requiring imputation) mean allele frequencies will be calculated.
// #[clap(long, default_value_t = 5)]
// k_neighbours: u64,
}

/// # poolgen: quantitative and population genetics on pool sequencing (Pool-seq) data
Expand Down Expand Up @@ -319,36 +319,36 @@ fn main() {
&args.n_threads,
)
.unwrap();
} else if args.analysis == String::from("impute") {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
output = if &args.imputation_method == &"mean".to_owned() {
impute_mean(
&file_sync_phen,
&filter_stats,
&args.min_depth_set_to_missing,
&args.frac_top_missing_pools,
&args.frac_top_missing_loci,
&args.n_threads,
&args.output,
)
.unwrap()
} else {
impute_aLDkNN(
&file_sync_phen,
&filter_stats,
&args.min_depth_set_to_missing,
&args.frac_top_missing_pools,
&args.frac_top_missing_loci,
&args.window_size_bp,
&args.window_slide_size_bp,
&args.min_loci_per_window,
&args.n_loci_to_estimate_distance,
&args.k_neighbours,
&args.n_threads,
&args.output,
)
.unwrap()
}
// } else if args.analysis == String::from("impute") {
// let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
// output = if &args.imputation_method == &"mean".to_owned() {
// impute_mean(
// &file_sync_phen,
// &filter_stats,
// &args.min_depth_set_to_missing,
// &args.frac_top_missing_pools,
// &args.frac_top_missing_loci,
// &args.n_threads,
// &args.output,
// )
// .unwrap()
// } else {
// impute_aLDkNN(
// &file_sync_phen,
// &filter_stats,
// &args.min_depth_set_to_missing,
// &args.frac_top_missing_pools,
// &args.frac_top_missing_loci,
// &args.window_size_bp,
// &args.window_slide_size_bp,
// &args.min_loci_per_window,
// &args.n_loci_to_estimate_distance,
// &args.k_neighbours,
// &args.n_threads,
// &args.output,
// )
// .unwrap()
// }
} else if args.analysis == String::from("genomic_prediction_cross_validation") {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let genotypes_and_phenotypes = file_sync_phen
Expand Down
2 changes: 1 addition & 1 deletion src/simulation/simulate_genotypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@ mod tests {
let max_bp = 2.2e9 as usize;
let r2_50_perc_bp = 10e6 as usize;
let q = simulate_genotypes(n, p, n_chr, max_bp, r2_50_perc_bp).unwrap();
assert_eq!(0, 1);
// assert_eq!(0, 1);
}
}

0 comments on commit 213d4f4

Please sign in to comment.