Imputation module redacted: it needs review and should use actual data to impute instead of simply setting values as missing. Also continue working on the simulation module, which is a hobby project and completely separate from my current work.
jeffersonfparil · Nov 9, 2023 · 213d4f4 · 213d4f4
1 parent 4005297
commit 213d4f4
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 51 deletions.
diff --git a/README.md b/README.md
@@ -99,12 +99,12 @@ Convert the most widely used genotype data format, [variant call format (`*.vcf`
 
 Convert [synchronised pileup format](#Sync) into a matrix ($n$ pools x $p$ alleles across loci) and write into a comma-delimited (csv) file.
 
-### impute
+<!-- ### impute (redacted for now 2023-11-10)
 
 Impute allele frequencies set to missing according to another minimum depth parameter, i.e. `--min-depth-set-to-missing`. Two imputation algorithms are currently available (a third one is in the works):
 
 1. computationally efficient mean value imputation, and
-2. adaptive linkage disequilibrium-based k-nearest neighbour imputation (an extension of [LD-kNNi](https://doi.org/10.1534/g3.115.021667)).
+2. adaptive linkage disequilibrium-based k-nearest neighbour imputation (an extension of [LD-kNNi](https://doi.org/10.1534/g3.115.021667)). -->
 
 ### fisher_exact_test
 

diff --git a/src/main.rs b/src/main.rs
@@ -108,24 +108,24 @@ struct Args {
     /// Genomewide unbiased determination of modes of convergent evolution (gudmc), i.e. recombination rate in centiMorgan per megabase (default from cM/Mb estimate in maize from https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-9-r103#Sec7)
     #[clap(long, default_value_t = 0.73)]
     recombination_rate_cm_per_mb: f64,
-    /// Imputation parameter, i.e. minimum depth to set to missing data for imputation
-    #[clap(long, default_value_t = 5.00)]
-    min_depth_set_to_missing: f64,
-    /// Imputation parameter, i.e. fraction of the pools with missing data to be omitted after sorting by rate of missingness
-    #[clap(long, default_value_t = 0.10)]
-    frac_top_missing_pools: f64,
-    /// Imputation parameter, i.e. fraction of the loci with missing data to be omitted after sorting by rate of missingness
-    #[clap(long, default_value_t = 0.10)]
-    frac_top_missing_loci: f64,
-    /// Imputation parameter, i.e. imputation method, select from "mean" for simple imputation using mean allele frequencies across non-missing pools, or "aLD-kNNi" for adaptive linkage disequilibrium (estimated using correlations within a window) k-nearest neighbour weighted allele frequencies imputation
-    #[clap(long, default_value = "aLD-kNNi")]
-    imputation_method: String,
-    /// Imputation parameter, i.e. maximum number of top correlated loci within the window which are considered in linkage disequilibrium (LD) with the locus requiring imputation. The resulting loci will be used to calculate pairwise distances (adaptive if we have too much missing data in the window at which point we use all the loci within the window).
-    #[clap(long, default_value_t = 10)]
-    n_loci_to_estimate_distance: u64,
-    /// Imputation parameter, i.e. number of nearest neighbours from which the imputed weighted (weights based on distance from the pool requiring imputation) mean allele frequencies will be calculated from.
-    #[clap(long, default_value_t = 5)]
-    k_neighbours: u64,
+    // /// Imputation parameter, i.e. minimum depth to set to missing data for imputation
+    // #[clap(long, default_value_t = 5.00)]
+    // min_depth_set_to_missing: f64,
+    // /// Imputation parameter, i.e. fraction of the pools with missing data to be omitted after sorting by rate of missingness
+    // #[clap(long, default_value_t = 0.10)]
+    // frac_top_missing_pools: f64,
+    // /// Imputation parameter, i.e. fraction of the loci with missing data to be omitted after sorting by rate of missingness
+    // #[clap(long, default_value_t = 0.10)]
+    // frac_top_missing_loci: f64,
+    // /// Imputation parameter, i.e. imputation method, select from "mean" for simple imputation using mean allele frequencies across non-missing pools, or "aLD-kNNi" for adaptive linkage disequilibrium (estimated using correlations within a window) k-nearest neighbour weighted allele frequencies imputation
+    // #[clap(long, default_value = "aLD-kNNi")]
+    // imputation_method: String,
+    // /// Imputation parameter, i.e. maximum number of top correlated loci within the window which are considered in linkage disequilibrium (LD) with the locus requiring imputation. The resulting loci will be used to calculate pairwise distances (adaptive if we have too much missing data in the window at which point we use all the loci within the window).
+    // #[clap(long, default_value_t = 10)]
+    // n_loci_to_estimate_distance: u64,
+    // /// Imputation parameter, i.e. number of nearest neighbours from which the imputed weighted (weights based on distance from the pool requiring imputation) mean allele frequencies will be calculated from.
+    // #[clap(long, default_value_t = 5)]
+    // k_neighbours: u64,
 }
 
 /// # poolgen: quantitative and population genetics on pool sequencing (Pool-seq) data
@@ -319,36 +319,36 @@ fn main() {
                     &args.n_threads,
                 )
                 .unwrap();
-        } else if args.analysis == String::from("impute") {
-            let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
-            output = if &args.imputation_method == &"mean".to_owned() {
-                impute_mean(
-                    &file_sync_phen,
-                    &filter_stats,
-                    &args.min_depth_set_to_missing,
-                    &args.frac_top_missing_pools,
-                    &args.frac_top_missing_loci,
-                    &args.n_threads,
-                    &args.output,
-                )
-                .unwrap()
-            } else {
-                impute_aLDkNN(
-                    &file_sync_phen,
-                    &filter_stats,
-                    &args.min_depth_set_to_missing,
-                    &args.frac_top_missing_pools,
-                    &args.frac_top_missing_loci,
-                    &args.window_size_bp,
-                    &args.window_slide_size_bp,
-                    &args.min_loci_per_window,
-                    &args.n_loci_to_estimate_distance,
-                    &args.k_neighbours,
-                    &args.n_threads,
-                    &args.output,
-                )
-                .unwrap()
-            }
+        // } else if args.analysis == String::from("impute") {
+        //     let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
+        //     output = if &args.imputation_method == &"mean".to_owned() {
+        //         impute_mean(
+        //             &file_sync_phen,
+        //             &filter_stats,
+        //             &args.min_depth_set_to_missing,
+        //             &args.frac_top_missing_pools,
+        //             &args.frac_top_missing_loci,
+        //             &args.n_threads,
+        //             &args.output,
+        //         )
+        //         .unwrap()
+        //     } else {
+        //         impute_aLDkNN(
+        //             &file_sync_phen,
+        //             &filter_stats,
+        //             &args.min_depth_set_to_missing,
+        //             &args.frac_top_missing_pools,
+        //             &args.frac_top_missing_loci,
+        //             &args.window_size_bp,
+        //             &args.window_slide_size_bp,
+        //             &args.min_loci_per_window,
+        //             &args.n_loci_to_estimate_distance,
+        //             &args.k_neighbours,
+        //             &args.n_threads,
+        //             &args.output,
+        //         )
+        //         .unwrap()
+        //     }
         } else if args.analysis == String::from("genomic_prediction_cross_validation") {
             let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
             let genotypes_and_phenotypes = file_sync_phen

diff --git a/src/simulation/simulate_genotypes.rs b/src/simulation/simulate_genotypes.rs
@@ -66,6 +66,6 @@ mod tests {
         let max_bp = 2.2e9 as usize;
         let r2_50_perc_bp = 10e6 as usize;
         let q = simulate_genotypes(n, p, n_chr, max_bp, r2_50_perc_bp).unwrap();
-        assert_eq!(0, 1);
+        // assert_eq!(0, 1);
     }
 }