
Commit

ready to test imputation modules
jeffersonfparil committed Nov 1, 2023
1 parent 57a9acc commit 40890d5
Showing 12 changed files with 106 additions and 47 deletions.
16 changes: 8 additions & 8 deletions src/base/helpers.rs
@@ -358,7 +358,7 @@ pub fn load_table(
data_end_col: &usize,
) -> io::Result<(Vec<String>, Vec<String>, Vec<Vec<f64>>)> {
let file = File::open(fname).unwrap();
let mut reader = BufReader::new(file);
let reader = BufReader::new(file);
let mut lines = reader.lines();
let column_labels = match lines.next() {
Some(x) => x
@@ -544,13 +544,13 @@ mod tests {
Array1::from_shape_vec(2, vec![2.5, 7.0]).unwrap(),
mean_axis_ignore_nan(&array2d, 1).unwrap()
);
let fname = "./tests/test.csv".to_owned();
let delimiter = ",".to_owned();
let chr_col = 0;
let pos_start_col = 1;
let pos_end_col = 1;
let data_start_col = 2;
let data_end_col = 6;
let _fname = "./tests/test.csv".to_owned();
let _delimiter = ",".to_owned();
let _chr_col = 0;
let _pos_start_col = 1;
let _pos_end_col = 1;
let _data_start_col = 2;
let _data_end_col = 6;
let (row_labels, column_labels, data) = load_table(
&"./tests/test.csv".to_owned(),
&",".to_owned(),
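For context on the test above: mean_axis_ignore_nan averages along one axis while skipping NaN entries (the crate's version wraps its result, hence the .unwrap() in the test). A minimal standalone sketch of the same idea, with a hypothetical name and an unwrapped return type:

use ndarray::{Array1, Array2, Axis};

// Average along `axis`, skipping NaNs; an all-NaN lane yields NaN.
fn mean_axis_ignore_nan_sketch(a: &Array2<f64>, axis: usize) -> Array1<f64> {
    a.lanes(Axis(axis))
        .into_iter()
        .map(|lane| {
            let (sum, n) = lane
                .iter()
                .copied()
                .filter(|x| !x.is_nan())
                .fold((0.0_f64, 0usize), |(s, n), x| (s + x, n + 1));
            if n == 0 { f64::NAN } else { sum / n as f64 }
        })
        .collect()
}
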
4 changes: 2 additions & 2 deletions src/base/sync.rs
@@ -1268,7 +1268,7 @@ impl SaveCsv for GenotypesAndPhenotypes {
// Note: Of the input parameters, only `out` is used here; filter_stats, keep_p_minus_1, and n_threads are accepted only so the signature matches the other implementations of this trait
// Sanity checks
let (n, p) = self.intercept_and_allele_frequencies.dim();
let (n_, l) = self.coverages.dim();
let (n_, _l) = self.coverages.dim();
let (n__, _k) = self.phenotypes.dim();
let n___ = self.pool_names.len();
let p_ = self.chromosome.len();
@@ -1438,7 +1438,7 @@ mod tests {
.unwrap();
let mut frequencies_matrix: Array2<f64> =
Array2::zeros((counts_matrix.nrows(), counts_matrix.ncols()));
let row_sums: Vec<f64> = counts_matrix
let _row_sums: Vec<f64> = counts_matrix
.sum_axis(Axis(1))
.into_iter()
.map(|x| x as f64)
10 changes: 5 additions & 5 deletions src/gp/penalise.rs
@@ -671,17 +671,17 @@ fn penalised_lambda_path_with_k_fold_cross_validation(
#[cfg(test)]
mod tests {
use super::*;
use ndarray::concatenate;

use rand::distributions::{Bernoulli, Distribution};

#[test]
fn test_penalised() {
let n: usize = 100 as usize;
let p: usize = 10_000 as usize + 1;
let intercept: Array2<f64> = Array2::ones((50, 1));
let _intercept: Array2<f64> = Array2::ones((50, 1));
let d = Bernoulli::new(0.5).unwrap();
let frequencies = d.sample(&mut rand::thread_rng()) as u64;
let mut x = Array2::from_shape_fn((n, p), |(i, j)| {
let _frequencies = d.sample(&mut rand::thread_rng()) as u64;
let mut x = Array2::from_shape_fn((n, p), |(_i, _j)| {
d.sample(&mut rand::thread_rng()) as u64 as f64
});
for i in 0..n {
@@ -703,7 +703,7 @@ mod tests {
.unwrap();
let row_idx: Vec<usize> = (0..50).collect();

let (b_hat, name) = penalise_lasso_like(&x, &y, &row_idx).unwrap();
let (_b_hat, _name) = penalise_lasso_like(&x, &y, &row_idx).unwrap();
// assert_eq!(0.009667742247346768, b_hat[(0,0)]);

let b: Array2<f64> =
2 changes: 1 addition & 1 deletion src/gwas/correlation_test.rs
@@ -18,7 +18,7 @@ pub fn pearsons_correlation(
}
// Only use the dangerous "pairwise.complete.obs" correlation computation method when asked.
// Use only the pairs of values with non-missing data across the pair of vectors.
let (x, y) = if (method != &"pairwise.complete.obs".to_owned()) {
let (x, y) = if method != &"pairwise.complete.obs".to_owned() {
let filtered_vectors: (Vec<f64>, Vec<f64>) = x
.iter()
.zip(y.iter())
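The "pairwise.complete.obs" comment above means: keep only the index pairs where both vectors have non-missing values, then compute Pearson's correlation on those pairs. A self-contained sketch of that behaviour (hypothetical helper, not the crate's pearsons_correlation signature):

// Drop (x_i, y_i) pairs where either value is NaN, then compute Pearson's r.
fn pearson_pairwise_complete(x: &[f64], y: &[f64]) -> Option<f64> {
    let pairs: Vec<(f64, f64)> = x
        .iter()
        .zip(y.iter())
        .filter(|(a, b)| !a.is_nan() && !b.is_nan())
        .map(|(a, b)| (*a, *b))
        .collect();
    let n = pairs.len() as f64;
    if n < 2.0 {
        return None; // not enough complete pairs
    }
    let (sx, sy) = pairs.iter().fold((0.0, 0.0), |(sx, sy), (a, b)| (sx + a, sy + b));
    let (mx, my) = (sx / n, sy / n);
    let (mut sxy, mut sxx, mut syy) = (0.0, 0.0, 0.0);
    for (a, b) in &pairs {
        sxy += (a - mx) * (b - my);
        sxx += (a - mx).powi(2);
        syy += (b - my).powi(2);
    }
    if sxx == 0.0 || syy == 0.0 {
        return None; // zero variance: correlation undefined
    }
    Some(sxy / (sxx.sqrt() * syy.sqrt()))
}
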
2 changes: 1 addition & 1 deletion src/gwas/mle.rs
@@ -463,7 +463,7 @@ pub fn mle_with_covariate(
////////////////////////////////////////////////////////////////////////////////////////////////////////////
#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_mle() {}
}
4 changes: 2 additions & 2 deletions src/gwas/ols.rs
@@ -441,7 +441,7 @@ mod tests {
distributions::{Bernoulli, Distribution},
prelude::*,
};
use std::slice::Iter;

#[test]
fn test_ols() {
// Simulate
@@ -451,7 +451,7 @@
let p = 50 as usize + 1; // number of predictors including the intercept
let q = 10 as usize; // number of predictors with non-zero effects
let d = Bernoulli::new(0.5).unwrap();
let mut x = Array2::from_shape_fn((n, p), |(i, j)| {
let mut x = Array2::from_shape_fn((n, p), |(_i, _j)| {
d.sample(&mut rand::thread_rng()) as u64 as f64
});
for i in 0..n {
14 changes: 6 additions & 8 deletions src/imputation/adaptive_ld_knn_imputation.rs
@@ -65,7 +65,7 @@ fn find_k_nearest_neighbours(
window_freqs: ArrayView2<f64>,
dist: ArrayView2<f64>,
) -> io::Result<(Vec<f64>, Vec<f64>, Array1<f64>)> {
let (n, p) = window_freqs.dim();
let (n, _p) = window_freqs.dim();
// Find the k-nearest neighbours
let mut idx_pools: Vec<usize> = (0..n).collect();
idx_pools.sort_by(|&j0, &j1| {
@@ -175,7 +175,6 @@ impl GenotypesAndPhenotypes {
)
.unwrap();
let w = idx_window_head.len();
let w_ = idx_window_tail.len();
// Parallel processing per window
let mut vec_windows_freqs: Vec<Array2<f64>> = vec![Array2::from_elem((1, 1), f64::NAN); w];
let idx_windows: Vec<usize> = (0..w).collect();
@@ -297,9 +296,9 @@ impl GenotypesAndPhenotypes {
} // Impute if we have missing data
} // Iterate across alleles across loci within the window
}); // Parallel processing across windows
// }
// println!("vec_windows_freqs[0]:\n{:?}", vec_windows_freqs[0]);
// Write-out the imputed data
// }
// println!("vec_windows_freqs[0]:\n{:?}", vec_windows_freqs[0]);
// Write-out the imputed data
for a in 0..w {
// Use the indexes of each locus
let idx_ini = loci_idx[idx_window_head[a]];
@@ -399,12 +398,11 @@ pub fn impute_aLDkNN(
genotypes_and_phenotypes.coverages.ncols(),
duration.as_secs()
);
// Removing missing data, i.e. loci that cannot be imputed
// First remove missing data
// Remove 100% of the loci with missing data
let start = std::time::SystemTime::now();
genotypes_and_phenotypes
.filter_out_top_missing_loci(&1.00)
.unwrap(); // Remove 100% of the loci with missing data
.unwrap();
let end = std::time::SystemTime::now();
let duration = end.duration_since(start).unwrap();
println!(
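A rough illustration of the k-nearest-neighbour step used by the adaptive LD-kNN imputation above: rank the other pools by genetic distance, take the k closest that have data at the allele column in question, and fill the missing frequency from their frequencies. The helper below is a hypothetical, simplified sketch; in particular, the inverse-distance weighting is an assumption, not necessarily the crate's exact scheme.

// Impute one missing frequency for pool `i0` at one allele column, given the
// column's frequencies across pools and pool i0's row of pairwise distances.
fn knn_impute_one(freqs_col: &[f64], dist_from_i0: &[f64], i0: usize, k: usize) -> Option<f64> {
    // Candidate neighbours: other pools with non-missing data at this column
    let mut idx: Vec<usize> = (0..freqs_col.len())
        .filter(|&j| j != i0 && !freqs_col[j].is_nan())
        .collect();
    // Sort by distance to pool i0, ascending (assumes distances are not NaN)
    idx.sort_by(|&a, &b| dist_from_i0[a].partial_cmp(&dist_from_i0[b]).unwrap());
    let neighbours = &idx[..idx.len().min(k)];
    if neighbours.is_empty() {
        return None;
    }
    // Inverse-distance weighted mean; epsilon avoids division by zero
    let (mut w_sum, mut wf_sum) = (0.0, 0.0);
    for &j in neighbours {
        let w = 1.0 / (dist_from_i0[j] + 1e-9);
        w_sum += w;
        wf_sum += w * freqs_col[j];
    }
    Some(wf_sum / w_sum)
}
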
2 changes: 1 addition & 1 deletion src/imputation/filtering_missing.rs
@@ -9,7 +9,7 @@ impl GenotypesAndPhenotypes {
min_depth_set_to_missing: &f64,
) -> io::Result<&mut Self> {
self.check().unwrap();
let (n, p) = self.intercept_and_allele_frequencies.dim();
let (n, _p) = self.intercept_and_allele_frequencies.dim();
let (_n, l) = self.coverages.dim();
let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().unwrap();
for i in 0..n {
89 changes: 74 additions & 15 deletions src/imputation/mean_imputation.rs
@@ -4,13 +4,11 @@ use std::io;

impl GenotypesAndPhenotypes {
pub fn mean_imputation(&mut self) -> io::Result<&mut Self> {
self.check().unwrap();
// We are assuming that all non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on.
let (n, p) = self.intercept_and_allele_frequencies.dim();
let (n_, l) = self.coverages.dim();
let (loci_idx, loci_chr, loci_pos) = self.count_loci().unwrap();
let l_ = loci_idx.len();
assert_eq!(n, n_);
assert_eq!(l, l_ - 1); // less trailing locus
let (loci_idx, _loci_chr, _loci_pos) = self.count_loci().unwrap();
let l = loci_idx.len() - 1;
for j in 0..l {
// Use the indexes of each locus
let idx_ini = loci_idx[j] as usize;
@@ -41,15 +39,6 @@ impl GenotypesAndPhenotypes {
};
}
}
// println!("loci_idx={:?}", loci_idx);
// println!("n={}; p={}; n_={}; l={}; l_={}", n, p, n_, l, l_);
// println!("loci_idx.len()={:?}", loci_idx.len());
// println!("loci_idx[l-2]={:?}", loci_idx[l-2]);
// println!("loci_idx[l-1]={:?}", loci_idx[l-1]);
// println!("idx_ini={:?}", idx_ini);
// println!("idx_fin={:?}", idx_fin);
// println!("freqs={:?}", freqs);
// println!("self.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin])={:?}", self.intercept_and_allele_frequencies.slice(s![.., idx_ini..idx_fin]));
}
Ok(self)
}
@@ -60,21 +49,89 @@ pub fn impute_mean(
file_sync_phen: &FileSyncPhen,
filter_stats: &FilterStats,
min_depth_set_to_missing: &f64,
frac_top_missing_pools: &f64,
frac_top_missing_loci: &f64,
n_threads: &usize,
out: &String,
) -> io::Result<String> {
// All non-zero alleles across pools are kept, i.e. biallelic loci have 2 columns, triallelic have 3, and so on.
let keep_p_minus_1 = false;
let start = std::time::SystemTime::now();
let mut genotypes_and_phenotypes = file_sync_phen
.into_genotypes_and_phenotypes(filter_stats, keep_p_minus_1, n_threads)
.unwrap();
let end = std::time::SystemTime::now();
let duration = end.duration_since(start).unwrap();
println!(
"Parsed the sync file into allele frequncies: {} pools x {} loci | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
genotypes_and_phenotypes
.set_missing_by_depth(min_depth_set_to_missing)
.unwrap();
let end = std::time::SystemTime::now();
let duration = end.duration_since(start).unwrap();
println!(
"Set missing loci below the minimum depth: {} pools x {} loci | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
genotypes_and_phenotypes
.filter_out_top_missing_pools(frac_top_missing_pools)
.unwrap();
let end = std::time::SystemTime::now();
let duration = end.duration_since(start).unwrap();
println!(
"Filtered out sparsest pools: {} pools x {} loci | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
genotypes_and_phenotypes
.filter_out_top_missing_loci(frac_top_missing_loci)
.unwrap();
let end = std::time::SystemTime::now();
let duration = end.duration_since(start).unwrap();
println!(
"Filtered out sparsest loci: {} pools x {} loci | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
duration.as_secs()
);
let start = std::time::SystemTime::now();
genotypes_and_phenotypes.mean_imputation().unwrap();
let end = std::time::SystemTime::now();
let duration = end.duration_since(start).unwrap();
println!(
"Mean value imputation: {} pools x {} loci | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
duration.as_secs()
);
// Remove 100% of the loci with missing data
let start = std::time::SystemTime::now();
genotypes_and_phenotypes
.filter_out_top_missing_loci(&1.00)
.unwrap();
let end = std::time::SystemTime::now();
let duration = end.duration_since(start).unwrap();
println!(
"Missing data removed, i.e. loci which cannot be imputed because of extreme sparsity: {} pools x {} loci | Duration: {} seconds",
genotypes_and_phenotypes.coverages.nrows(),
genotypes_and_phenotypes.coverages.ncols(),
duration.as_secs()
);
// Output
let out = genotypes_and_phenotypes
.write_csv(filter_stats, keep_p_minus_1, out, n_threads)
.unwrap();

Ok(out)
}

@@ -131,6 +188,8 @@ mod tests {
&file_sync_phen,
&filter_stats,
&min_depth_set_to_missing,
&0.2,
&0.5,
&n_threads,
&"test-impute_mean.csv".to_owned(),
)
Expand Down Expand Up @@ -161,6 +220,6 @@ mod tests {
.mean()
.unwrap()
);
// assert_eq!(0, 1);
assert_eq!(0, 1);
}
}
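For reference, the mean_imputation step above amounts to replacing each missing allele frequency with the mean of the non-missing values in the same column (per allele, across pools), after which impute_mean drops any loci that remain entirely missing via filter_out_top_missing_loci(&1.00). A simplified ndarray sketch with a hypothetical function name, omitting the per-locus bookkeeping done through loci_idx:

use ndarray::Array2;

// Missing values are encoded as f64::NAN; columns with no data are left as-is.
fn mean_impute_in_place(freqs: &mut Array2<f64>) {
    let (n, p) = freqs.dim();
    for j in 0..p {
        // Column mean over the pools that have data
        let (mut sum, mut count) = (0.0_f64, 0usize);
        for i in 0..n {
            let x = freqs[(i, j)];
            if !x.is_nan() {
                sum += x;
                count += 1;
            }
        }
        if count == 0 {
            continue; // fully missing column: handled later by the loci filter
        }
        let mean = sum / count as f64;
        for i in 0..n {
            if freqs[(i, j)].is_nan() {
                freqs[(i, j)] = mean;
            }
        }
    }
}
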
2 changes: 2 additions & 0 deletions src/main.rs
@@ -311,6 +311,8 @@ fn main() {
&file_sync_phen,
&filter_stats,
&args.min_depth_set_to_missing,
&args.frac_top_missing_pools,
&args.frac_top_missing_loci,
&args.n_threads,
&args.output,
)
6 changes: 3 additions & 3 deletions src/popgen/gudmc.rs
@@ -2,9 +2,9 @@ use crate::base::*;
use crate::popgen::*;
use argmin::core::{self, CostFunction, Executor};
use argmin::solver::neldermead::NelderMead;
use ndarray::{prelude::*, Zip};
use ndarray::prelude::*;
use statrs::distribution::{Continuous, ContinuousCDF, Normal};
use statrs::statistics::Statistics;

use std::fs;
use std::io::{self, prelude::*};
use std::time::{SystemTime, UNIX_EPOCH};
@@ -408,7 +408,7 @@ pub fn gudmc(
.open(&fname_output)
.expect(&error_writing_file);
// Header
let mut line: Vec<String> = vec![
let line: Vec<String> = vec![
"pop_a",
"pop_b",
"chr",
2 changes: 1 addition & 1 deletion src/popgen/pi.rs
@@ -2,7 +2,7 @@ use crate::base::*;
use ndarray::{prelude::*, Zip};
use std::fs::OpenOptions;
use std::io::{self, prelude::*};
use std::io::{Error, ErrorKind};

use std::time::{SystemTime, UNIX_EPOCH};

/// Unbiased multi-allelic nucleotide diversity per population ($\pi$ or $\theta_{\pi}=4N_{e}\mu$), which is similar to [Korunes & Samuk 2019](https://doi.org/10.1111/1755-0998.13326) which assumes biallelic loci
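The doc comment above refers to unbiased multi-allelic nucleotide diversity. For a single site with allele counts c_a and total coverage C, one standard unbiased estimator is pi = (C / (C - 1)) * (1 - sum_a (c_a / C)^2). The per-site helper below only illustrates that formula; it is not the crate's windowed, frequency-based implementation.

// Unbiased per-site nucleotide diversity from allele counts at one site.
fn pi_per_site(counts: &[u64]) -> Option<f64> {
    let c: u64 = counts.iter().sum();
    if c < 2 {
        return None; // need at least two sampled copies
    }
    let c_f = c as f64;
    let sum_p2: f64 = counts
        .iter()
        .map(|&x| {
            let p = x as f64 / c_f;
            p * p
        })
        .sum();
    Some(c_f / (c_f - 1.0) * (1.0 - sum_p2))
}
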
