Skip to content

Commit

Permalink
feat: Updated src/Snps/SNPs.php
Browse files Browse the repository at this point in the history
  • Loading branch information
sweep-ai[bot] committed Mar 14, 2024
1 parent 0d9c8bc commit 08f5f25
Showing 1 changed file with 96 additions and 3 deletions.
99 changes: 96 additions & 3 deletions src/Snps/SNPs.php
Expand Up @@ -41,9 +41,9 @@ class SNPs implements Countable, Iterator
private array $_duplicate = [];
private array $_discrepant_XY = [];
private array $_heterozygous_MT = [];


/**
private DataFrame $dataFrame;
private SNPAnalysis $snpAnalysis;
private MathOperations $mathOperations;

/**
* SNPs constructor.
Expand Down Expand Up @@ -96,6 +96,9 @@ public function __construct(
// $this->_parallelizer = new Parallelizer($parallelize, $processes);
$this->_cluster = "";
$this->_chip = "";
$this->dataFrame = new DataFrame();
$this->snpAnalysis = new SNPAnalysis();
$this->mathOperations = new MathOperations();
$this->_chip_version = "";

$this->ensemblRestClient = $ensemblRestClient ?? new Ensembl("https://api.ncbi.nlm.nih.gov", 1);
Expand Down Expand Up @@ -2971,3 +2974,93 @@ public function matchWith(SNPs $otherKit): SNPs


// }
/**
* Computes cluster overlap based on given threshold.
*
* @param float $cluster_overlap_threshold The threshold for cluster overlap.
* @return array The computed cluster overlap DataFrame.
*/
public function computeClusterOverlap($cluster_overlap_threshold = 0.95): array {
// Sample data for cluster overlap computation
$data = [
"cluster_id" => ["c1", "c3", "c4", "c5", "v5"],
"company_composition" => [
"23andMe-v4",
"AncestryDNA-v1, FTDNA, MyHeritage",
"23andMe-v3",
"AncestryDNA-v2",
"23andMe-v5, LivingDNA",
],
"chip_base_deduced" => [
"HTS iSelect HD",
"OmniExpress",
"OmniExpress plus",
"OmniExpress plus",
"Illumina GSAs",
],
"snps_in_cluster" => array_fill(0, 5, 0),
"snps_in_common" => array_fill(0, 5, 0),
];

// Create a DataFrame from the data and set "cluster_id" as the index
$df = new DataFrame($data);
$df->setIndex("cluster_id");

$to_remap = null;
if ($this->build != 37) {
// Create a clone of the current object for remapping
$to_remap = clone $this;
$to_remap->remap(37); // clusters are relative to Build 37
$self_snps = $to_remap->snps()->select(["chrom", "pos"])->dropDuplicates();
} else {
$self_snps = $this->snps()->select(["chrom", "pos"])->dropDuplicates();
}

// Retrieve chip clusters from resources
$chip_clusters = $this->resources->get_chip_clusters();

// Iterate over each cluster in the DataFrame
foreach ($df->indexValues() as $cluster) {
// Filter chip clusters based on the current cluster
$cluster_snps = $chip_clusters->filter(function ($row) use ($cluster) {
return strpos($row["clusters"], $cluster) !== false;
})->select(["chrom", "pos"]);

// Update the DataFrame with the number of SNPs in the cluster and in common with the current object
$df->loc[$cluster]["snps_in_cluster"] = count($cluster_snps);
$df->loc[$cluster]["snps_in_common"] = count($self_snps->merge($cluster_snps, "inner"));

// Calculate overlap ratios for cluster and self
$df["overlap_with_cluster"] = $df["snps_in_common"] / $df["snps_in_cluster"];
$df["overlap_with_self"] = $df["snps_in_common"] / count($self_snps);

// Find the cluster with the maximum overlap
$max_overlap = array_keys($df["overlap_with_cluster"], max($df["overlap_with_cluster"]))[0];

// Check if the maximum overlap exceeds the threshold for both cluster and self
if (
$df["overlap_with_cluster"][$max_overlap] > $cluster_overlap_threshold &&
$df["overlap_with_self"][$max_overlap] > $cluster_overlap_threshold
) {
// Update the current object's cluster and chip based on the maximum overlap
$this->cluster = $max_overlap;
$this->chip = $df["chip_base_deduced"][$max_overlap];

$company_composition = $df["company_composition"][$max_overlap];

// Check if the current object's source is present in the company composition
if (strpos($company_composition, $this->source) !== false) {
if ($this->source === "23andMe" || $this->source === "AncestryDNA") {
// Extract the chip version from the company composition
$i = strpos($company_composition, "v");
$this->chip_version = substr($company_composition, $i, $i + 2);
}
} else {
// Log a warning about the SNPs data source not found
}
}
}

// Return the computed cluster overlap DataFrame
return $df;
}

0 comments on commit 08f5f25

Please sign in to comment.