Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
238 lines (213 sloc) 7.68 KB
/**
* Copyright (c) 2010, Benjamin C. Meyer
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the Benjamin Meyer nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/**
* Original puzzle description from http://www.itasoftware.com/careers/
*
* BitVector Genealogy
*
* The BitVectors are an ancient and immortal race of 10,000, each with a
* 10,000 bit genome. The race evolved from a single individual by the
* following process: 9,999 times a BitVector chosen at random from amongst
* the population was cloned using an error-prone process that replicates
* each bit of the genome with 80% fidelity (any particular bit is flipped
* from parent to child 20% of the time, independent of other bits).
*
* Write a program to guess the reproductive history of BitVectors from their
* genetic material. The randomly-ordered file bitvectors-genes.data
* contains a 10,000 bit line for each individual. Your program's output
* should be, for each input line, the 0-based line number of that
* individual's parent, or -1 if it is the progenitor. Balance performance
* against probability of mistakes as you see fit.
*
* To help you test your program, here is a much smaller 500 x 500 input
* dataset: bitvectors-genes.data.small, along with its
* solution file: bitvectors-parents.data.small
*/
#include <QtCore/QtCore>
#define MUTATION_RATE 0.2
class BitVector;
QList<BitVector> genomes;
static int maximumDifference;
class BitVector {
public:
BitVector(const QString &line);
int parent;
// Use QVarLengthArray for speed over a bit array
QVarLengthArray<short> genome;
QVarLengthArray<int> hammingDistance;
bool findParents();
void findUnlikelyParents(int offset);
int computeHammingDistance(const BitVector &other) const;
};
BitVector::BitVector(const QString &line)
: parent(-1)
{
genome.resize(line.size());
hammingDistance.resize(line.size());
for (int i = 0; i < line.size(); ++i)
genome[i] = line[i] == '1' ? 1 : 0;
}
void computeDistanceMatrix() {
const int size = genomes.count();
for (int i = 0; i < size; ++i) {
for (int j = i; j < size; ++j) {
const int d = genomes[i].computeHammingDistance(genomes[j]);
genomes[i].hammingDistance[j] = d;
genomes[j].hammingDistance[i] = d;
}
}
}
int BitVector::computeHammingDistance(const BitVector &other) const
{
const int l = genome.count();
int d = 0;
for (int i = 0; i < l; ++i)
if (genome[i] != other.genome[i])
++d;
return d;
}
// If we only have one connection to another node mark it as our parent
// findParents can ignores nodes that have a parent set because
// they must be children from previous findParents runs.
bool BitVector::findParents()
{
if (parent != -1)
return false;
int bestParent = -1;
int c = 0;
for (int i = 0; i < hammingDistance.count(); ++i) {
if (hammingDistance[i] == 0 || genomes[i].parent != -1)
continue;
if (hammingDistance[i] > maximumDifference)
continue;
bestParent = i;
++c;
}
if (c == 1) {
parent = bestParent;
}
return c == 1;
}
// Return a list of all nodes that are descendent from offset
QList<int> findAllChildren(int offset)
{
QList<int> allChildren;
QList<int> todo;
todo.append(offset);
while (!todo.empty()) {
int f = todo.takeFirst();
for (int i = 0; i < genomes.count(); ++i) {
if (genomes[i].parent == f) {
allChildren.append(i);
todo.append(i);
}
}
}
return allChildren;
}
// Because we cannot assume genomes[i].parent we
// must make sure to not connect a node to its own descendent
// by using findAllChildren.
void BitVector::findUnlikelyParents(int offset)
{
QList<int> children = findAllChildren(offset);
for (int i = 0; i < hammingDistance.count(); ++i) {
if (hammingDistance[i] == 0)
continue;
if (hammingDistance[i] > maximumDifference)
continue;
if (children.contains(i))
continue;
parent = i;
return;
}
}
inline double binomialstd(double n, double p)
{
return sqrt(n * p * (1 - p));
}
int main(int argc, char **argv)
{
// Read in the genomes
QString fileName = "data/bitvectors-genes.data.small";
if (argc == 2)
fileName = argv[1];
QFile file(fileName);
if (!file.open(QFile::ReadOnly)) {
qDebug() << file.errorString();
return 1;
}
QTextStream stream(&file);
while (!stream.atEnd()) {
const QString line = stream.readLine();
if (!line.isEmpty())
genomes.append(BitVector(line));
}
// Determine the maximum distance from the parent we will
// tolerate at this point in time.
int size = genomes[0].genome.size();
double mutationRate = MUTATION_RATE;
double perfectDifference = size * mutationRate;
double std = binomialstd(size, mutationRate);
maximumDifference = perfectDifference + (3 * std);
computeDistanceMatrix();
// If a node only has one connection to another genome set it as the parent.
// Repeat by remove nodes that are already child nodes finding new nodes
// that only have one parent
int c = 0;
do {
c = 0;
for (int i = 0; i < genomes.count(); ++i) {
if (genomes[i].findParents())
++c;
}
} while (c > 0);
// For the final nodes increase the maximum distance to catch
// the best match for outliers.
maximumDifference = perfectDifference + (5 * std);
for (int i = 0; i < genomes.count(); ++i)
if (genomes[i].parent == -1)
genomes[i].findUnlikelyParents(i);
// Final lost gnomes
// Try swapping child/parents of any two node graphs to find better combinations.
for (int i = 0; i < genomes.count(); ++i)
if (genomes[i].parent == -1) {
QList<int> children = findAllChildren(i);
if (children.count() == 1) {
int other = children[0];
genomes[i].parent = genomes[other].parent;
genomes[other].parent = -1;
genomes[other].findUnlikelyParents(other);
}
}
// Output results
QTextStream out(stdout);
for (int i = 0; i < genomes.count(); ++i)
out << genomes[i].parent << endl;
return 0;
}